In [None]:
import re
import time

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.lm.preprocessing import pad_both_ends
from nltk.tokenize import TweetTokenizer
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
def remove_stopwords(text, is_lower_case=False):
    """Return ``text`` with English stopwords removed.

    The text is split with NLTK's ``TweetTokenizer``, every token is stripped
    of surrounding whitespace, and tokens found in NLTK's English stopword
    list are dropped. The surviving tokens are joined with single spaces.

    Args:
        text: The string to filter.
        is_lower_case: When True, tokens are compared against the stopword
            list as-is; when False (default), each token is lowercased for
            the comparison only (the returned tokens keep their case).

    Returns:
        The filtered tokens joined by ``' '``.
    """
    tokenizer = TweetTokenizer()
    english_stopwords = nltk.corpus.stopwords.words('english')
    stripped = [piece.strip() for piece in tokenizer.tokenize(text)]

    def is_stopword(piece):
        # Only the membership test is case-normalised; output is untouched.
        candidate = piece if is_lower_case else piece.lower()
        return candidate in english_stopwords

    return ' '.join(piece for piece in stripped if not is_stopword(piece))

In [None]:
def clean_df(file):
    """Load a tab-separated tweet file and normalise its text.

    The file is read without a header row into the columns ``id``,
    ``polarity`` and ``tweet``. Duplicate rows are dropped, stopwords are
    removed from every tweet via ``remove_stopwords`` (default arguments),
    the tweets are lowercased, and the index is reset to 0..n-1.

    Args:
        file: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        The cleaned ``DataFrame``.
    """
    # NOTE(review): read_csv uses its default quote handling here; tweets that
    # contain unbalanced double quotes may be parsed across line boundaries.
    # Confirm against the raw files whether quoting should be disabled.
    frame = pd.read_csv(file, sep='\t', names=["id", "polarity", "tweet"]).drop_duplicates()
    frame['tweet'] = frame['tweet'].apply(remove_stopwords).str.lower()
    return frame.reset_index(drop=True)

In [None]:
def add_neg_suffix_to_df(data, tokenizer=None):
    """Tag negated tokens and labels of every row of ``data`` in place.

    Each tweet is tokenised. A negation scope opens at a negation word (one
    of ``neg_words`` or any token ending in ``n't``) and closes at the first
    following token that contains one of ``. ? ; : ! ,``. Every token inside
    the scope, including the negation word itself, is lowercased and gets the
    suffix ``_NEG``; the token that closes the scope is left untouched. If at
    least one token of a tweet was tagged, the row's ``polarity`` also gets
    the suffix ``_NEG`` (once).

    Tweets with tagged tokens are rebuilt by joining their tokens with single
    spaces, so each token is tagged exactly once and only at its own
    position. (A whole-string ``str.replace`` would also rewrite earlier
    occurrences and substrings of other words, and would stack suffixes on
    repeated words.) Tweets without any negation are left exactly as they
    were.

    Args:
        data: DataFrame with string columns ``tweet`` and ``polarity``. Both
            columns are overwritten on this object.
        tokenizer: Optional object with a ``tokenize(text) -> list[str]``
            method. Defaults to a fresh ``TweetTokenizer``.

    Returns:
        None. ``data`` is modified in place.
    """
    neg_words = frozenset([
        "never", "no", "nothing", "nowhere", "noone", "none", "not", "havent", "hasnt", "hadnt", "cant", "couldnt",
        "shouldnt", "wont", "wouldnt", "dont", "doesnt", "didnt", "isnt", "arent", "aint",
    ])
    # The leading '.*' makes re.match succeed for any token that CONTAINS one
    # of these characters, not only tokens that start with one.
    punctuations = '.*[.?;:!,]+'
    if tokenizer is None:
        tokenizer = TweetTokenizer()

    new_tweets = []
    new_polarities = []
    # Iterate over values so the function does not depend on a 0..n-1 index.
    for tweet, polarity in zip(data['tweet'], data['polarity']):
        is_negated = False
        any_tagged = False
        rebuilt = []
        for raw_token in tokenizer.tokenize(tweet):
            token = raw_token.lower()
            if token in neg_words or token.endswith("n't"):
                is_negated = True
            if is_negated and re.match(punctuations, token):
                # Punctuation closes the scope and is itself not tagged.
                is_negated = False
            if is_negated:
                rebuilt.append(token + "_NEG")
                any_tagged = True
            else:
                rebuilt.append(raw_token)

        if any_tagged:
            new_tweets.append(' '.join(rebuilt))
            new_polarities.append(polarity + "_NEG")
        else:
            new_tweets.append(tweet)
            new_polarities.append(polarity)

    # Whole-column assignment instead of chained ``data[col][i] = ...``, which
    # pandas does not guarantee to write through to ``data``.
    data['tweet'] = new_tweets
    data['polarity'] = new_polarities

In [None]:
def get_word_dict(only_tweet_data):
    """Count every unigram and bigram across a collection of tweets.

    Each tweet is tokenised with ``TweetTokenizer`` and expanded with
    ``nltk.everygrams(..., max_len=2)``; the n-grams (tuples of one or two
    tokens) are counted over all tweets.

    Args:
        only_tweet_data: Iterable of tweet strings (e.g. a pandas Series or a
            list). Values are iterated directly, so a Series does not need a
            0..n-1 index.

    Returns:
        dict mapping each n-gram tuple to its total number of occurrences,
        in first-seen order.
    """
    tweet_tokenizer = TweetTokenizer()
    all_word_dict = dict()
    for tweet in only_tweet_data:
        for gram_combination in nltk.everygrams(tweet_tokenizer.tokenize(tweet), max_len=2):
            all_word_dict[gram_combination] = all_word_dict.get(gram_combination, 0) + 1
    return all_word_dict

In [None]:
def get_feature_array(only_tweet_data, min_unigram_count=5, min_bigram_count=7):
    """Select the n-grams frequent enough to be used as features.

    Counts come from ``get_word_dict``. A unigram is kept when it occurs at
    least ``min_unigram_count`` times, a bigram when it occurs at least
    ``min_bigram_count`` times. N-grams of any other length are discarded.

    Args:
        only_tweet_data: Collection of tweet strings passed to
            ``get_word_dict``.
        min_unigram_count: Minimum corpus count for one-token n-grams
            (default 5).
        min_bigram_count: Minimum corpus count for two-token n-grams
            (default 7).

    Returns:
        List of n-gram tuples, in the iteration order of the count dict.
    """
    min_count_by_length = {1: min_unigram_count, 2: min_bigram_count}
    word_dict = get_word_dict(only_tweet_data)
    return [
        gram
        for gram, occurrences in word_dict.items()
        if len(gram) in min_count_by_length and occurrences >= min_count_by_length[len(gram)]
    ]

In [None]:
def get_feature(only_tweet_data, features_array):
    """Build one n-gram count vector per tweet.

    Every tweet is tokenised with ``TweetTokenizer`` and expanded into its
    unigrams and bigrams (``nltk.everygrams`` with ``max_len=2``). The vector
    for a tweet holds, for each entry of ``features_array`` in order, how
    often that n-gram occurs in the tweet (0 when absent).

    Args:
        only_tweet_data: Iterable of tweet strings.
        features_array: Sequence of n-gram tuples defining the columns.

    Returns:
        List of lists of ints: one row per tweet, one column per feature.
    """
    tokenizer = TweetTokenizer()
    rows = []
    for text in only_tweet_data:
        occurrences = {}
        for gram in nltk.everygrams(tokenizer.tokenize(text), max_len=2):
            occurrences[gram] = occurrences.get(gram, 0) + 1
        rows.append([occurrences.get(feature, 0) for feature in features_array])
    return rows

In [None]:
# Training split: load and clean, tag negations, build the n-gram vocabulary
# and count vectors, encode the labels as integers, standardise the features.
train_data = clean_df("./data/dataset/twitter-2013train-A.txt")
add_suffix_train_data = train_data.copy()

add_neg_suffix_to_df(add_suffix_train_data)
only_tweet_add_suffix_train_data = add_suffix_train_data['tweet']
features_array = get_feature_array(only_tweet_add_suffix_train_data)
train_n_gram_add_suffix_feature = get_feature(only_tweet_add_suffix_train_data, features_array)

# Integer class ids: 0-2 for plain labels, 3-5 for their negation-tagged variants.
label_to_id = {
    "neutral": 0,
    "negative": 1,
    "positive": 2,
    "neutral_NEG": 3,
    "negative_NEG": 4,
    "positive_NEG": 5,
}
train_labels = add_suffix_train_data.polarity
# An unrecognised label must not be skipped silently: that would leave fewer
# labels than feature rows and shift every following label by one row.
unknown_labels = sorted(set(train_labels) - set(label_to_id), key=str)
if unknown_labels:
    raise ValueError("Unexpected polarity labels in training data: %r" % (unknown_labels,))
result = [label_to_id[x] for x in train_labels]
train_labels = np.array(result)

# Fit the scaler on the training features.
scaler = StandardScaler()
train_n_gram_add_suffix_feature = scaler.fit_transform(train_n_gram_add_suffix_feature)
train_features = np.array(train_n_gram_add_suffix_feature)


print("train labels: ", train_labels) 
print("train features:", train_features) 
print("train labels shape: ", train_labels.shape) 
print("train features shape:", train_features.shape)



In [None]:
# Held-out split for the experiments in the following cells.
# NOTE: the file loaded here is the 2013 *test* split, although the variables
# are named dev_*.
dev_data = clean_df("./data/dataset/twitter-2013test-A.txt")
add_suffix_dev_data = dev_data.copy()

add_neg_suffix_to_df(add_suffix_dev_data)
only_tweet_add_suffix_dev_data = add_suffix_dev_data['tweet']
# The n-gram vocabulary always comes from the TRAINING tweets so that the
# columns line up with the features the classifier is fitted on.
features_array = get_feature_array(only_tweet_add_suffix_train_data)
dev_n_gram_add_suffix_feature = get_feature(only_tweet_add_suffix_dev_data, features_array)

# Integer class ids: 0-2 for plain labels, 3-5 for their negation-tagged variants.
label_to_id = {
    "neutral": 0,
    "negative": 1,
    "positive": 2,
    "neutral_NEG": 3,
    "negative_NEG": 4,
    "positive_NEG": 5,
}
dev_labels = add_suffix_dev_data.polarity
# An unrecognised label must not be skipped silently: that would leave fewer
# labels than feature rows and shift every following label by one row.
unknown_labels = sorted(set(dev_labels) - set(label_to_id), key=str)
if unknown_labels:
    raise ValueError("Unexpected polarity labels in held-out data: %r" % (unknown_labels,))
result = [label_to_id[x] for x in dev_labels]

dev_labels = np.array(result)
# Standardise with TRAINING statistics. Fitting a scaler on the held-out
# features would centre/scale them differently from the data the model saw.
# The raw training counts are recomputed here because the training cell
# overwrites them with their scaled version.
scaler = StandardScaler()
scaler.fit(get_feature(only_tweet_add_suffix_train_data, features_array))
dev_n_gram_add_suffix_feature = scaler.transform(dev_n_gram_add_suffix_feature)
dev_features = np.array(dev_n_gram_add_suffix_feature)

print("dev labels: ", dev_labels) 
print("dev features:", dev_features) 
print("dev_labels shape: ", dev_labels.shape) 
print("dev_features shape:", dev_features.shape) 

In [None]:
# Dump the string form of every held-out feature row to a text file.
# The context manager guarantees the handle is closed even if a write fails.
# NOTE(review): rows are written back to back with no separator, and str() of
# a long NumPy row is abbreviated with "..." — confirm this file is only a
# rough dump; np.savetxt would give a lossless, line-per-row file.
with open("n_gram_neg_suffix_vector.txt", "w+") as file:
    for i in dev_n_gram_add_suffix_feature:
        content = str(i)
        file.write(content)

In [None]:

# Class ids produced by the label encoding and their display names.
# Passing ``labels`` explicitly keeps classification_report working (with
# zero-support rows) even when a class is missing from the held-out data.
class_ids = [0, 1, 2, 3, 4, 5]
class_names = ['neutral', 'negative', 'positive', 'neutral_NEG', 'negative_NEG', 'positive_NEG']

# One entry per experiment: (C, {class id: sample weight}).
# Class 1 is "negative", class 2 is "positive"; unlisted classes weigh 1.
svm_experiments = [
    (0.005, {1: 3.14}),
    (1, {1: 3.14}),
    (0.005, {1: 3.14, 2: 1.25}),
    (1, {1: 3.14, 2: 1.25}),
]

for c_value, class_weights in svm_experiments:
    clf = SVC(kernel='linear', C=c_value, probability=True)

    sample_weight = np.array([class_weights.get(label, 1) for label in train_labels])

    clf.fit(train_features, train_labels, sample_weight = sample_weight)

    predictions = clf.predict(dev_features)

    header = ("A Classification Report showing the per-class Precision, Recall and F1-score \n\n"
              "C=" + str(c_value) + " \n"
              "Negative weight = " + str(class_weights[1]) + " \n")
    if 2 in class_weights:
        header += "Positive weight = " + str(class_weights[2]) + " \n"
    header += "\n"

    print(header, metrics.classification_report(dev_labels, predictions, labels=class_ids, target_names=class_names))


# Unweighted grid search over C and kernel; the best model is refitted on
# the full training set and evaluated on the held-out features.
param_grid = {'C': [0.005, 0.1, 0.5, 1], 
              'kernel': ['linear','rbf']} 
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
  
grid.fit(train_features, train_labels)

print(grid.best_params_)
  
print(grid.best_estimator_)

grid_predictions = grid.predict(dev_features)
  
print(classification_report(dev_labels, grid_predictions))

In [None]:
# Second held-out split (the 2013 dev file); replaces the dev_* variables
# used by the experiments in the next cell.
dev_data = clean_df("./data/dataset/twitter-2013dev-A.txt")
add_suffix_dev_data = dev_data.copy()

add_neg_suffix_to_df(add_suffix_dev_data)
only_tweet_add_suffix_dev_data = add_suffix_dev_data['tweet']
# The n-gram vocabulary always comes from the TRAINING tweets so that the
# columns line up with the features the classifier is fitted on.
features_array = get_feature_array(only_tweet_add_suffix_train_data)
dev_n_gram_add_suffix_feature = get_feature(only_tweet_add_suffix_dev_data, features_array)

# Integer class ids: 0-2 for plain labels, 3-5 for their negation-tagged variants.
label_to_id = {
    "neutral": 0,
    "negative": 1,
    "positive": 2,
    "neutral_NEG": 3,
    "negative_NEG": 4,
    "positive_NEG": 5,
}
dev_labels = add_suffix_dev_data.polarity
# An unrecognised label must not be skipped silently: that would leave fewer
# labels than feature rows and shift every following label by one row.
unknown_labels = sorted(set(dev_labels) - set(label_to_id), key=str)
if unknown_labels:
    raise ValueError("Unexpected polarity labels in held-out data: %r" % (unknown_labels,))
result = [label_to_id[x] for x in dev_labels]

dev_labels = np.array(result)
# Standardise with TRAINING statistics. Fitting a scaler on the held-out
# features would centre/scale them differently from the data the model saw.
# The raw training counts are recomputed here because the training cell
# overwrites them with their scaled version.
scaler = StandardScaler()
scaler.fit(get_feature(only_tweet_add_suffix_train_data, features_array))
dev_n_gram_add_suffix_feature = scaler.transform(dev_n_gram_add_suffix_feature)
dev_features = np.array(dev_n_gram_add_suffix_feature)

print("dev labels: ", dev_labels) 
print("dev features:", dev_features) 
print("dev_labels shape: ", dev_labels.shape) 
print("dev_features shape:", dev_features.shape) 

In [None]:

# Class ids produced by the label encoding and their display names.
# Passing ``labels`` explicitly keeps classification_report working (with
# zero-support rows) even when a class is missing from the held-out data.
class_ids = [0, 1, 2, 3, 4, 5]
class_names = ['neutral', 'negative', 'positive', 'neutral_NEG', 'negative_NEG', 'positive_NEG']

# One entry per experiment: (C, {class id: sample weight}).
# Class 1 is "negative", class 2 is "positive"; unlisted classes weigh 1.
svm_experiments = [
    (0.005, {1: 3.14}),
    (1, {1: 3.14}),
    (0.005, {1: 3.14, 2: 1.25}),
    (1, {1: 3.14, 2: 1.25}),
]

for c_value, class_weights in svm_experiments:
    clf = SVC(kernel='linear', C=c_value, probability=True)

    sample_weight = np.array([class_weights.get(label, 1) for label in train_labels])

    clf.fit(train_features, train_labels, sample_weight = sample_weight)

    predictions = clf.predict(dev_features)

    header = ("A Classification Report showing the per-class Precision, Recall and F1-score \n\n"
              "C=" + str(c_value) + " \n"
              "Negative weight = " + str(class_weights[1]) + " \n")
    if 2 in class_weights:
        header += "Positive weight = " + str(class_weights[2]) + " \n"
    header += "\n"

    print(header, metrics.classification_report(dev_labels, predictions, labels=class_ids, target_names=class_names))

