In [None]:
from nltk.tokenize import TweetTokenizer
import pandas as pd
import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.svm import SVC
import re
from sklearn.preprocessing import StandardScaler

In [None]:
# Stopword removal.
# The English stopword corpus is loaded lazily and kept for the lifetime of the
# kernel, so applying remove_stopwords to every tweet does not re-read it.
_STOPWORDS_CACHE = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading the corpus once."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        _STOPWORDS_CACHE = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOPWORDS_CACHE


def remove_stopwords(text, is_lower_case=False):
    """Tokenize ``text`` with ``TweetTokenizer`` and drop English stopwords.

    Parameters
    ----------
    text : str
        The tweet to filter.
    is_lower_case : bool, default False
        When true, tokens are compared with the stopword list exactly as they
        appear. Otherwise each token is lower-cased for the comparison only;
        the tokens that are kept retain their original casing.

    Returns
    -------
    str
        The surviving tokens joined with single spaces.
    """
    tweet_tokenizer = TweetTokenizer()
    stopword_set = _english_stopwords()
    tokens = [token.strip() for token in tweet_tokenizer.tokenize(text)]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_set]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_set]
    return ' '.join(filtered_tokens)

In [None]:
def clean_df(file):
    """Load a tab-separated tweet file and normalise its tweet text.

    The file is read without a header row into the columns ``id``,
    ``polarity`` and ``tweet``. Duplicate rows are dropped, stopwords are
    removed from every tweet with ``remove_stopwords``, the result is
    lower-cased, and the index is renumbered from zero.

    Parameters
    ----------
    file : path or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``polarity`` and the cleaned ``tweet``.
    """
    column_names = ["id", "polarity", "tweet"]
    unique_rows = pd.read_csv(file, sep='\t', names=column_names).drop_duplicates()
    # NOTE(review): NLTK's English stopword list appears to include negation
    # cues such as "not", "no" and "don't"; if so they are removed here before
    # get_feature looks for them -- confirm this is intended.
    cleaned_tweets = unique_rows['tweet'].apply(remove_stopwords).str.lower()
    return unique_rows.assign(tweet=cleaned_tweets).reset_index(drop=True)

In [None]:
def _negation_spans(tokens, neg_words, scope_end):
    """Return ``(start, end)`` token-index pairs, one per negation scope in ``tokens``."""
    spans = []
    open_start = None  # index of the cue that opened the current scope, if any
    for idx, raw_token in enumerate(tokens):
        token = raw_token.lower()
        if token in neg_words or token.endswith("n't"):
            # A cue inside an already open scope does not start a second one;
            # this keeps every start paired with exactly one end.
            if open_start is None:
                open_start = idx
        elif open_start is not None and scope_end.match(token):
            # The scope ends on the token just before the punctuation.
            spans.append((open_start, idx - 1))
            open_start = None
    if open_start is not None:
        # No closing punctuation: the scope runs to the last token. Tracking
        # the open state (rather than comparing indices) also covers a cue at
        # position 0.
        spans.append((open_start, len(tokens) - 1))
    return spans


def get_feature(only_tweet_data, output_path="negation_result.csv", max_length=None, tokenizer=None):
    """Build negation-scope features for an iterable of tweets.

    A negation scope starts at a negation cue (a word from a fixed list, or
    any token ending in ``n't``) and ends on the token before the next token
    that matches the punctuation pattern, or on the last token of the tweet
    when no such token follows.

    For each tweet the feature row is
    ``[neg_context, start_0, end_0, start_1, end_1, ...]`` where
    ``neg_context`` is the sum of ``end - start`` over all scopes. A tweet
    whose ``neg_context`` is 0 gets an empty row. Rows are then right-padded
    with zeros to a common width.

    Parameters
    ----------
    only_tweet_data : iterable of str
        The tweets to process.
    output_path : str, default "negation_result.csv"
        File that receives one line per tweet (the unpadded row, each value
        followed by a space). The file is overwritten on every call.
    max_length : int or None, default None
        Width of the returned rows. ``None`` uses the longest row found in
        this call. An explicit value pads shorter rows and truncates longer
        ones, so different data splits can share one feature width.
    tokenizer : object with a ``tokenize(str)`` method, or None
        Tokenizer to use; ``None`` creates a ``TweetTokenizer``.

    Returns
    -------
    list of list of int
        One fixed-width row per tweet.
    """
    neg_words = frozenset((
        "never", "no", "nothing", "nowhere", "noone", "none", "not", "havent", "hasnt", "hadnt", "cant", "couldnt",
        "shouldnt", "wont", "wouldnt", "dont", "doesnt", "didnt", "isnt", "arent", "aint",
    ))
    # Matches any token that contains at least one of . ? ; : ! ,
    scope_end = re.compile('.*[.?;:!,]+')
    if tokenizer is None:
        tokenizer = TweetTokenizer()

    result = []
    # The with-block guarantees the file is closed even if tokenizing fails.
    with open(output_path, "w", encoding="utf-8") as output:
        for tweet in only_tweet_data:
            spans = _negation_spans(tokenizer.tokenize(tweet), neg_words, scope_end)
            neg_context = sum(end - start for start, end in spans)

            row = []
            if neg_context != 0:
                row.append(neg_context)
                for start, end in spans:
                    row.extend((start, end))
                output.write("".join("%s " % value for value in row))
            output.write("\n")
            result.append(row)

    if max_length is None:
        max_length = max((len(row) for row in result), default=0)
    return [row[:max_length] + [0] * (max_length - len(row)) for row in result]

In [None]:
# Load and clean the training split, then derive negation features and labels.
train_data = clean_df("./data/dataset/twitter-2013train-A.txt")
only_tweet_train_data = train_data['tweet']

train_negation_feature = get_feature(only_tweet_train_data)

# Encode polarity as integers. An unknown or missing label raises here instead
# of being skipped: a skipped row would leave the label array shorter than the
# feature matrix and shift every later label onto the wrong tweet.
label_ids = {"neutral": 0, "negative": 1, "positive": 2}
train_labels = train_data.polarity.map(label_ids)
if train_labels.isna().any():
    unexpected = train_data.polarity[train_labels.isna()].unique()
    raise ValueError("unexpected polarity labels in training data: %r" % list(unexpected))
train_labels = train_labels.to_numpy(dtype=int)

# Standardise each feature column using statistics of the training split.
scaler = StandardScaler()
train_negation_feature = scaler.fit_transform(train_negation_feature)
train_features = np.array(train_negation_feature)

print("train labels: ", train_labels)
print("train features:", train_features)
print("train labels shape: ", train_labels.shape)
print("train features shape:", train_features.shape)
    

In [None]:
# Load and clean the test split, then derive labels and negation features.
test_data = clean_df("./data/dataset/twitter-2013test-A.txt")
only_tweet_test_data = test_data['tweet']

# Encode polarity as integers; unknown or missing labels raise instead of
# being skipped, which would misalign labels and feature rows.
label_ids = {"neutral": 0, "negative": 1, "positive": 2}
test_labels = test_data.polarity.map(label_ids)
if test_labels.isna().any():
    unexpected = test_data.polarity[test_labels.isna()].unique()
    raise ValueError("unexpected polarity labels in test data: %r" % list(unexpected))
test_labels = test_labels.to_numpy(dtype=int)

# The training cell keeps only the scaled training matrix, so the unscaled
# training features are recomputed here. They provide both the feature width
# the classifier is trained on and the statistics used for scaling.
# (get_feature is called for the test split last, so negation_result.csv ends
# up holding the test split's output.)
train_raw_feature = np.array(get_feature(only_tweet_train_data), dtype=float)
test_rows = get_feature(only_tweet_test_data)

# get_feature pads rows to the longest row of the split it is given, so the
# test width can differ from the training width. Align it: pad with zeros or
# drop trailing columns the model has never seen.
n_features = train_raw_feature.shape[1]
test_raw_feature = np.zeros((len(test_rows), n_features))
for row_idx, row in enumerate(test_rows):
    kept = row[:n_features]
    test_raw_feature[row_idx, :len(kept)] = kept

# Scale with training statistics only; fitting a scaler on the test split
# would put it on a different scale from the data the model is trained on.
scaler = StandardScaler().fit(train_raw_feature)
test_negation_feature = scaler.transform(test_raw_feature)
test_features = np.array(test_negation_feature)

print("test labels: ", test_labels)
print("test features:", test_features)
print("test_labels shape: ", test_labels.shape)
print("test_features shape:", test_features.shape)

In [None]:
# Dump the scaled test feature rows to a text file, one row per record.
# The with-block closes the file even if a write fails, and the newline keeps
# consecutive rows from running into each other.
with open("negation_vector.txt", "w") as file:
    for i in test_negation_feature:
        file.write(str(i) + "\n")

In [None]:
# Load and clean the dev split, then derive labels and negation features.
dev_data = clean_df("./data/dataset/twitter-2013dev-A.txt")
only_tweet_dev_data = dev_data['tweet']

# Encode polarity as integers; unknown or missing labels raise instead of
# being skipped, which would misalign labels and feature rows.
label_ids = {"neutral": 0, "negative": 1, "positive": 2}
dev_labels = dev_data.polarity.map(label_ids)
if dev_labels.isna().any():
    unexpected = dev_data.polarity[dev_labels.isna()].unique()
    raise ValueError("unexpected polarity labels in dev data: %r" % list(unexpected))
dev_labels = dev_labels.to_numpy(dtype=int)

# The training cell keeps only the scaled training matrix, so the unscaled
# training features are recomputed here. They provide both the feature width
# the classifier is trained on and the statistics used for scaling.
# (get_feature is called for the dev split last, so negation_result.csv ends
# up holding the dev split's output.)
train_raw_feature = np.array(get_feature(only_tweet_train_data), dtype=float)
dev_rows = get_feature(only_tweet_dev_data)

# get_feature pads rows to the longest row of the split it is given, so the
# dev width can differ from the training width, which would make
# clf.predict(dev_features) fail. Align it: pad with zeros or drop trailing
# columns the model has never seen.
n_features = train_raw_feature.shape[1]
dev_raw_feature = np.zeros((len(dev_rows), n_features))
for row_idx, row in enumerate(dev_rows):
    kept = row[:n_features]
    dev_raw_feature[row_idx, :len(kept)] = kept

# Scale with training statistics only; fitting a scaler on the dev split
# would put it on a different scale from the data the model is trained on.
scaler = StandardScaler().fit(train_raw_feature)
dev_negation_feature = scaler.transform(dev_raw_feature)
dev_features = np.array(dev_negation_feature)

print("dev labels: ", dev_labels)
print("dev features:", dev_features)
print("dev_labels shape: ", dev_labels.shape)
print("dev_features shape:", dev_features.shape)

In [None]:

# Linear SVM with C=0.005, evaluated on the dev split.
clf = SVC(C=0.005, kernel='linear', probability=True)

# Samples labelled 1 ("negative") get weight 3.14; all others keep weight 1.
weight_by_label = {1: 3.14}
sample_weight = np.array([weight_by_label.get(label, 1) for label in train_labels])

clf.fit(train_features, train_labels, sample_weight=sample_weight)
predictions = clf.predict(dev_features)

# target_names follow the integer encoding 0=neutral, 1=negative, 2=positive.
report = metrics.classification_report(
    dev_labels,
    predictions,
    target_names=['neutral', 'negative', 'positive'],
)
print("A Classification Report showing the per-class Precision, Recall and F1-score \n\n", report)

In [None]:

# Linear SVM with C=1, evaluated on the dev split.
clf = SVC(C=1, kernel='linear', probability=True)

# Samples labelled 1 ("negative") get weight 3.14; all others keep weight 1.
weight_by_label = {1: 3.14}
sample_weight = np.array([weight_by_label.get(label, 1) for label in train_labels])

clf.fit(train_features, train_labels, sample_weight=sample_weight)
predictions = clf.predict(dev_features)

# target_names follow the integer encoding 0=neutral, 1=negative, 2=positive.
report = metrics.classification_report(
    dev_labels,
    predictions,
    target_names=['neutral', 'negative', 'positive'],
)
print("A Classification Report showing the per-class Precision, Recall and F1-score \n\n", report)

In [None]:

# Linear SVM with C=0.005, evaluated on the dev split.
clf = SVC(C=0.005, kernel='linear', probability=True)

# Per-sample weights: label 1 ("negative") -> 3.14, label 2 ("positive") -> 1.25,
# anything else -> 1.
weight_by_label = {1: 3.14, 2: 1.25}
arr = [weight_by_label.get(label, 1) for label in train_labels]

sample_weight = np.array(arr)
clf.fit(train_features, train_labels, sample_weight=sample_weight)
predictions = clf.predict(dev_features)

# target_names follow the integer encoding 0=neutral, 1=negative, 2=positive.
report = metrics.classification_report(
    dev_labels,
    predictions,
    target_names=['neutral', 'negative', 'positive'],
)
print("A Classification Report showing the per-class Precision, Recall and F1-score \n\n", report)

In [None]:

# Linear SVM with C=1, evaluated on the dev split.
clf = SVC(C=1, kernel='linear', probability=True)

# Per-sample weights: label 1 ("negative") -> 3.14, label 2 ("positive") -> 1.25,
# anything else -> 1.
weight_by_label = {1: 3.14, 2: 1.25}
arr = [weight_by_label.get(label, 1) for label in train_labels]

sample_weight = np.array(arr)
clf.fit(train_features, train_labels, sample_weight=sample_weight)
predictions = clf.predict(dev_features)

# target_names follow the integer encoding 0=neutral, 1=negative, 2=positive.
report = metrics.classification_report(
    dev_labels,
    predictions,
    target_names=['neutral', 'negative', 'positive'],
)
print("A Classification Report showing the per-class Precision, Recall and F1-score \n\n", report)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over regularisation strength and kernel for an unweighted
# SVC; the best configuration is refitted on the whole training set.
param_grid = {
    'C': [0.005, 0.1, 0.5, 1],
    'kernel': ['linear', 'rbf'],
}
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=3)
grid.fit(train_features, train_labels)

# Show the winning parameter combination, then the refitted estimator.
for fitted_summary in (grid.best_params_, grid.best_estimator_):
    print(fitted_summary)

# Evaluate the refitted best estimator on the dev split.
grid_predictions = grid.predict(dev_features)
print(classification_report(dev_labels, grid_predictions))