In [None]:
import csv
import re
from collections import OrderedDict, defaultdict, Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# Strip English stopwords from a piece of text.
def remove_stopwords(text, is_lower_case=False):
    """Tokenize ``text`` with ``TweetTokenizer`` and drop English stopwords.

    Args:
        text: The text to filter.
        is_lower_case: When True, tokens are compared against the stopword
            list exactly as tokenized. When False (the default), each token is
            lower-cased for the comparison only. Surviving tokens keep their
            original casing in both modes.

    Returns:
        The surviving tokens joined with single spaces.
    """
    tweet_tokenizer = TweetTokenizer()
    # A set makes each membership check constant-time instead of a scan over
    # the whole stopword list for every token.
    stopword_set = set(nltk.corpus.stopwords.words('english'))
    tokens = [token.strip() for token in tweet_tokenizer.tokenize(text)]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_set]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_set]
    return ' '.join(filtered_tokens)

In [None]:
def clean_df(file):
    """Load a tab-separated tweet file and normalise its text column.

    The file is read without a header row into the columns ``id``,
    ``polarity`` and ``tweet``. Exact duplicate rows are dropped, stopwords
    are removed from every tweet via ``remove_stopwords``, and the result is
    lower-cased.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame with columns ``id``, ``polarity`` and ``tweet`` and a
        fresh 0..n-1 index.
    """
    data = pd.read_csv(file, sep='\t', names=["id", "polarity", "tweet"])
    data = data.drop_duplicates().reset_index(drop=True)
    # Lower-case the actual "tweet" column. The earlier code assigned the
    # lower-cased text to a misspelled "tweaet" column, leaving "tweet" as-is.
    data["tweet"] = data["tweet"].apply(remove_stopwords).str.lower()
    return data

In [None]:
def generate_emotion_count(string, tokenizer):
    """Tally lexicon categories hit by the tokens of ``string``.

    Every token produced by ``tokenizer.tokenize(string)`` is lower-cased and
    looked up in the module-level ``word_list`` mapping. Each category listed
    for a matching token adds one to that category's tally; categories outside
    the ten tracked ones are ignored.

    Args:
        string: Text to analyse.
        tokenizer: Object exposing ``tokenize(str)`` that yields tokens.

    Returns:
        A dict with the keys anger, anticipation, disgust, fear, joy,
        negative, positive, sadness, surprise and trust (in that order), each
        mapped to its count.
    """
    tallies = {
        'anger': 0,
        'anticipation': 0,
        'disgust': 0,
        'fear': 0,
        'joy': 0,
        'negative': 0,
        'positive': 0,
        'sadness': 0,
        'surprise': 0,
        'trust': 0,
    }
    for raw_token in tokenizer.tokenize(string):
        key = raw_token.lower()
        if key not in word_list:
            continue
        for category in word_list[key]:
            if category in tallies:
                tallies[category] += 1
    return tallies

In [None]:
def get_feature(only_tweet_data, tokenizer):
    """Turn a Series of tweets into a 2-D array of emotion counts.

    Args:
        only_tweet_data: Series of tweet strings; its index is reused for the
            intermediate DataFrame.
        tokenizer: Tokenizer handed through to ``generate_emotion_count``.

    Returns:
        A NumPy array with one row per tweet and one column per key of the
        dicts returned by ``generate_emotion_count`` (missing values become 0).
    """
    per_tweet_counts = []
    for tweet in only_tweet_data:
        per_tweet_counts.append(generate_emotion_count(tweet, tokenizer))
    return (
        pd.DataFrame(per_tweet_counts, index=only_tweet_data.index)
        .fillna(0)
        .to_numpy()
    )

In [None]:
# Build two lookups from the NRC word-level emotion lexicon file:
#   word_list:   word     -> categories whose flag is 1 for that word
#   emotionList: category -> words whose flag is 1 for that category
word_list = defaultdict(list)
emotionList = defaultdict(list)
# newline='' is what the csv module asks for when it is given a file object.
with open('./data/NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', 'r', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    # Skip the first row, which this notebook treats as a header; the default
    # keeps an empty file from raising StopIteration.
    next(reader, None)
    for row in reader:
        if not row:
            # A blank line yields an empty row and would break the unpack below.
            continue
        word, emotion, present = row
        if int(present) == 1:
            word_list[word].append(emotion)
            emotionList[emotion].append(word)

In [None]:
# Load the training split and turn each tweet into an emotion-count vector.
tweet_tokenizer = TweetTokenizer()
train_data = clean_df("./data/dataset/twitter-2013train-A.txt")
only_tweet_train_data = train_data['tweet']
train_emotion_feature = get_feature(only_tweet_train_data, tweet_tokenizer)

# Integer class ids used by every classifier below.
POLARITY_TO_ID = {"neutral": 0, "negative": 1, "positive": 2}
train_label_ids = train_data.polarity.map(POLARITY_TO_ID)
if train_label_ids.isna().any():
    # An unrecognised polarity used to be skipped silently, leaving fewer
    # labels than feature rows; fail here with the offending values instead.
    unexpected = sorted(set(train_data.polarity[train_label_ids.isna()].astype(str)))
    raise ValueError(f"Unexpected polarity value(s) in training data: {unexpected}")
train_labels = train_label_ids.to_numpy(dtype=int)

# Fit the scaler on the training features; `scaler` is kept so other splits
# can be transformed with the same statistics.
scaler = StandardScaler()
train_emotion_feature = scaler.fit_transform(train_emotion_feature)
train_features = np.array(train_emotion_feature)

print("train labels: ", train_labels)
print("train features:", train_features)
print("train labels shape: ", train_labels.shape)
print("train features shape:", train_features.shape)

In [None]:
# Load the test split and turn each tweet into an emotion-count vector.
# Requires the training cell to have run (uses `tweet_tokenizer` and `scaler`).
test_data = clean_df("./data/dataset/twitter-2013test-A.txt")
only_tweet_test_data = test_data['tweet']
test_emotion_feature = get_feature(only_tweet_test_data, tweet_tokenizer)

# Integer class ids, matching the training cell.
POLARITY_TO_ID = {"neutral": 0, "negative": 1, "positive": 2}
test_label_ids = test_data.polarity.map(POLARITY_TO_ID)
if test_label_ids.isna().any():
    # An unrecognised polarity used to be skipped silently, leaving fewer
    # labels than feature rows; fail here with the offending values instead.
    unexpected = sorted(set(test_data.polarity[test_label_ids.isna()].astype(str)))
    raise ValueError(f"Unexpected polarity value(s) in test data: {unexpected}")
test_labels = test_label_ids.to_numpy(dtype=int)

# Reuse the scaler fitted on the training features. Fitting a fresh scaler on
# the test split would put test features on a different scale from the one
# the classifiers were trained on.
test_emotion_feature = scaler.transform(test_emotion_feature)
test_features = np.array(test_emotion_feature)

print("test labels: ", test_labels)
print("test features:", test_features)
print("test_labels shape: ", test_labels.shape)
print("test_features shape:", test_features.shape)

In [None]:
# Dump the scaled test-set emotion vectors to a text file. The context manager
# closes the file even if a write raises.
# NOTE(review): rows are written back-to-back with no separator between them;
# kept as-is so the file contents do not change - confirm this is intended.
with open("emotion_vector.txt", "w+") as out_file:
    for row in test_emotion_feature:
        out_file.write(str(row))

In [None]:

# Linear-SVM runs evaluated on the test split. Each entry is
# (C, {class id: sample weight}); classes not listed get weight 1.
# Class ids: 0 = neutral, 1 = negative, 2 = positive.
svm_runs = [
    (0.005, {1: 3.14}),
    (1, {1: 3.14}),
    (0.005, {1: 3.14, 2: 1.25}),
    (1, {1: 3.14, 2: 1.25}),
]

for c_value, class_weights in svm_runs:
    clf = SVC(kernel='linear', C=c_value, probability=True)
    sample_weight = np.array([class_weights.get(int(label), 1) for label in train_labels])
    clf.fit(train_features, train_labels, sample_weight=sample_weight)
    predictions = clf.predict(test_features)

    # Rebuild the same report header text the individual runs printed.
    weight_lines = f"Negative weight = {class_weights[1]} \n"
    if 2 in class_weights:
        weight_lines += f"Positive weight = {class_weights[2]} \n"
    print(
        "A Classification Report showing the per-class Precision, Recall and F1-score \n\n"
        f"C={c_value} \n{weight_lines}\n",
        metrics.classification_report(test_labels, predictions, target_names=['neutral', 'negative', 'positive']),
    )

# Cross-validated search over C and kernel on the training split.
param_grid = {'C': [0.005, 0.1, 0.5, 1],
              'kernel': ['linear', 'rbf']}

grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(train_features, train_labels)

print(grid.best_params_)
print(grid.best_estimator_)

# Score on the test split like the rest of this cell. The dev features and
# labels are only created in a later cell, so referencing them here fails on
# a fresh kernel.
grid_predictions = grid.predict(test_features)
print(classification_report(test_labels, grid_predictions))

In [None]:
# Load the dev split and turn each tweet into an emotion-count vector.
# Requires the training cell to have run (uses the train-fitted `scaler`).
tweet_tokenizer = TweetTokenizer()
dev_data = clean_df("./data/dataset/twitter-2013dev-A.txt")
only_tweet_dev_data = dev_data['tweet']
dev_emotion_feature = get_feature(only_tweet_dev_data, tweet_tokenizer)

# Integer class ids, matching the training cell.
POLARITY_TO_ID = {"neutral": 0, "negative": 1, "positive": 2}
dev_label_ids = dev_data.polarity.map(POLARITY_TO_ID)
if dev_label_ids.isna().any():
    # An unrecognised polarity used to be skipped silently, leaving fewer
    # labels than feature rows; fail here with the offending values instead.
    unexpected = sorted(set(dev_data.polarity[dev_label_ids.isna()].astype(str)))
    raise ValueError(f"Unexpected polarity value(s) in dev data: {unexpected}")
dev_labels = dev_label_ids.to_numpy(dtype=int)

# Reuse the scaler fitted on the training features. Fitting a fresh scaler on
# the dev split would put dev features on a different scale from the one the
# classifiers were trained on.
dev_emotion_feature = scaler.transform(dev_emotion_feature)
dev_features = np.array(dev_emotion_feature)

print("dev labels: ", dev_labels)
print("dev features:", dev_features)
print("dev_labels shape: ", dev_labels.shape)
print("dev_features shape:", dev_features.shape)

In [None]:

# Linear-SVM runs evaluated on the dev split. Each entry is
# (C, {class id: sample weight}); classes not listed get weight 1.
# Class ids: 0 = neutral, 1 = negative, 2 = positive.
svm_runs = [
    (0.005, {1: 3.14}),
    (1, {1: 3.14}),
    (0.005, {1: 3.14, 2: 1.25}),
    (1, {1: 3.14, 2: 1.25}),
]

for c_value, class_weights in svm_runs:
    clf = SVC(kernel='linear', C=c_value, probability=True)
    sample_weight = np.array([class_weights.get(int(label), 1) for label in train_labels])
    clf.fit(train_features, train_labels, sample_weight=sample_weight)
    predictions = clf.predict(dev_features)

    # Rebuild the same report header text the individual runs printed.
    weight_lines = f"Negative weight = {class_weights[1]} \n"
    if 2 in class_weights:
        weight_lines += f"Positive weight = {class_weights[2]} \n"
    print(
        "A Classification Report showing the per-class Precision, Recall and F1-score \n\n"
        f"C={c_value} \n{weight_lines}\n",
        metrics.classification_report(dev_labels, predictions, target_names=['neutral', 'negative', 'positive']),
    )

# Cross-validated search over C and kernel on the training split, scored on
# the dev split.
param_grid = {'C': [0.005, 0.1, 0.5, 1],
              'kernel': ['linear', 'rbf']}

grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(train_features, train_labels)

print(grid.best_params_)
print(grid.best_estimator_)

grid_predictions = grid.predict(dev_features)
print(classification_report(dev_labels, grid_predictions))