In [1]:
import nltk
# Fetch the NLTK data packages for this notebook. Each call reports
# "already up-to-date" when the package is present locally.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')  # presumably the model behind nltk.pos_tag — TODO confirm
nltk.download('wordnet')       # corpus behind `nltk.corpus.wordnet`
nltk.download('sentiwordnet')  # corpus behind `nltk.corpus.sentiwordnet`

[nltk_data] Downloading package punkt to /home/akshala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/akshala/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/akshala/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /home/akshala/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

In [2]:
import pandas as pd
import regex as re
import numpy as np
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.tokenize import TweetTokenizer
from nltk.sentiment.vader import VaderConstants
from afinn import Afinn
from nltk.corpus import sentiwordnet as swn
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import pickle

In [3]:
# Module-level helpers shared by the feature functions in this notebook.
vader = SentimentIntensityAnalyzer()  # queried by vaderScore
tk = TweetTokenizer()  # tokenizer used by the lexicon scorers and by ngramFeatures

In [4]:
def preprocess(x):
    """Strip URLs and @-mentions from a raw text.

    Args:
        x: raw input text (str).

    Returns:
        The text with every matched span replaced by the empty string.
        Surrounding whitespace is not collapsed, so a removed token can
        leave two adjacent spaces behind.
    """
    # Raw strings keep `\.`, `\s` and `\S` as regex escapes; in a normal string
    # literal they are invalid string escapes that newer Python versions warn about.
    x = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '', x)  # removing website URLs
    x = re.sub(r'http\S+', '', x)   # any remaining run that starts with "http"
    x = re.sub(r'@[^\s]+', '', x)   # removing usernames
    return x  # preprocessed text is returned

In [5]:
def vaderScore(x):
    """Return the VADER polarity scores of a text as a flat list.

    x: preprocessed text

    The values are those of the dictionary returned by
    ``vader.polarity_scores(x)``, in that dictionary's own order.
    """
    scores_by_name = vader.polarity_scores(x)
    return [score for score in scores_by_name.values()]

In [6]:
def bing_liu_lexicon():
    """Read the Bing Liu positive/negative word lists from the working directory.

    Reads 'bing_liu_positive.txt' first and 'bing_liu_negative.txt' second,
    taking one word per line (surrounding whitespace stripped).

    Returns:
        dict mapping each word to 'pos' or 'neg'. A word listed in both files
        ends up 'neg' because the negative file is read last.

    Raises:
        OSError: if either file cannot be opened.
    """
    bing_liu = {}
    sources = (('bing_liu_positive.txt', 'pos'), ('bing_liu_negative.txt', 'neg'))
    for path, label in sources:
        # `with` closes each handle even on error; the original never closed them.
        with open(path, 'r') as lexicon_file:
            for line in lexicon_file:
                word = line.strip()
                if word:  # a blank line carries no word; do not register '' as a key
                    bing_liu[word] = label
    return bing_liu  # returns dictionary with words as keys and pos/neg as value

In [7]:
def mpqa_lexicon():
    """Read word polarities from 'mpqa_pos_neg.tff' in the working directory.

    Each line is split on whitespace into ``key=value`` fields. The word is
    the value of the third field. The prior polarity is the value of the sixth
    field, or of the seventh when the sixth holds no ``=`` (a stray token
    shifted the fields by one).

    Returns:
        dict mapping word -> 'pos' / 'neg' / 'neu'. Entries whose polarity is
        not positive, negative or neutral are left out.

    Raises:
        OSError: if the file cannot be opened.
    """
    labels = {'positive': 'pos', 'negative': 'neg', 'neutral': 'neu'}
    mpqa = {}
    # `with` guarantees the handle is closed; the original leaked it.
    with open('mpqa_pos_neg.tff', 'r') as lexicon_file:
        for line in lexicon_file:
            fields = line.split()
            if len(fields) < 6:
                continue  # blank or truncated line (the original raised IndexError here)
            # Explicit search instead of probing index 5 and catching IndexError.
            polarity_field = next((field for field in fields[5:7] if '=' in field), None)
            if polarity_field is None or '=' not in fields[2]:
                continue  # malformed entry: no usable polarity or word field
            word = fields[2].split('=')[1]
            label = labels.get(polarity_field.split('=')[1])  # get the priorpolarity
            if label is not None:
                mpqa[word] = label
    return mpqa    # returns dictionary with words as keys and pos/neg/neu as value

In [8]:
def bing_liu_polar_word_count(x):
    """Count how many tokens of a text are Bing Liu positive / negative words.

    x: preprocessed text

    Tokens are lowercased before lookup; tokens missing from the lexicon are
    ignored. Returns ``[positive_count, negative_count]``.
    """
    lexicon = bing_liu_lexicon()  # word -> 'pos' / 'neg'
    n_positive, n_negative = 0, 0
    for token in tk.tokenize(x):
        key = token.lower()
        if key not in lexicon:
            continue  # not a polar word
        if lexicon[key] == 'pos':
            n_positive += 1
        else:
            n_negative += 1
    return [n_positive, n_negative]

In [9]:
def mpqa_polar_word_count(x):
    """Count how many tokens of a text are MPQA positive / negative / neutral words.

    Args:
        x: preprocessed text.

    Returns:
        ``[positive_count, negative_count, neutral_count]``. Tokens are
        lowercased before lookup; tokens missing from the lexicon are ignored.
    """
    # Bug fix: the original loaded bing_liu_lexicon() here, so this feature
    # duplicated the Bing Liu counts and the neutral count was always 0.
    mpqa = mpqa_lexicon()   # dictionary with words as keys and pos/neg/neu as value
    positive_words = 0
    negative_words = 0
    neutral_words = 0
    for elt in tk.tokenize(x):
        label = mpqa.get(elt.lower())
        if label is None:
            continue  # word not in the lexicon
        if label == 'pos':
            positive_words += 1
        elif label == 'neg':
            negative_words += 1
        else:
            neutral_words += 1
    return [positive_words, negative_words, neutral_words]   # returns list of pos/neg/neu word counts

In [10]:
def sentiment140_lexicon():
    """Read word polarity scores from 'sentiment140_unigram.txt'.

    Each line is split on whitespace: the first field is the word and the
    second is parsed as its float score. Further fields are ignored.

    Returns:
        dict mapping word -> float polarity score.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a score field is not a valid float.
    """
    sentiment_score = {}
    # `with` closes the handle; the original left it open.
    with open('sentiment140_unigram.txt', 'r') as lexicon_file:
        for line in lexicon_file:
            fields = line.split()  # split() already drops empty strings
            if len(fields) < 2:
                continue  # blank or incomplete line (the original raised IndexError)
            sentiment_score[fields[0]] = float(fields[1])
    return sentiment_score  # dictionary with words as keys and polarity score as value

In [11]:
def sentiment140_polarity_score(x):
    """Sum the Sentiment140 scores of a text's tokens, split by sign.

    x: preprocessed text

    Tokens are looked up exactly as tokenized (no lowercasing). Returns
    ``[positive_total, negative_total]``; scores that are not greater than 0
    go into the second total.
    """
    lexicon = sentiment140_lexicon()
    positive_total, negative_total = 0, 0
    for token in tk.tokenize(x):
        if token not in lexicon:
            continue
        value = lexicon[token]
        if value > 0:
            positive_total += value
        else:
            negative_total += value
    return [positive_total, negative_total]

In [12]:
def afinn_polarity_score(x):
    """Sum the AFINN scores of a text's tokens, split by sign.

    x: preprocessed text

    Each token is lowercased and scored on its own with ``Afinn().score``.
    Returns ``[positive_total, negative_total]``; scores that are not greater
    than 0 go into the second total.
    """
    scorer = Afinn()
    positive_total, negative_total = 0, 0
    for token in tk.tokenize(x):
        value = scorer.score(token.lower())
        if value > 0:
            positive_total += value
        else:
            negative_total += value
    return [positive_total, negative_total]

In [13]:
def sentiword_polarity_score(x):
    """Sum SentiWordNet positive and negative scores over a text's tokens.

    x: preprocessed text

    Each token is POS-tagged. Only tags starting with N, J, R or V are kept
    and mapped to the WordNet noun / adjective / adverb / verb tag. The token
    is lemmatized with that tag, and the first synset of the lemma supplies
    the scores. Tokens without a lemma or without synsets are skipped.
    Returns ``[positive_total, negative_total]``.
    """
    first_letter_to_wordnet = {'N': wn.NOUN, 'J': wn.ADJ, 'R': wn.ADV, 'V': wn.VERB}
    wordnet_lemmatizer = WordNetLemmatizer()
    positive_total, negative_total = 0, 0
    for token, treebank_tag in pos_tag(tk.tokenize(x)):
        wordnet_pos = first_letter_to_wordnet.get(treebank_tag[:1])
        if wordnet_pos is None:
            continue  # tag outside noun/adjective/adverb/verb
        lemma = wordnet_lemmatizer.lemmatize(token, pos=wordnet_pos)
        if not lemma:
            continue
        candidates = wn.synsets(lemma, pos=wordnet_pos)
        if not candidates:
            continue
        senti = swn.senti_synset(candidates[0].name())  # first synset only
        positive_total += senti.pos_score()
        negative_total += senti.neg_score()
    return [positive_total, negative_total]

In [14]:
def NRC_hashtag_sentiment_lexicon():
    """Read term polarity scores from 'NRC_hashtag_sentiment_lexicon.txt'.

    Each line is split on whitespace: the first field is the term and the
    second is parsed as its float score. Further fields are ignored.

    Returns:
        dict mapping term -> float polarity score.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a score field is not a valid float.
    """
    hashtag = {}
    # `with` closes the handle; the original left it open.
    with open('NRC_hashtag_sentiment_lexicon.txt', 'r') as lexicon_file:
        for line in lexicon_file:
            fields = line.split()  # split() already drops empty strings
            if len(fields) < 2:
                continue  # blank or incomplete line (the original raised IndexError)
            hashtag[fields[0]] = float(fields[1])
    return hashtag       # dictionary with words as keys and polarity score as value

In [15]:
def polarity_score_hashtag(x):
    """Sum the NRC hashtag sentiment scores of a text's tokens, split by sign.

    x: preprocessed text

    Tokens are looked up exactly as tokenized (no lowercasing). Returns
    ``[positive_total, negative_total]``; scores that are not greater than 0
    go into the second total.
    """
    lexicon = NRC_hashtag_sentiment_lexicon()  # term -> polarity score
    positive_total, negative_total = 0, 0
    for token in tk.tokenize(x):
        if token not in lexicon:
            continue
        value = lexicon[token]
        if value > 0:
            positive_total += value
        else:
            negative_total += value
    return [positive_total, negative_total]

In [16]:
def NRC_emotion_lexicon():
    """Read word-emotion associations from 'NRC_word_emotion_lexicon.txt'.

    Each line is split on whitespace into word, emotion and a presence flag.
    The emotion is recorded for the word only when the flag is '1'.

    Returns:
        dict mapping word -> list of associated emotions. Every word seen in
        the file gets an entry, possibly an empty list.

    Raises:
        OSError: if the file cannot be opened.
    """
    nrc_emotions = {}
    # `with` closes the handle; the original left it open.
    with open('NRC_word_emotion_lexicon.txt', 'r') as lexicon_file:
        for line in lexicon_file:
            fields = line.split()  # split() already drops empty strings
            if len(fields) < 3:
                continue  # blank or incomplete line (the original raised IndexError)
            word, emotion, presence = fields[0], fields[1], fields[2]
            word_emotions = nrc_emotions.setdefault(word, [])  # entry exists even with no emotion
            if presence == '1':  # the emotion is associated with this word
                word_emotions.append(emotion)
    return nrc_emotions     # dictionary with words as keys and list of emotions associated with it

In [17]:
def emotion_word_count(x):
    """Count, per NRC emotion, the tokens of a text associated with it.

    x: preprocessed text

    Tokens are lowercased before lookup. Returns a list of 10 counts in the
    order anger, anticipation, disgust, fear, joy, negative, positive,
    sadness, surprise, trust.
    """
    emotion_order = ('anger', 'anticipation', 'disgust', 'fear', 'joy',
                     'negative', 'positive', 'sadness', 'surprise', 'trust')
    slot_of = {name: slot for slot, name in enumerate(emotion_order)}
    lexicon = NRC_emotion_lexicon()
    counts = [0] * 10
    for token in tk.tokenize(x):
        key = token.lower()
        if key not in lexicon:
            continue
        for emotion in lexicon[key]:
            counts[slot_of[emotion]] += 1  # one hit for each emotion tied to the word
    return counts

In [18]:
def NRC10_expanded_lexicon():
    """Read per-word emotion score vectors from 'NRC-10-expanded.csv'.

    The file is tab-separated. The first line is a header and is skipped.
    On every other line the first field is the word and the remaining fields
    are parsed as floats.

    Returns:
        dict mapping word -> list of float emotion scores.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a score field is not a valid float.
    """
    expanded_emotions = {}
    # `with` closes the handle; the original left it open.
    with open('NRC-10-expanded.csv', 'r') as lexicon_file:
        next(lexicon_file, None)    # ignore header row (no error on an empty file)
        for line in lexicon_file:
            # Strip the newline before splitting so a trailing tab cannot produce
            # an empty "score" field, and a blank line yields no fields at all.
            fields = [field for field in line.rstrip('\n').split('\t') if field]
            if not fields:
                continue
            expanded_emotions[fields[0]] = [float(value) for value in fields[1:]]   # convert to float
    return expanded_emotions   # dictionary with words as keys and emotion score as value

In [19]:
def emotion_score(x):
    """Sum the NRC-10 expanded emotion vectors of a text's tokens.

    Args:
        x: preprocessed text.

    Returns:
        numpy.ndarray of 10 floats: the element-wise sum of the score vectors
        of every lowercased token found in the lexicon. All zeros when no
        token matches.
    """
    expanded_emotions = NRC10_expanded_lexicon()
    # Start from an array so the return type does not depend on the input:
    # the original returned a plain list when nothing matched and an ndarray otherwise.
    emotion_scores = np.zeros(10)
    for elt in tk.tokenize(x):   # get list of words by tokenizing x
        vector = expanded_emotions.get(elt.lower())
        if vector is not None:
            emotion_scores = np.add(emotion_scores, vector)   # add emotion score of word to corresponding element
    return emotion_scores   # aggregate emotion scores

In [20]:
def NRC_hashtag_emotion_lexicon():
    """Read term-emotion scores from 'NRC_hashtag_emotion_lexicon.txt'.

    Each line is split on whitespace into emotion, term and score, in that
    order. The score is parsed as a float.

    Returns:
        dict mapping term -> list of ``(emotion, score)`` tuples in file order.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a score field is not a valid float.
    """
    hashtag = {}
    # `with` closes the handle; the original left it open.
    with open('NRC_hashtag_emotion_lexicon.txt', 'r') as lexicon_file:
        for line in lexicon_file:
            fields = line.split()  # split() already drops empty strings
            if len(fields) < 3:
                continue  # blank or incomplete line (the original raised IndexError)
            emotion, term, score = fields[0], fields[1], float(fields[2])
            hashtag.setdefault(term, []).append((emotion, score))
    return hashtag     # dictionary with word as key and list of tuples with emotion and score as values

In [21]:
def emotion_score_hashtag(x):
    """Sum NRC hashtag emotion scores per emotion over a text's tokens.

    x: preprocessed text

    Tokens are lowercased before lookup. Returns a list of 8 totals in the
    order anger, fear, anticipation, trust, surprise, sadness, joy, disgust.
    """
    emotion_order = ('anger', 'fear', 'anticipation', 'trust',
                     'surprise', 'sadness', 'joy', 'disgust')
    slot_of = {name: slot for slot, name in enumerate(emotion_order)}
    lexicon = NRC_hashtag_emotion_lexicon()
    totals = [0] * 8
    for token in tk.tokenize(x):
        key = token.lower()
        if key not in lexicon:
            continue
        for emotion, value in lexicon[key]:
            totals[slot_of[emotion]] += value  # accumulate the term's score for that emotion
    return totals

In [22]:
def affin_emoticon():
    """Read emoticon scores from 'affin_emoticon.txt'.

    Each line is split on whitespace: the first field is the emoticon and the
    second is parsed as its integer score.

    Returns:
        dict mapping emoticon -> int score.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a score field is not a valid integer.
    """
    emoticon = {}
    # `with` closes the handle; the original left it open.
    with open('affin_emoticon.txt', 'r') as lexicon_file:
        for line in lexicon_file:
            fields = line.split()  # split() already drops empty strings
            if len(fields) < 2:
                continue  # blank or incomplete line (the original raised IndexError)
            emoticon[fields[0]] = int(fields[1])
    return emoticon    # dictionary with emoticon as key and score as value

In [23]:
def emoticon_score(x):
    """Sum the emoticon scores of a text's tokens, split by sign.

    x: preprocessed text

    Tokens are looked up exactly as tokenized. Returns
    ``[positive_total, negative_total]``; scores that are not greater than 0
    go into the second total.
    """
    lexicon = affin_emoticon()
    positive_total, negative_total = 0, 0
    for token in tk.tokenize(x):
        if token not in lexicon:
            continue
        value = lexicon[token]
        if value > 0:
            positive_total += value
        else:
            negative_total += value
    return [positive_total, negative_total]

In [24]:
def negation_count(x):
    """Count the tokens of a text that NLTK's VADER constants flag as negations.

    Args:
        x: preprocessed text.

    Returns:
        int: number of tokens for which ``VaderConstants.negated`` is true
        when the lowercased token is passed on its own.
    """
    # Use a real instance: the original called the instance method on the class
    # itself and passed the class in as `self`.
    vd = VaderConstants()
    count = 0
    for elt in tk.tokenize(x):    # get list of words by tokenizing x
        if vd.negated([elt.lower()]):
            count += 1
    return count  # number of negation words

In [25]:
def ngramFeatures(preprocessed, curr, n, min_df):
    """Build a dense n-gram count matrix for a set of texts.

    preprocessed: list of texts the vocabulary is fitted on
    curr: list of texts to compute the n-gram features for
    n: the n of the n-gram (only n-grams of exactly this length are used)
    min_df: threshold passed to CountVectorizer as ``min_df``

    Returns the dense array with one row per text in ``curr``.
    """
    vectorizer = CountVectorizer(
        tokenizer=tk.tokenize,
        min_df=min_df,
        ngram_range=(n, n),
    )
    return vectorizer.fit(preprocessed).transform(curr).toarray()

In [26]:
def lexicon_features(df):
    """Add every lexicon-based feature column to a dataframe, in place.

    df: input dataframe with a 'Text' column

    'Text' is first replaced by its preprocessed form; each feature column is
    then computed from the preprocessed text. Nothing is returned.
    """
    df['Text'] = df['Text'].apply(preprocess)

    # (column name, feature function) pairs, applied in this order
    column_builders = [
        ('vader', vaderScore),
        ('bing_liu', bing_liu_polar_word_count),
        ('mpqa', mpqa_polar_word_count),
        ('sentiment140_polarity_score', sentiment140_polarity_score),
        ('afinn_polarity_score', afinn_polarity_score),
        ('sentiword_polarity_score', sentiword_polarity_score),
        ('polarity_score_hashtag', polarity_score_hashtag),
        ('emotion_count', emotion_word_count),
        ('emotion_score', emotion_score),
        ('emotion_score_hashtag', emotion_score_hashtag),
        ('emoticon_score', emoticon_score),
        ('negation_count', negation_count),
    ]
    for column, builder in column_builders:
        df[column] = df['Text'].apply(builder)

In [27]:
# Load the tab-separated anger training file and add the lexicon feature columns.
df_train = pd.read_csv('anger_train.tsv', sep='\t')
lexicon_features(df_train)  # modifies df_train in place; returns nothing

In [28]:
# Load the tab-separated anger test file and add the lexicon feature columns.
df_test = pd.read_csv('anger_test.tsv', sep='\t')
lexicon_features(df_test)  # modifies df_test in place; returns nothing

In [29]:
# Preprocessed texts of each split, plus their concatenation (train first, then test)
train_preprocessed = df_train['Text'].to_list()
test_preprocessed = df_test['Text'].to_list()
preprocessed = [*train_preprocessed, *test_preprocessed]

In [30]:
# Unigram (min_df=5) and bigram (min_df=2) count vectors, one array per row.
# The vocabulary is fitted on train + test texts together.
df_train = df_train.assign(
    unigram=list(ngramFeatures(preprocessed, train_preprocessed, 1, 5)),
    bigram=list(ngramFeatures(preprocessed, train_preprocessed, 2, 2)),
)
df_test = df_test.assign(
    unigram=list(ngramFeatures(preprocessed, test_preprocessed, 1, 5)),
    bigram=list(ngramFeatures(preprocessed, test_preprocessed, 2, 2)),
)



In [31]:
# Write the featurized anger frames to CSV in the working directory.
df_train.to_csv('features_anger_train.csv')
df_test.to_csv('features_anger_test.csv')

In [32]:
# Regression targets: the 'intensity' column of each split as Python floats
y_train = [float(value) for value in df_train['intensity']]

y_test = [float(value) for value in df_test['intensity']]

In [33]:
# Keep only the feature columns: drop the id, text, label and target columns
train_features = df_train.drop(columns=['S.No', 'Text', 'Emotion', 'intensity'])
test_features = df_test.drop(columns=['S.No', 'Text', 'Emotion', 'intensity'])

In [34]:
def get_x(feature_dict):
    """Flatten a mapping of mixed feature values into one float vector.

    feature_dict: mapping whose values are scalars, lists or numpy arrays
                  (anything exposing ``.items()``, e.g. a dataframe row)

    Array and list values contribute their elements in order; any other value
    contributes itself. Returns a 1-D numpy float array.
    """
    flat = []
    for _, value in feature_dict.items():
        if isinstance(value, np.ndarray):
            flat.extend(float(item) for item in value)  # numpy scalars -> Python floats
        elif isinstance(value, list):
            flat += value
        else:
            flat.append(value)
    return np.array(flat, dtype=float)

In [35]:
# Flatten every row's feature columns into a single vector
train_features['x_train'] = train_features.apply(get_x, axis=1)
test_features['x_test'] = test_features.apply(get_x, axis=1)

In [36]:
# Model inputs: one plain Python list per row
X_train = [list(row_vector) for row_vector in train_features['x_train']]

X_test = [list(row_vector) for row_vector in test_features['x_test']]

In [37]:
import scipy.stats

In [38]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)
# `with` closes the model file once the dump finishes; the bare open() call
# never closed the handle explicitly.
with open('anger_DecisionTree.sav', 'wb') as model_file:
    pickle.dump(regressor, model_file)   # saving model using pickle
y_pred = regressor.predict(X_test)

y_pred = np.array(y_pred)
y_pred.tofile('anger_pred_DT.csv', sep=',')   # predictions as comma-separated text

In [39]:
from sklearn.svm import SVR
regressor = SVR(kernel='rbf', C=1)
regressor.fit(X_train, y_train)
# `with` closes the model file once the dump finishes; the bare open() call
# never closed the handle explicitly.
with open('anger_SVM.sav', 'wb') as model_file:
    pickle.dump(regressor, model_file)   # saving model using pickle
y_pred = regressor.predict(X_test)

y_pred = np.array(y_pred)
y_pred.tofile('anger_pred_SVM.csv', sep=',')   # predictions as comma-separated text

In [40]:
from sklearn.neural_network import MLPRegressor
regressor = MLPRegressor(random_state=0, max_iter=500)
regressor.fit(X_train, y_train)
# `with` closes the model file once the dump finishes; the bare open() call
# never closed the handle explicitly.
with open('anger_MLP.sav', 'wb') as model_file:
    pickle.dump(regressor, model_file)   # saving model using pickle
y_pred = regressor.predict(X_test)

y_pred = np.array(y_pred)
y_pred.tofile('anger_pred_MLP.csv', sep=',')   # predictions as comma-separated text

In [41]:
# Load the tab-separated joy training file and add the lexicon feature columns.
# From here on df_train refers to the joy data, replacing the anger frame.
df_train = pd.read_csv('joy_train.tsv', sep='\t')
lexicon_features(df_train)  # modifies df_train in place; returns nothing

In [42]:
# Load the tab-separated joy test file and add the lexicon feature columns.
# From here on df_test refers to the joy data, replacing the anger frame.
df_test = pd.read_csv('joy_test.tsv', sep='\t')
lexicon_features(df_test)  # modifies df_test in place; returns nothing

In [43]:
# Preprocessed texts of each split, plus their concatenation (train first, then test)
train_preprocessed = df_train['Text'].to_list()
test_preprocessed = df_test['Text'].to_list()
preprocessed = [*train_preprocessed, *test_preprocessed]

In [44]:
# Unigram (min_df=5) and bigram (min_df=2) count vectors, one array per row.
# The vocabulary is fitted on train + test texts together.
df_train = df_train.assign(
    unigram=list(ngramFeatures(preprocessed, train_preprocessed, 1, 5)),
    bigram=list(ngramFeatures(preprocessed, train_preprocessed, 2, 2)),
)
df_test = df_test.assign(
    unigram=list(ngramFeatures(preprocessed, test_preprocessed, 1, 5)),
    bigram=list(ngramFeatures(preprocessed, test_preprocessed, 2, 2)),
)



In [45]:
# Write the featurized joy frames to CSV in the working directory.
df_train.to_csv('features_joy_train.csv')
df_test.to_csv('features_joy_test.csv')

In [46]:
# Regression targets: the 'intensity' column of each split as Python floats
y_train = [float(value) for value in df_train['intensity']]

y_test = [float(value) for value in df_test['intensity']]

In [47]:
# Keep only the feature columns: drop the id, text, label and target columns
train_features = df_train.drop(columns=['S.No', 'Text', 'Emotion', 'intensity'])
test_features = df_test.drop(columns=['S.No', 'Text', 'Emotion', 'intensity'])

In [48]:
# Flatten every row's feature columns into a single vector
train_features['x_train'] = train_features.apply(get_x, axis=1)
test_features['x_test'] = test_features.apply(get_x, axis=1)

In [49]:
# Model inputs: one plain Python list per row
X_train = [list(row_vector) for row_vector in train_features['x_train']]

X_test = [list(row_vector) for row_vector in test_features['x_test']]

In [50]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)
# `with` closes the model file once the dump finishes; the bare open() call
# never closed the handle explicitly.
with open('joy_DecisionTree.sav', 'wb') as model_file:
    pickle.dump(regressor, model_file)   # saving model using pickle
y_pred = regressor.predict(X_test)

y_pred = np.array(y_pred)
y_pred.tofile('joy_pred_DT.csv', sep=',')   # predictions as comma-separated text

In [51]:
from sklearn.svm import SVR
regressor = SVR(kernel='rbf', C=1)
regressor.fit(X_train, y_train)
# `with` closes the model file once the dump finishes; the bare open() call
# never closed the handle explicitly.
with open('joy_SVM.sav', 'wb') as model_file:
    pickle.dump(regressor, model_file)    # saving model using pickle
y_pred = regressor.predict(X_test)

y_pred = np.array(y_pred)
y_pred.tofile('joy_pred_SVM.csv', sep=',')   # predictions as comma-separated text

In [52]:
from sklearn.neural_network import MLPRegressor
regressor = MLPRegressor(random_state=0, max_iter=500)
regressor.fit(X_train, y_train)
# `with` closes the model file once the dump finishes; the bare open() call
# never closed the handle explicitly.
with open('joy_MLP.sav', 'wb') as model_file:
    pickle.dump(regressor, model_file)     # saving model using pickle
y_pred = regressor.predict(X_test)
y_pred = np.array(y_pred)
y_pred.tofile('joy_pred_MLP.csv', sep=',')   # predictions as comma-separated text