# Emotion Detection
## Data Preprocess

In [1]:
import nltk, re
from nltk import word_tokenize
from nltk.corpus.reader import XMLCorpusReader

In [2]:
reader = XMLCorpusReader("data", fileids=["affectivetext_trial.xml", "affectivetext_test.xml"])

In [3]:
reader.fileids()

['affectivetext_trial.xml', 'affectivetext_test.xml']

In [4]:
#train_headers = reader.xml('affectivetext_trial.xml')
train_headers = reader.xml('affectivetext_test.xml')
#test_headers = reader.xml('affectivetext_test.xml')
test_headers = reader.xml('affectivetext_trial.xml')
print("Train data: " + str(len(train_headers)))
print("Test data: " + str(len(test_headers)))

Train data: 1000
Test data: 250


In [13]:
def configVecData(data):
    for i, vec in enumerate(data):
        data[i] = [float(el) for el in vec[1:]]
    return data

def getVectors(filename):
    file = open(filename, 'r')
    data = [line.split(" ") for line in file.read().splitlines()]
    file.close()
    return configVecData(data)

#train_emot = getVectors("data/AffectiveText.trial/affectivetext_trial.emotions.gold")
#test_emot = getVectors("data/AffectiveText.test/affectivetext_test.emotions.gold")
test_emot = getVectors("data/AffectiveText.trial/affectivetext_trial.emotions.gold")
train_emot = getVectors("data/AffectiveText.test/affectivetext_test.emotions.gold")
print("Train vector: " + str(len(train_emot)))
print("Test vector: " + str(len(test_emot)))

Train vector: 1000
Test vector: 250


In [14]:
def labeldict():
    dictionary = {}
    labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise"]
    for i,emotion in enumerate(labels):
        dictionary[i] = emotion
    return dictionary

labels = labeldict()

In [15]:
def labeler(vector):
    out = list()
    for i, headline in enumerate(vector):
        emotion = headline.index(max(headline))
        out.append(emotion)
    return out

In [16]:
labels

{0: 'anger', 1: 'disgust', 2: 'fear', 3: 'joy', 4: 'sadness', 5: 'surprise'}

In [17]:
y_train = labeler(train_emot)
y_test = labeler(test_emot)

In [18]:
def countLabel(labels):
    dictionary = {}
    for x in labels:
        if x not in dictionary:
            dictionary[x] = 1
        else:
            dictionary[x] += 1
    return dictionary

countlabel = countLabel(y_train)

In [19]:
print("Frequency Table")
for key in countlabel.keys():
    print("{:<8}\t{}".format(labels[key], str(countlabel[key])))
    #print(labels[key] + "\t" + str(countlabel[key]))

Frequency Table
joy     	362
sadness 	202
fear    	160
surprise	184
anger   	66
disgust 	26


## Classification Algorithms
### Multinomial Naive Bayes

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.linear_model import SGDClassifier

import warnings
warnings.filterwarnings('ignore')
import pandas as pd

In [21]:
def crossValidation(algorithm, parameters, folds, X, y):
    gs = GridSearchCV(algorithm, parameters, cv=folds, iid=False, n_jobs=-1)
    gs = gs.fit(X, y)
    print("Best Score %.3f" % (gs.best_score_))
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, gs.best_params_[param_name]))
    return pd.DataFrame(gs.cv_results_)

In [22]:
def createData(headers):
    corpus = list()
    for i in headers.itertext():
        if i != "\n":
            corpus.append(i)
    return corpus

In [23]:
X_train = createData(train_headers)
#Mnaive_clf.fit(X_train, y_train)

In [24]:
import numpy as np
X_test = createData(test_headers) 
#predicted = Mnaive_clf.predict(X_test)
#np.mean(predicted == y_test)  

In [25]:
from sklearn.pipeline import Pipeline
Mnaive_clf = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB()),
 ])

#### Cross Validation for best parameters

In [26]:
parameters = {'vect__stop_words': (None, 'english'), 
              'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)], 
              'tfidf__use_idf': (True, False), 
              'clf__alpha': (1e-2, 1e-1, 1),
             }

In [27]:
result_nb = crossValidation(X=X_train, y=y_train, algorithm=Mnaive_clf, parameters=parameters, folds=5)

Best Score 0.487
clf__alpha: 0.1
tfidf__use_idf: True
vect__ngram_range: (1, 3)
vect__stop_words: 'english'


In [28]:
result_nb = result_nb[["rank_test_score", "param_tfidf__use_idf","param_clf__alpha","param_vect__ngram_range",
                              "param_vect__stop_words", "mean_test_score", "mean_train_score"]]

result_nb.columns = ["Rank", "tfidf", "clf alpha", "N-gram range", "Stop words", "Test Score",
                           "Train Score"]

In [29]:
sorted_result_nb = result_nb.sort_values(by="Rank")
sorted_result_nb[:5]

Unnamed: 0,Rank,tfidf,clf alpha,N-gram range,Stop words,Test Score,Train Score
21,1,True,0.1,"(1, 3)",english,0.486748,0.997752
31,2,False,0.1,"(1, 4)",english,0.483763,0.997752
27,3,False,0.1,"(1, 2)",english,0.483763,0.996252
29,4,False,0.1,"(1, 3)",english,0.483748,0.997251
19,5,True,0.1,"(1, 2)",english,0.483718,0.997003


### SVM

In [30]:
svm = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)),
 ])

In [31]:
parameters = {'vect__stop_words': (None, 'english'), 
              'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)], 
              'tfidf__use_idf': (True, False), 
              'clf__alpha': (1e-2, 1e-3),
              'clf__loss': ('log', 'hinge', 'squared_hinge')
             }

In [32]:
result_svm = crossValidation(X=X_train, y=y_train, algorithm=svm, folds=5, parameters=parameters)

Best Score 0.496
clf__alpha: 0.001
clf__loss: 'squared_hinge'
tfidf__use_idf: True
vect__ngram_range: (1, 4)
vect__stop_words: 'english'


In [33]:
result_svm = result_svm[["rank_test_score","param_clf__loss", "param_tfidf__use_idf","param_clf__alpha",
                         "param_vect__ngram_range","param_vect__stop_words", "mean_test_score", "mean_train_score"]]

result_svm.columns = ["Rank","Loss Function", "tfidf", "clf alpha", "N-gram range", "Stop words", "Test Score", "Train Score"]

In [34]:
sorted_result_svm = result_svm.sort_values(by="Rank")
sorted_result_svm[:5]

Unnamed: 0,Rank,Loss Function,tfidf,clf alpha,N-gram range,Stop words,Test Score,Train Score
87,1,squared_hinge,True,0.001,"(1, 4)",english,0.495734,0.997752
85,2,squared_hinge,True,0.001,"(1, 3)",english,0.494729,0.997752
79,3,hinge,False,0.001,"(1, 4)",english,0.490813,0.996002
93,4,squared_hinge,False,0.001,"(1, 3)",english,0.489778,0.997752
71,5,hinge,True,0.001,"(1, 4)",english,0.488743,0.996002


## Regression Models
### Linear Regression

For regression models we need a larger train data set so we use 1000 lenght test dataset as train and smaller train set as test dataset.

In [35]:
from sklearn.linear_model import LinearRegression
linear_reg = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('reg', LinearRegression()),
 ])

In [36]:
parameters = {'vect__stop_words': (None, 'english'), 
              'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)], 
              'tfidf__use_idf': (True, False),
              'reg__normalize': (True, False)
             }

In [37]:
result_lreg = crossValidation(X=X_test, algorithm=linear_reg, folds=5, parameters=parameters, y=test_emot)

Best Score 0.189
reg__normalize: True
tfidf__use_idf: True
vect__ngram_range: (1, 2)
vect__stop_words: None


In [38]:
result_lreg = result_lreg[["rank_test_score", "param_tfidf__use_idf","param_reg__normalize","param_vect__ngram_range",
                           "param_vect__stop_words", "mean_test_score", "mean_train_score"]]

result_lreg.columns = ["Rank", "tfidf", "Normalize", "N-gram range", "Stop words", "Test Score", "Train Score"]

In [39]:
sorted_result_lreg = result_lreg.sort_values(by="Rank")

In [40]:
sorted_result_lreg[:5]

Unnamed: 0,Rank,tfidf,Normalize,N-gram range,Stop words,Test Score,Train Score
2,1,True,True,"(1, 2)",,0.188755,0.997348
4,2,True,True,"(1, 3)",,0.18728,0.997348
10,3,False,True,"(1, 2)",,0.185108,0.997348
6,4,True,True,"(1, 4)",,0.183634,0.997348
5,5,True,True,"(1, 3)",english,0.182627,0.997348


### Logistic Regression

In [41]:
from sklearn.linear_model import LogisticRegression
log_reg = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('reg', LogisticRegression(solver='lbfgs', multi_class='multinomial', n_jobs=-1, random_state=0, )),
 ])

In [42]:
parameters = {'vect__stop_words': (None, 'english'), 
              'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)], 
              'tfidf__use_idf': (True, False),
              'reg__C': (1e-1, 1e-2, 1),
             }

In [43]:
result_log = crossValidation(X=X_train, y=y_train, algorithm=log_reg, folds=5, parameters=parameters)

Best Score 0.453
reg__C: 1
tfidf__use_idf: False
vect__ngram_range: (1, 1)
vect__stop_words: 'english'


In [44]:
result_log = result_log[["rank_test_score", "param_tfidf__use_idf","param_reg__C", "param_vect__ngram_range",
                         "param_vect__stop_words", "mean_test_score", "mean_train_score"]]

result_log.columns = ["Rank", "tfidf", "C", "N-gram range", "Stop words", "Test Score", "Train Score"]

In [45]:
sorted_result_log = result_log.sort_values(by="Rank")
sorted_result_log[:5]

Unnamed: 0,Rank,tfidf,C,N-gram range,Stop words,Test Score,Train Score
41,1,False,1,"(1, 1)",english,0.452927,0.847495
33,2,True,1,"(1, 1)",english,0.446846,0.862996
43,3,False,1,"(1, 2)",english,0.442851,0.878248
32,4,True,1,"(1, 1)",,0.441996,0.841252
45,5,False,1,"(1, 3)",english,0.437891,0.886998


### Perceptron

In [46]:
from sklearn.linear_model import Perceptron
perc = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('reg', Perceptron(n_jobs=-1, random_state=0)),
 ])

In [47]:
parameters = {'vect__stop_words': (None, 'english'), 
              'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)], 
              'tfidf__use_idf': (True, False),
              'reg__tol': (1e-3, 1e-2, 1e-1),
             }

In [48]:
result_perc = crossValidation(X=X_train, y=y_train, algorithm=perc, folds=5, parameters=parameters)

Best Score 0.458
reg__tol: 0.1
tfidf__use_idf: True
vect__ngram_range: (1, 1)
vect__stop_words: None


In [49]:
result_perc = result_perc[["rank_test_score", "param_tfidf__use_idf","param_reg__tol", "param_vect__ngram_range",
                           "param_vect__stop_words", "mean_test_score", "mean_train_score"]]

result_perc.columns = ["Rank", "tfidf", "Tolerance", "N-gram range", "Stop words", "Test Score", "Train Score"]

In [50]:
sorted_result_perc = result_perc.sort_values(by="Rank")
sorted_result_perc[:5]

Unnamed: 0,Rank,tfidf,Tolerance,N-gram range,Stop words,Test Score,Train Score
32,1,True,0.1,"(1, 1)",,0.457715,0.997003
16,2,True,0.01,"(1, 1)",,0.454715,0.997001
3,3,True,0.001,"(1, 2)",english,0.451625,0.997752
0,4,True,0.001,"(1, 1)",,0.45072,0.997502
35,5,True,0.1,"(1, 2)",english,0.44962,0.997502


## Testing

In [51]:
testing = Pipeline([
     ('vect', CountVectorizer(stop_words='english', ngram_range=(1,4))),
     ('tfidf', TfidfTransformer(use_idf=False)),
     ('reg', Perceptron(n_jobs=-1, random_state=0, tol=0.1)),
 ])

In [52]:
testing.fit(X=X_train, y=y_train)
testing.score(X=X_test, y=y_test)

0.432