In [1]:
import pandas as pd

path_emotion_file = "isear.csv"

# read csv
df_file = pd.read_csv(path_emotion_file, error_bad_lines=False,
                      warn_bad_lines=False, sep='|', encoding='latin1')

df = df_file[['Field1', 'SIT']]
df.head()

Unnamed: 0,Field1,SIT
0,joy,"During the period of falling in love, each tim..."
1,fear,When I was involved in a traffic accident.
2,anger,When I was driving home after several days of...
3,sadness,When I lost the person who meant the most to me.
4,disgust,The time I knocked a deer down - the sight of ...


In [2]:
df.shape

(7503, 2)

In [3]:
df.Field1.unique() # target values

array(['joy', 'fear', 'anger', 'sadness', 'disgust', 'shame', 'guilt'],
      dtype=object)

In [4]:
df.groupby('Field1').size() # target value sizes

Field1
anger      1070
disgust    1058
fear       1079
guilt      1070
joy        1065
sadness    1080
shame      1081
dtype: int64

In [5]:
df = df[df['Field1'].str.contains('anger|disgust|fear|joy|sadness|guilt|shame')]
df = df[~df['SIT'].str.contains('\[')]

In [6]:
df.groupby('Field1').size() # target value sizes

Field1
anger      1045
disgust    1029
fear       1064
guilt      1030
joy        1048
sadness    1051
shame      1038
dtype: int64

In [7]:
import numpy as np
raw_data = np.array(df['SIT'].values)

In [8]:
len(raw_data)

7305

In [9]:
import re
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords

def preprocess_text(text, stemming=False, stop_word=False):
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL', text)
    text = re.sub('@[^\s]+','USER', text)
    text = text.lower().replace("ё", "е")
    text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
    text = re.sub(' +',' ', text)
    text = text.strip()
    
    # This part for stemming.
    if stemming:
        stemmer = LancasterStemmer()
        text = text.split()
        text = " ".join([stemmer.stem(w) for w in text])
    
    # This part for removing stopwords.
    if stop_word:
        text = text.split()
        stop_words = set(stopwords.words('english'))
        text = " ".join([w for w in text if not w in stop_words])
    return text

In [10]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

def printresult(clf):
    for mean, std, params in zip(clf.cv_results_['mean_test_score'], 
                             clf.cv_results_['std_test_score'], 
                             clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print()
    
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    print(classification_report(y_test, clf.predict(x_test), digits=2))
    print()
    print("Best parameters:")
    print(clf.best_params_)
    
def crossValidation(algorithm, parameters, folds, X, y):
    gs = GridSearchCV(algorithm, parameters, cv=folds, scoring='f1_macro')
    gs = gs.fit(X, y)
    print("Best Score %.3f" % (gs.best_score_))
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, gs.best_params_[param_name]))
    result = pd.DataFrame(gs.cv_results_)
    return result

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2]
}

from sklearn.metrics import classification_report

### TEST 1: No Stemming, No Stopwords

In [11]:
data = [preprocess_text(t) for t in raw_data] # Without stemming.
x_train, x_test, y_train, y_test = train_test_split(data, df.Field1.values, test_size=0.33, random_state=42)

result00 = crossValidation(text_clf, tuned_parameters, 10, x_train, y_train)

result00 = result00[["rank_test_score", "param_tfidf__use_idf","param_clf__alpha","param_vect__ngram_range", 
                "mean_test_score", "mean_train_score"]]
result00 = result00.sort_values(by="rank_test_score")

result00.head()

Best Score 0.572
clf__alpha: 0.1
tfidf__norm: 'l2'
tfidf__use_idf: False
vect__ngram_range: (1, 2)


Unnamed: 0,rank_test_score,param_tfidf__use_idf,param_clf__alpha,param_vect__ngram_range,mean_test_score,mean_train_score
22,1,False,0.1,"(1, 2)",0.571831,0.978656
19,2,True,0.1,"(1, 2)",0.570454,0.991232
7,3,True,1.0,"(1, 2)",0.565405,0.951801
28,4,False,0.01,"(1, 2)",0.563874,0.985326
34,5,False,0.01,"(1, 2)",0.562267,0.992067


###  TEST 2: With Stemming, no Stopwords

In [12]:
data = [preprocess_text(t, stemming=True) for t in raw_data] # With stemming.
x_train, x_test, y_train, y_test = train_test_split(data, df.Field1.values, test_size=0.33, random_state=42)

result01 = crossValidation(text_clf, tuned_parameters, 10, x_train, y_train)

result01 = result01[["rank_test_score", "param_tfidf__use_idf","param_clf__alpha","param_vect__ngram_range", 
                "mean_test_score", "mean_train_score"]]
result01 = result01.sort_values(by="rank_test_score")

result01.head()

Best Score 0.582
clf__alpha: 0.1
tfidf__norm: 'l2'
tfidf__use_idf: False
vect__ngram_range: (1, 2)


Unnamed: 0,rank_test_score,param_tfidf__use_idf,param_clf__alpha,param_vect__ngram_range,mean_test_score,mean_train_score
22,1,False,0.1,"(1, 2)",0.581987,0.974217
19,2,True,0.1,"(1, 2)",0.574952,0.988726
7,3,True,1.0,"(1, 2)",0.574103,0.940241
28,4,False,0.01,"(1, 2)",0.570544,0.982176
34,5,False,0.01,"(1, 2)",0.565046,0.990607


### TEST 3: With Stopwords, no Stemming

In [13]:
data = [preprocess_text(t, stop_word=True) for t in raw_data] # With stemming.
x_train, x_test, y_train, y_test = train_test_split(data, df.Field1.values, test_size=0.33, random_state=42)

result10 = crossValidation(text_clf, tuned_parameters, 10, x_train, y_train)

result10 = result10[["rank_test_score", "param_tfidf__use_idf","param_clf__alpha","param_vect__ngram_range", 
                "mean_test_score", "mean_train_score"]]
result10 = result10.sort_values(by="rank_test_score")

result10.head()

Best Score 0.561
clf__alpha: 1
tfidf__norm: 'l2'
tfidf__use_idf: True
vect__ngram_range: (1, 2)


Unnamed: 0,rank_test_score,param_tfidf__use_idf,param_clf__alpha,param_vect__ngram_range,mean_test_score,mean_train_score
7,1,True,1.0,"(1, 2)",0.561337,0.973153
22,2,False,0.1,"(1, 2)",0.555281,0.987875
10,3,False,1.0,"(1, 2)",0.554263,0.918542
16,4,False,0.1,"(1, 2)",0.547978,0.959239
9,5,False,1.0,"(1, 1)",0.546783,0.78689


### TEST 4: With Stemming and Stopwords

In [14]:
data = [preprocess_text(t, True, True) for t in raw_data] # With stemming.
x_train, x_test, y_train, y_test = train_test_split(data, df.Field1.values, test_size=0.33, random_state=42)

result11 = crossValidation(text_clf, tuned_parameters, 10, x_train, y_train)

result11 = result11[["rank_test_score", "param_tfidf__use_idf","param_clf__alpha","param_vect__ngram_range", 
                "mean_test_score", "mean_train_score"]]
result11 = result11.sort_values(by="rank_test_score")

result11.head()

Best Score 0.565
clf__alpha: 1
tfidf__norm: 'l2'
tfidf__use_idf: False
vect__ngram_range: (1, 2)


Unnamed: 0,rank_test_score,param_tfidf__use_idf,param_clf__alpha,param_vect__ngram_range,mean_test_score,mean_train_score
10,1,False,1.0,"(1, 2)",0.565379,0.899977
7,2,True,1.0,"(1, 2)",0.565161,0.967421
22,3,False,0.1,"(1, 2)",0.5646,0.986142
9,4,False,1.0,"(1, 1)",0.557815,0.745058
16,5,False,0.1,"(1, 2)",0.555431,0.944892


We get our results. And test with (1,2) ngram range, alpha 0.1, with stopwords and stemming.

In [15]:
test = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', MultinomialNB(alpha=0.1))])

data = [preprocess_text(t, stemming=True, stop_word=True) for t in raw_data] # With stemming.
x_train, x_test, y_train, y_test = train_test_split(data, df.Field1.values, test_size=0.33, random_state=5)

test.fit(x_train, y_train)

predicted = test.predict(x_test)
print("Score of test is:")
print("%.3f"%(np.mean(predicted == y_test)))

Score of test is:
0.560


### Most effective 10 words for each class

In [16]:
def show_top10(classifier, vectorizer, categories):
    feature_names = np.asarray(vectorizer.get_feature_names())
    for i, category in enumerate(categories):
        top10 = np.argsort(classifier.coef_[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))


show_top10(test.steps[2][1], test.steps[0][1], test.classes_)

anger: person told tim moth becaus thi wer hav angry friend
disgust: som person felt disgust someon man felt peopl friend saw disgust
fear: tim bef would wer afraid hom alon car fear night
joy: happy tim year univers exam first aft got pass friend
sadness: grandmoth year dea rel aft fath felt sad died friend
shame: feel som thi day felt asham tim hav felt asham friend
