In [107]:
import numpy as np
import pandas as pd

reading data

In [108]:
# The file is read with header=None, so the columns get integer labels
# (0, 1, 2) rather than names taken from the first row.
emotion_df = pd.read_csv(
    "1-P-3-ISEAR.csv",
    header=None,
)
# Preview the first rows of the freshly loaded frame.
emotion_df.head()

Unnamed: 0,0,1,2
0,0,joy,On days when I feel close to my partner and ot...
1,1,fear,Every time I imagine that someone I love or I ...
2,2,anger,When I had been obviously unjustly treated and...
3,3,sadness,When I think about the short time that we live...
4,4,disgust,At a gathering I found myself involuntarily si...


Adding names to the columns and dropping the serial-number column

In [109]:
# Name the three positional columns, then discard the serial-number column.
# The guard plus rebinding (instead of drop(..., inplace=True)) makes this
# cell safe to re-run: a second execution no longer fails while trying to
# assign three names to a frame that already has only two columns.
# An unexpected column layout still raises, exactly as before.
if list(emotion_df.columns) != ['Target', 'Sentence']:
    emotion_df.columns = ['sn', 'Target', 'Sentence']
    emotion_df = emotion_df.drop(columns='sn')

In [110]:
emotion_df.head()

Unnamed: 0,Target,Sentence
0,joy,On days when I feel close to my partner and ot...
1,fear,Every time I imagine that someone I love or I ...
2,anger,When I had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...


In [111]:
emotion_df['Target'].value_counts()

joy        1082
sadness    1074
anger      1069
fear       1063
disgust    1059
shame      1059
guilt      1040
Name: Target, dtype: int64

The classes look fairly balanced.

### Text Preprocessing

converting all sentences to lower case

In [112]:
def lowercase(text):
    '''Return ``text`` with every character converted to lower case.'''
    return text.lower()

emotion_df['Sentence'] = emotion_df['Sentence'].apply(lowercase)

Removing punctuation and numbers from the sentences

In [113]:
import string

def remove_punct_num(text):
    '''Drop punctuation marks and digit characters from ``text``.

    A character is discarded when it appears in ``string.punctuation`` or
    when ``str.isdigit`` is true for it. Nothing is inserted in place of a
    removed character, so the neighbouring characters end up adjacent.
    '''
    punctuation_marks = set(string.punctuation)
    kept_chars = []
    for ch in text:
        if ch in punctuation_marks or ch.isdigit():
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)


emotion_df['Sentence'] = emotion_df['Sentence'].apply(remove_punct_num)

In [114]:
emotion_df.head()

Unnamed: 0,Target,Sentence
0,joy,on days when i feel close to my partner and ot...
1,fear,every time i imagine that someone i love or i ...
2,anger,when i had been obviously unjustly treated and...
3,sadness,when i think about the short time that we live...
4,disgust,at a gathering i found myself involuntarily si...


removing all stopwords

In [115]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/araul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [116]:

# Lazily filled cache so the English stopword list is built only once per
# kernel session instead of once per word.
_STOPWORD_CACHE = {}


def remove_stopwords(text, stop_words=None):
    """
    Remove stopwords from a whitespace-separated sentence.

    Parameters
    ----------
    text : str
        Sentence to filter. It is split on whitespace.
    stop_words : collection of str, optional
        Words to drop. When omitted, the English list from
        ``stopwords.words('english')`` is used; it is converted to a
        frozenset and cached on first use.

    Returns
    -------
    str
        The remaining words joined by single spaces.

    Notes
    -----
    The earlier version evaluated ``stopwords.words('english')`` inside the
    comprehension condition, i.e. once for every word of every sentence,
    and tested membership against a list. Caching a set gives the same
    output with far less work.
    """
    if stop_words is None:
        if 'english' not in _STOPWORD_CACHE:
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']
    return ' '.join(w for w in text.split() if w not in stop_words)

emotion_df['Sentence'] = emotion_df['Sentence'].apply(remove_stopwords)

In [117]:
emotion_df.head()

Unnamed: 0,Target,Sentence
0,joy,days feel close partner friends feel peace als...
1,fear,every time imagine someone love could contact ...
2,anger,obviously unjustly treated possibility elucida...
3,sadness,think short time live relate periods life thin...
4,disgust,gathering found involuntarily sitting next two...


Lemmatization, i.e. changing words into their root form

In [118]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /home/araul/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [119]:
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    """
    Lemmatize every whitespace-separated word of ``text``.

    Each word is passed to the module-level ``lemmatizer`` with its default
    arguments and the results are joined back with single spaces.

    NOTE(review): the preview output still shows forms such as "treated"
    and "sitting" after this step, so verbs appear not to be reduced with
    the default settings - confirm whether POS-aware lemmatization is wanted.
    """
    return ' '.join(map(lemmatizer.lemmatize, text.split()))

emotion_df['Sentence'] = emotion_df['Sentence'].apply(lemmatize)

In [120]:
emotion_df.head()

Unnamed: 0,Target,Sentence
0,joy,day feel close partner friend feel peace also ...
1,fear,every time imagine someone love could contact ...
2,anger,obviously unjustly treated possibility elucida...
3,sadness,think short time live relate period life think...
4,disgust,gathering found involuntarily sitting next two...


Splitting the data

In [121]:
from sklearn.model_selection import train_test_split
# Features are the cleaned sentences; labels are the emotion names.
X, y = emotion_df['Sentence'], emotion_df['Target']
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)


TFIDF 

It is a technique for transforming text into a meaningful vector of numbers.
TFIDF penalizes words that come up too often and don't really carry much information.
So it rescales the frequency of common words, which makes the scoring more balanced.

In [122]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features, with document-frequency cut-offs
# (min_df=2, max_df=0.5) to prune very rare and very common terms.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.5,
)
# Fit on the training split only, then reuse the fitted vectorizer
# unchanged for the test split so no test information leaks into it.
train_tfidf = tfidf.fit_transform(X_train)
test_tfidf = tfidf.transform(X_test)

### Model Building

In [123]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [124]:
# Logistic regression with default settings, fitted on the TF-IDF features.
lr = LogisticRegression().fit(train_tfidf, y_train)
# Displayed result: (score on the training split, score on the test split).
tuple(
    lr.score(features, labels)
    for features, labels in ((train_tfidf, y_train), (test_tfidf, y_test))
)

(0.8608126259234385, 0.561744966442953)

In [125]:
# Multinomial naive Bayes with default settings, fitted on the same features.
nb = MultinomialNB().fit(train_tfidf, y_train)
# Displayed result: (score on the training split, score on the test split).
tuple(
    nb.score(features, labels)
    for features, labels in ((train_tfidf, y_train), (test_tfidf, y_test))
)

(0.8398253861652115, 0.5604026845637584)

Test prediction on an arbitrary sentence

In [126]:
def preprocess_sentence(text):
    """Run one raw sentence through the same cleaning steps as the training text.

    The steps are applied in the order used earlier in the notebook:
    lowercase -> remove_punct_num -> remove_stopwords -> lemmatize.
    """
    for step in (lowercase, remove_punct_num, remove_stopwords, lemmatize):
        text = step(text)
    return text


raw_sentences = ['i am very disappointed at you']
# The vectorizer was fitted on cleaned sentences, so new input has to be
# cleaned the same way before it is transformed; otherwise capitalised,
# punctuated or inflected input would be vectorized differently from the
# training data. The raw text keeps its own name so `test_sentence` only
# ever holds the TF-IDF matrix.
test_sentence = tfidf.transform([preprocess_sentence(s) for s in raw_sentences])

lr.predict(test_sentence)


array(['sadness'], dtype=object)