# Data Information

*  Context

Collection of documents and its emotions

*  Example

i feel sick;sadness

*  Goal

When do people use positive emojis when they are texting?

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import math
import mlxtend
import sklearn.cluster as cluster
import sklearn.neighbors
import sklearn.metrics as metrics
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix, roc_curve, auc, precision_score, recall_score, f1_score, precision_recall_curve
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize 
from nltk.tokenize import RegexpTokenizer

import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load and explore the data
#### The data is available at this source, and you can learn more about how and why this dataset is created from this paper.
Data source: https://www.kaggle.com/praveengovi/emotions-dataset-for-nlp 

In [73]:
# Each split is a headerless ';'-separated file holding a sentence and its emotion.
column_names = ['sentence', 'emotion']
train_data, test_data, val_data = (
    pd.read_csv(f'../input/emotions-dataset-for-nlp/{split}.txt', names=column_names, header=None, sep=';')
    for split in ('train', 'test', 'val')
)
# Stack the three splits into one frame (the original row indices are kept here).
df = pd.concat([train_data, test_data, val_data])
print('Total data:', df.shape)

In [4]:
# Null Check
# A notebook cell only displays its last expression, so the per-split counts
# are collected into one table (rows: columns of the data, columns: splits)
# instead of evaluating three expressions and showing only the final one.
pd.DataFrame({
    'train': train_data.isnull().sum(),
    'test': test_data.isnull().sum(),
    'val': val_data.isnull().sum(),
})

In [74]:
# Keep the first occurrence of every fully identical row, then renumber the rows from 0.
is_repeated_row = df.duplicated(keep="first").to_numpy()
df = df[~is_repeated_row]
df_reidx = df.reset_index(drop=True)
df_reidx.shape

In [75]:
# Convert the emotions to binary labels: joy and love are 1 (positive);
# sadness, anger, fear and surprise are 0 (negative).
emotion_to_label = {'joy': 1, 'love': 1,
                    'sadness': 0, 'anger': 0, 'fear': 0, 'surprise': 0}

# Fail early on an emotion that has no mapping: with `replace` such a value
# would stay a string inside the label column and only break later on.
unmapped_emotions = set(df_reidx['emotion'].unique()) - set(emotion_to_label)
if unmapped_emotions:
    raise ValueError("Unmapped emotion values: %s" % sorted(map(str, unmapped_emotions)))

# map + astype(int) yields an integer label column regardless of how the
# installed pandas version infers the dtype of a replaced object column.
df_reidx['label'] = df_reidx['emotion'].map(emotion_to_label).astype(int)

In [7]:
# Count how many sentences carry the positive (1) and the negative (0) label
df_reidx['label'].value_counts()

In [8]:
# Sentence length measured in characters, followed by its summary statistics
df_reidx['length'] = df_reidx['sentence'].map(len)
df_reidx['length'].describe()

In [76]:
# Show the last five rows of the de-duplicated, re-indexed frame
df_reidx.tail()

# Text Preprocessing
#### To clean the sentences, we apply text preprocessing.

*   Decontracted
*   Data cleaning
Additionally,
*   Spell check
*   Lemmatization
*   Normalization





In [119]:
from tqdm import tqdm
import re
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

def decontracted(phrase):
    """
    Expand apostrophe-less contractions (e.g. "cant", "don t", "im") into full words.

    Every pattern is anchored on word boundaries so only whole tokens are
    rewritten. Without the anchors, ordinary words that merely contain a
    contraction were corrupted ("positive " -> "positi have ", "him " ->
    "hi am ", "significant" -> "signifi can not", "in the" -> "i nothe").

    Parameters
    ----------
    phrase : str
        Lower-case text whose apostrophes are missing or replaced by spaces.

    Returns
    -------
    str
        The text with the known contractions expanded; other words are untouched.
    """
    # specific: whole-token contractions and their expansions
    whole_word_expansions = (
        ("wont", "will not"),
        ("wouldnt", "would not"),
        ("shouldnt", "should not"),
        ("couldnt", "could not"),
        ("cudnt", "could not"),
        ("cant", "can not"),
        ("dont", "do not"),
        ("doesnt", "does not"),
        ("didnt", "did not"),
        ("wasnt", "was not"),
        ("werent", "were not"),
        ("havent", "have not"),
        ("hadnt", "had not"),
        # the old pattern r"\dunno" meant "a digit followed by 'unno'" and never matched
        ("dunno", "do not know"),
        ("ive", "i have"),
        ("im", "i am"),
        ("i m", "i am"),
        # irregular stems must be handled before the generic "<stem>n t" rule below,
        # which would otherwise produce "wo not" / "ca not"
        ("won t", "will not"),
        ("can t", "can not"),
    )
    for contraction, expansion in whole_word_expansions:
        phrase = re.sub(r"\b" + contraction + r"\b", expansion, phrase)

    # general: "<stem>n t" -> "<stem> not"; the trailing \b requires "t" to be a
    # token of its own, so "in the" is left alone
    phrase = re.sub(r"(?<=\w)n t\b", " not", phrase)
    # stand-alone remnants of 's, 'd, 'll and the abbreviation "w"
    phrase = re.sub(r"\bs\b", "is", phrase)
    phrase = re.sub(r"\bd\b", "would", phrase)
    phrase = re.sub(r"\bll\b", "will", phrase)
    phrase = re.sub(r"\bw\b", "with", phrase)

    return phrase

    
def clean_text(df):
    """
    Clean the raw sentences of a DataFrame.

    Steps, in order: strip HTML markup, drop URLs and e-mail addresses,
    expand contractions, and replace every non-alphabetic character by a space.
    URLs and e-mails are removed *before* the non-alphabetic filter: once
    ':', '/', '.' and '@' have been blanked out their patterns can no longer
    match, which made both removals no-ops in the previous ordering.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'sentence' column of strings.

    Returns
    -------
    list of str
        One cleaned string per input row, in the same order.
    """
    cleaned_review = []

    for review_text in tqdm(df['sentence']):

        # remove html tags
        review_text = BeautifulSoup(review_text, 'lxml').get_text().strip()

        # remove urls (replaced by a space so neighbouring words are not glued together)
        review_text = re.sub(r'https?://\S+|www\.\S+', ' ', review_text)

        # remove e-mail addresses anywhere in the text (the old pattern was
        # anchored with ^...$ and only matched a sentence that was nothing but an address)
        review_text = re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", ' ', review_text)

        # expand the contracted words
        review_text = decontracted(review_text)

        # remove non-alphabetic characters; this also removes all punctuation,
        # so no separate punctuation pass is needed
        review_text = re.sub("[^a-zA-Z]", " ", review_text)

        cleaned_review.append(review_text)

    return cleaned_review

# Store the cleaned text in a new column next to the raw sentence and preview the frame
df_reidx['cleaned_sentence'] = clean_text(df_reidx)
df_reidx.head()

###### Additional stop-word removal and stemming

In [120]:
def remove_stopwords(phrase, keep=()):
    """
    Tokenize each text and drop NLTK's English stop words.

    Parameters
    ----------
    phrase : iterable of str
        Cleaned sentences.
    keep : iterable of str, optional
        Stop words that should NOT be removed. Empty by default, which
        reproduces the previous behaviour. NLTK's English list contains
        negations such as "not" and "no", so pass e.g. keep=("not", "no")
        to preserve them for sentiment analysis.

    Returns
    -------
    list of list of str
        The remaining tokens of every sentence, in input order.

    Notes
    -----
    word_tokenize relies on NLTK's punkt tokenizer data being available.
    """
    # A set gives constant-time membership tests; the list used before was
    # scanned once for every token of every sentence.
    stop_words = set(stopwords.words('english')) - set(keep)

    remove_sw = []
    for review_text in tqdm(phrase):
        tokens = [word for word in word_tokenize(review_text) if word not in stop_words]
        remove_sw.append(tokens)
    return remove_sw

# Replace each cleaned sentence by its list of tokens with the stop words removed
df_reidx['cleaned_sentence'] = remove_stopwords(df_reidx['cleaned_sentence'])
df_reidx.head()

In [121]:
#stemming for extract the actual meaning of the words
from nltk.stem import PorterStemmer

def stemming(phrase):
    """
    Reduce every token to its Porter stem.

    `phrase` is an iterable of token lists; the result is a list holding one
    list of stemmed tokens per input entry, in the same order.
    """
    porter = PorterStemmer()
    return [[porter.stem(token) for token in tokens] for tokens in tqdm(phrase)]

# Replace every token list by its stemmed counterpart and preview the first rows
df_reidx['cleaned_sentence'] = stemming(df_reidx['cleaned_sentence'])
df_reidx['cleaned_sentence'].head()

In [122]:
def to_sentence(phrase):
    """
    Join each token list back into one space-separated string.

    `phrase` is an iterable of token lists; returns a list of strings in input order.
    """
    return [" ".join(tokens) for tokens in tqdm(phrase)]
# Turn the token lists back into plain strings, the input format the vectorizer is given below
df_reidx['cleaned_sentence']=to_sentence(df_reidx['cleaned_sentence'])
df_reidx['cleaned_sentence'].head()

# Feature Engineering

### CountVectorizer: tokenization

In [123]:
# convert the cleaned sentences to vectors of token counts
# NOTE(review): the vectorizer is fitted on every row of df_reidx in this cell, i.e. before
# any train/test split - confirm that building the vocabulary from the full corpus is intended.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
# a built-in stop word list for english is used
# all values of n such that min_n <= n <= max_n will be used. (1,1): only unigrams, (1,2): uni- and bigrams, (2,2): only bigrams
# max_df: when building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold.
# min_df: ignore terms that have a document frequency strictly lower than the given threshold.

vectorizer = CountVectorizer(stop_words='english', max_df=0.5, min_df=3, ngram_range=(1,1),tokenizer = token.tokenize)
x = vectorizer.fit_transform(df_reidx.cleaned_sentence)
y = df_reidx.label.values

print("X.shape : ",x.shape)
print("y.shape : ",y.shape)

# Train Test split

```
# This is formatted as code
```



In [125]:
# Shuffle and stratify on the label so that the training and the test set keep
# the same share of positive and negative sentences; shuffling alone does not
# guarantee an equal class split.
# random_state fixes the split so that re-running the cell gives the same result.
train_idx, test_idx = train_test_split(np.arange(df_reidx.shape[0]), test_size=0.3, shuffle=True,
                                       random_state=42, stratify=y)

x_train = x[train_idx]
y_train = y[train_idx]

x_test = x[test_idx]
y_test = y[test_idx]
print("Number of training examples:{}".format(len(train_idx)))
print("Number of testing examples:{}\n".format(len(test_idx)))
print("Training data: X_train : {}, y_train : {}".format(x_train.shape, y_train.shape))
print("Testing data: X_test : {}, y_test : {}".format(x_test.shape, y_test.shape))


In [126]:
# (number of training sentences, number of vocabulary terms)
x_train.shape

# Model Training

#### Logistic Regression

In [127]:
# Train a logistic regression classifier with default settings (fit returns the estimator itself)
lr_clf = LogisticRegression().fit(x_train, y_train)

# Hard labels and class probabilities for the held-out sentences
y_pred_test_lr = lr_clf.predict(x_test)
y_predprob_lr = lr_clf.predict_proba(x_test)

# Rows: true label, columns: predicted label
matrix_lr = confusion_matrix(y_test, y_pred_test_lr)
print(matrix_lr)

In [128]:
# Per-class precision, recall and F1 of the logistic regression predictions on the test split
print(classification_report(y_test, y_pred_test_lr))

In [129]:
# Share of test sentences that the logistic regression model labels correctly
print("Accuracy for Logistic Regression model:",metrics.accuracy_score(y_test, y_pred_test_lr))

#### Naive Bayes classifier

##### BernoulliNB

A binary algorithm used when the feature is present or not.

In [130]:
from sklearn.naive_bayes import BernoulliNB
# Train a Bernoulli naive Bayes classifier with default settings (fit returns the estimator itself)
nb_clf = BernoulliNB().fit(x_train, y_train)

# Hard labels and class probabilities for the held-out sentences
y_pred_test_nb = nb_clf.predict(x_test)
y_predprob_nb = nb_clf.predict_proba(x_test)

# Rows: true label, columns: predicted label
matrix_nb = confusion_matrix(y_test, y_pred_test_nb)
print(matrix_nb)

In [132]:
# Class probabilities of the Bernoulli NB model for the test sentences (one column per class)
y_predprob_test_nb = nb_clf.predict_proba(x_test)
y_predprob_test_nb

In [133]:
# Per-class precision, recall and F1 of the Bernoulli NB predictions on the test split
print(classification_report(y_test, y_pred_test_nb))
# micro average: computed globally from the total true positives, false negatives and false positives of all classes
# macro average: unweighted mean of the per-label scores

In [134]:
from sklearn import metrics
# Share of test sentences that the Bernoulli NB model labels correctly
print("Accuracy for Bernouli Naive Bayes model:",metrics.accuracy_score(y_test, y_pred_test_nb))

##### MultinomialNB

It considers a feature vector in which each term is represented by the number of times it appears, i.e. its frequency.

In [135]:
# Train a multinomial naive Bayes classifier with default settings (fit returns the estimator itself)
mnb = MultinomialNB().fit(x_train, y_train)

# Hard labels and class probabilities for the held-out sentences
y_pred_test_mnb = mnb.predict(x_test)
y_predprob_mnb = mnb.predict_proba(x_test)

# Rows: true label, columns: predicted label
matrix = confusion_matrix(y_test, y_pred_test_mnb)
print(matrix)

In [136]:
# Per-class precision, recall and F1 of the multinomial NB predictions on the test split
print(classification_report(y_test, y_pred_test_mnb))

In [137]:
# Share of test sentences that the multinomial NB model labels correctly
print("Accuracy for multinominal Naive Bayes model:",metrics.accuracy_score(y_test, y_pred_test_mnb))

# Model Evaluation

# Cross validation

##### Logistic Regression

In [138]:
# n-fold cross validation
# 5-fold cross-validated precision of the logistic regression classifier on the full feature matrix x;
# cross_val_score fits a fresh clone per fold, so one precision value per fold is printed
scores_lr = cross_val_score(lr_clf, x, y, cv=5, scoring='precision')
print(scores_lr)

##### BernouliNB

In [139]:
# 5-fold cross-validated precision of the Bernoulli NB classifier on the full feature matrix x
scores_nb = cross_val_score(nb_clf, x, y, cv=5, scoring='precision')
print(scores_nb)

##### MultinominaliNB

In [140]:
# 5-fold cross-validated precision of the multinomial NB classifier on the full feature matrix x
scores_mnb = cross_val_score(mnb, x, y, cv=5, scoring='precision')
print(scores_mnb)

##### ROC Curve

##### Logistic Regression

In [141]:
from sklearn.metrics import roc_curve, auc

# ROC points of the logistic regression model, using the probability of the positive class (column 1)
fpr, tpr, thresholds = roc_curve(y_test, y_predprob_lr[:,1], pos_label=1)
roc_auc = auc(fpr, tpr)  # area under the ROC curve

lw = 2  # line width
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # diagonal reference line
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC (Receiver operating characteristic) curve')
ax.legend(loc="lower right")
plt.show()

##### BernouliNB

In [142]:
from sklearn.metrics import roc_curve, auc

# ROC points of the Bernoulli NB model, using the probability of the positive class (column 1)
fpr, tpr, thresholds = roc_curve(y_test, y_predprob_nb[:,1], pos_label=1)
roc_auc = auc(fpr, tpr)  # area under the ROC curve

lw = 2  # line width
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # diagonal reference line
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC (Receiver operating characteristic) curve')
ax.legend(loc="lower right")
plt.show()

##### MultinominaliNB

In [143]:
from sklearn.metrics import roc_curve, auc

# ROC points of the multinomial NB model, using the probability of the positive class (column 1)
fpr, tpr, thresholds = roc_curve(y_test, y_predprob_mnb[:,1], pos_label=1)
roc_auc = auc(fpr, tpr)  # area under the ROC curve

lw = 2  # line width
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # diagonal reference line
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC (Receiver operating characteristic) curve')
ax.legend(loc="lower right")
plt.show()

### Precision Recall Curve

##### Logistic Regression

In [144]:
from sklearn.metrics import average_precision_score
# Precision-recall curve of the logistic regression model (probability of the positive class).
# The scores are passed positionally because the `probas_pred` keyword is deprecated in newer scikit-learn.
precision, recall, thresholds = precision_recall_curve(y_test, y_predprob_lr[:,1], pos_label=1)
# The legend must report this model's own average precision; it previously used the multinomial NB scores.
avg_precision_lr = average_precision_score(y_test, y_predprob_lr[:,1])
# Line width is given literally so the cell does not depend on `lw` from the ROC cells.
plt.plot(recall, precision, color='darkorange', lw=2, label='Average precision recall score: %0.2f' % avg_precision_lr)

plt.title('Precision recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

##### BernouliNB

In [145]:
from sklearn.metrics import average_precision_score
# Precision-recall curve of the Bernoulli NB model (probability of the positive class).
# The scores are passed positionally because the `probas_pred` keyword is deprecated in newer scikit-learn.
precision, recall, thresholds = precision_recall_curve(y_test, y_predprob_nb[:,1], pos_label=1)
# The legend must report this model's own average precision; it previously used the multinomial NB scores.
avg_precision_nb = average_precision_score(y_test, y_predprob_nb[:,1])
# Line width is given literally so the cell does not depend on `lw` from the ROC cells.
plt.plot(recall, precision, color='darkorange', lw=2, label='Average precision recall score: %0.2f' % avg_precision_nb)

plt.title('Precision recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

##### MultinominaliNB

In [146]:
from sklearn.metrics import average_precision_score
# Precision-recall curve of the multinomial NB model (probability of the positive class).
# The scores are passed positionally because the `probas_pred` keyword is deprecated in newer scikit-learn.
precision, recall, thresholds = precision_recall_curve(y_test, y_predprob_mnb[:,1], pos_label=1)
avg_precision_mnb = average_precision_score(y_test, y_predprob_mnb[:,1])
# Line width is given literally so the cell does not depend on `lw` from the ROC cells.
plt.plot(recall, precision, color='darkorange', lw=2, label='Average precision recall score: %0.2f' % avg_precision_mnb)

plt.title('Precision recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

###### Logistic Regression

In [147]:
# Summary metrics of the logistic regression model on the test split.
# Precision, recall and F1 are macro averages, i.e. the unweighted mean over both classes.
acc_score_lr = metrics.accuracy_score(y_test, y_pred_test_lr)
prec_score_lr = precision_score(y_test,y_pred_test_lr, average='macro')
recall_lr = recall_score(y_test, y_pred_test_lr,average='macro')
# F1 must be computed from the logistic regression predictions; it previously
# used the Bernoulli NB predictions (y_pred_test_nb) by mistake.
f1_lr = f1_score(y_test,y_pred_test_lr,average='macro')
matrix_lr = confusion_matrix(y_test,y_pred_test_lr)
print('Logistic Regression Model\n')
print(str('Accuracy: '+'{:04.2f}'.format(acc_score_lr*100))+'%')
print(str('Precision: '+'{:04.2f}'.format(prec_score_lr*100))+'%')
print(str('Recall: '+'{:04.2f}'.format(recall_lr*100))+'%')
print('F1 Score: ',f1_lr)
print(matrix_lr)

##### BernouliNB

In [148]:
# Summary metrics of the Bernoulli NB model on the test split (macro-averaged precision, recall and F1)
acc_score_nb = metrics.accuracy_score(y_pred_test_nb, y_test)
prec_score_nb = precision_score(y_test, y_pred_test_nb, average='macro')
recall_nb = recall_score(y_test, y_pred_test_nb, average='macro')
f1_nb = f1_score(y_test, y_pred_test_nb, average='macro')
matrix_nb = confusion_matrix(y_test, y_pred_test_nb)

print('Bernouli Naive Bayes Model\n')
print(f"Accuracy: {acc_score_nb * 100:04.2f}%")
print(f"Precision: {prec_score_nb * 100:04.2f}%")
print(f"Recall: {recall_nb * 100:04.2f}%")
print('F1 Score: ', f1_nb)
print(matrix_nb)

##### MultinominaliNB

In [149]:
# Summary metrics of the multinomial NB model on the test split (macro-averaged precision, recall and F1)
acc_score_mnb = metrics.accuracy_score(y_pred_test_mnb, y_test)
prec_score_mnb = precision_score(y_test, y_pred_test_mnb, average='macro')
recall_mnb = recall_score(y_test, y_pred_test_mnb, average='macro')
f1_mnb = f1_score(y_test, y_pred_test_mnb, average='macro')
matrix_mnb = confusion_matrix(y_test, y_pred_test_mnb)

print('Multimominal Naive Bayes Model\n')
print(f"Accuracy: {acc_score_mnb * 100:04.2f}%")
print(f"Precision: {prec_score_mnb * 100:04.2f}%")
print(f"Recall: {recall_mnb * 100:04.2f}%")
print('F1 Score: ', f1_mnb)
print(matrix_mnb)

# Grid Search for parameter tuning

##### Logistic Regression parameter tuning

In [150]:
from sklearn.model_selection import GridSearchCV  #predefined hyperparameters and fit your estimator (model) on your training set.
from sklearn.pipeline import Pipeline

vector = CountVectorizer(stop_words='english')
# The default 'lbfgs' solver does not support the 'l1' penalty, so the 'l1'
# candidates of the grid below could never be fitted. 'liblinear' supports
# both 'l1' and 'l2', which makes the whole grid valid.
logistic = LogisticRegression(solver='liblinear')

# build a pipeline: raw (cleaned) text -> token counts -> classifier
pipe = Pipeline(steps = [
       ('vectorizer', vector),
       ('classifier', logistic)])

# create a dictionary of model parameters and corresponding values
# For example, in countvectorizer, we want to explore the suitable value for min_df, select from 1,3,5,10
param_grid = {
    'vectorizer__min_df': [1, 3, 5, 10],
    'vectorizer__max_df': [0.7, 0.8, 0.9],
    'classifier__penalty': ['l1','l2']}

# run GridSearchCV with 5-fold cross validation, scored by F1
search_result = GridSearchCV(pipe, param_grid, cv=5, scoring='f1').fit(df_reidx.cleaned_sentence.values, df_reidx.label.values)

print("Best parameter (CV score=%0.3f):" % search_result.best_score_) # mean cross-validated F1 of the best candidate (scoring='f1')
print(search_result.best_params_)
# The original run (made before the solver fix above) reported a best 5-fold F1 score of 0.958.

##### Bernoulli Naive Bayes parameter tuning

In [151]:
vector = CountVectorizer(stop_words='english')
bernoulli_nb = BernoulliNB()

# build a pipeline: raw (cleaned) text -> token counts -> classifier
pipe = Pipeline(steps = [
       ('vectorizer', vector),
       ('classifier', bernoulli_nb)])
# alpha is the additive smoothing strength. alpha=0.0 means "no smoothing":
# depending on the scikit-learn version it is either silently clipped to a tiny
# value or used as is, which puts log(0) terms into the model. A small positive
# value is searched instead.
parameters = {'vectorizer__min_df': [1, 3, 5, 10],
    'vectorizer__max_df': [0.7, 0.8, 0.9],
    'classifier__alpha':[0.01, 0.1, 1.0, 2.0, 10.0]}

# run GridSearchCV with 5-fold cross validation, scored by F1
search_result = GridSearchCV(pipe, parameters, cv=5, scoring='f1').fit(df_reidx.cleaned_sentence.values, df_reidx.label.values)

print("Best parameter (CV score=%0.3f):" % search_result.best_score_)
print(search_result.best_params_)
# The original run (with alpha=0.0 in the grid) reported a best 5-fold F1 score of 0.943.

##### Multinomial Naive Bayes parameter tuning

In [152]:
vector = CountVectorizer(stop_words='english')
multinomial_nb = MultinomialNB()

# build a pipeline: raw (cleaned) text -> token counts -> classifier
pipe = Pipeline(steps = [
       ('vectorizer', vector),
       ('classifier', multinomial_nb)])
# alpha is the additive smoothing strength. alpha=0.0 means "no smoothing":
# depending on the scikit-learn version it is either silently clipped to a tiny
# value or used as is, which puts log(0) terms into the model. A small positive
# value is searched instead.
parameters = {'vectorizer__min_df': [1, 3, 5, 10],
    'vectorizer__max_df': [0.7, 0.8, 0.9],
    'classifier__alpha':[0.01, 0.1, 1.0, 2.0, 10.0]}

# run GridSearchCV with 5-fold cross validation, scored by F1
search_result = GridSearchCV(pipe, parameters, cv=5, scoring='f1').fit(df_reidx.cleaned_sentence.values, df_reidx.label.values)

print("Best parameter (CV score=%0.3f):" % search_result.best_score_)
print(search_result.best_params_)
# The original run (with alpha=0.0 in the grid) reported a best 5-fold F1 score of 0.942.

# Explain the model prediction

The Multinomial Naive Bayes model has higher accuracy than the Bernoulli Naive Bayes model, and the Logistic Regression model has the highest accuracy of the three, so I chose the Logistic Regression model to inspect the predictions.

In [153]:
# Rows of the test split, in the same order as x_test / y_pred_test_lr.
# .copy() makes this an independent frame, so adding a column does not write
# into a slice of df_reidx (pandas' SettingWithCopy situation).
# Note: from here on `test_data` no longer refers to the test.txt frame loaded at the top.
test_data = df_reidx.iloc[test_idx].copy()
test_data['pred_label'] = y_pred_test_lr
# first two test sentences with their true and predicted label
test_data.head(2)[['sentence','label','pred_label']]

In [154]:
# First two test sentences whose predicted label differs from the true label
misclassified = test_data[test_data['label'] != test_data['pred_label']]
misclassified[['sentence','label','pred_label']].head(2)

##### Predicted features of logistic regression model

In [155]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back to the old name otherwise.
get_feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
# Map every vocabulary term to its logistic regression coefficient (rounded to 3 decimals).
feature_to_coef = {word: float("%.3f" % coef) for word, coef in zip(get_feature_names(), lr_clf.coef_[0])}

print("Top positive features:")
sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:10]

In [156]:
# The ten features with the smallest values in feature_to_coef (ascending sort);
# the author observed that most of these words are reliable evidence of negative sentiment
print("Top negative features:")
sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=False)[:10]

##### Predicted features of BernouliNB

In [157]:
# `coef_` was deprecated for the naive Bayes estimators in scikit-learn 0.24 and
# removed in 1.1. It also only held the log-probabilities of the positive class,
# so sorting by it ranked frequent words rather than discriminative ones.
# The log-odds between the two classes (row 1 = label 1, row 0 = label 0) are
# positive for words that favour the positive class and negative otherwise.
get_feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
log_odds = nb_clf.feature_log_prob_[1] - nb_clf.feature_log_prob_[0]
feature_to_coef = {word: float("%.3f" % coef) for word, coef in zip(get_feature_names(), log_odds)}

print("Top positive features:")
sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:10]

In [158]:
# The ten features with the smallest values in feature_to_coef (ascending sort)
print("Top negative features:")
sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=False)[:10]

##### Predicted features of multinomial NB

In [159]:
# `coef_` was deprecated for the naive Bayes estimators in scikit-learn 0.24 and
# removed in 1.1. It also only held the log-probabilities of the positive class,
# so sorting by it ranked frequent words rather than discriminative ones.
# The log-odds between the two classes (row 1 = label 1, row 0 = label 0) are
# positive for words that favour the positive class and negative otherwise.
get_feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
log_odds = mnb.feature_log_prob_[1] - mnb.feature_log_prob_[0]
feature_to_coef = {word: float("%.3f" % coef) for word, coef in zip(get_feature_names(), log_odds)}

print("Top positive features:")
sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:10]

In [160]:
# The ten features with the smallest values in feature_to_coef (ascending sort)
print("Top negative features:")
sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=False)[:10]

# Conclusion

In [161]:
text=['i want to make this positive', 'i want to make this project better', 'i feel aaaaaaah']
# The vectorizer's vocabulary was built from cleaned, stop-word-filtered and
# stemmed sentences, so new text has to go through the same steps before
# transform(); an unstemmed word has no column in a stemmed vocabulary.
text_df = pd.DataFrame({'sentence': text})
prepared_text = to_sentence(stemming(remove_stopwords(clean_text(text_df))))
test_result = lr_clf.predict(vectorizer.transform(prepared_text))
print(test_result)

The Multinomial Naive Bayes model scores slightly higher than the Bernoulli Naive Bayes model.
The Logistic Regression model is the best of the three classifiers because it has the highest accuracy throughout this project. Also, the top features of the logistic regression model are the most accurate and reasonable compared to the other classifiers. Comparing the three classifiers, the logistic regression model has roughly three to four percentage points higher accuracy than the others.

According to the Logistic Regression Model prediction, when people talk about their feeling that they are accepted, supported, and beloved, they use positive emojis when they are texting. On the other hand, when people feel pressured, hated, or punished, they use negative emojis when texting. 
