In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import nltk
from nltk.corpus import stopwords
import re

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import plot_confusion_matrix, classification_report

# Data Processing and Visualization

In [None]:
# The three dataset splits share one layout: ';'-separated fields, with the
# column names supplied here rather than read from the file.
split_format = {'sep': ';', 'names': ['text', 'emotion']}

train = pd.read_csv('../input/emotions-dataset-for-nlp/train.txt', **split_format)
test = pd.read_csv('../input/emotions-dataset-for-nlp/test.txt', **split_format)
val = pd.read_csv('../input/emotions-dataset-for-nlp/val.txt', **split_format)

In [None]:
# Work on a copy so the `train` frame as loaded stays untouched.
df = train.copy()
# Preview the first rows.
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the working frame.
df.info()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
df[df.duplicated()]

In [None]:
# Number of rows per emotion label, most frequent first.
df['emotion'].value_counts()

In [None]:
# Bar chart of how many rows carry each emotion label.
# The column is named via `x=` / `data=` because, from seaborn 0.12 on, the
# only valid positional argument is `data`; passing the vector positionally
# no longer means "plot this on x".
sns.countplot(x='emotion', data=df, color='grey');

In [None]:
def clean_text(data):
    """Normalise one raw text string before vectorisation.

    The text is lower-cased, every run of non-alphanumeric characters is
    replaced by a single space, the result is tokenised with NLTK, English
    stop words are dropped and the remaining tokens are lemmatised with the
    WordNet lemmatiser.

    Parameters
    ----------
    data : str
        Raw text of a single document.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (an empty string when
        no token survives).
    """
    # The pattern used to start with a stray '/', so it only matched a literal
    # slash followed by non-alphanumerics and punctuation was never stripped.
    text = re.sub('[^a-zA-Z0-9]+', ' ', data.lower())

    # Build the stop-word collection once per call, as a set, instead of
    # rebuilding the full list for every single token.
    stop_words = set(stopwords.words('english'))
    lemma = nltk.WordNetLemmatizer()

    return ' '.join(
        lemma.lemmatize(word)
        for word in nltk.word_tokenize(text)
        if word not in stop_words
    )

In [None]:
# Run clean_text over every text in the working frame, one row at a time.
preprocessed_text = df['text'].apply(clean_text)

In [None]:
# Keep the cleaned text next to the raw text as a new column.
df['cleaned_text'] = preprocessed_text

In [None]:
# Distinct emotion labels present in the working frame.
df['emotion'].unique()

In [None]:
# Integer code for each emotion name. A label missing from this mapping raises
# KeyError. The resulting `labels` column is not referenced by the later cells
# visible here, which train on the string labels directly.
emot_label = {
    'sadness': 1,
    'anger': 2,
    'love': 3,
    'surprise': 4,
    'fear': 5,
    'joy': 6,
}
df['labels'] = df['emotion'].apply(lambda emotion: emot_label[emotion])

In [None]:
# Preview the frame again, now with the added columns.
df.head()

# Splitting the data

In [None]:
# Hold out 25% of the working frame for evaluation (fixed seed for repeatability).
# Despite the X_ prefix, X_train_names / X_test_names are the targets: they
# hold the emotion label for each text in X_train / X_test.
X_train, X_test, X_train_names, X_test_names = train_test_split(df['cleaned_text'], df['emotion'],
                                                                test_size = 0.25, random_state = 1)

In [None]:
# Display the cleaned training texts.
X_train

In [None]:
# Report the size of each split; the *_names variables hold the emotion labels.
# The messages used to misspell "shape" as "shpae".
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'X_train_names shape: {X_train_names.shape}')
print(f'X_test_names shape: {X_test_names.shape}')

# Feature Extraction

In [None]:
# Build TF-IDF features: the vocabulary and IDF weights are fitted on the
# training split only, then reused unchanged to transform the held-out split.
# max_df=0.6 ignores terms found in more than 60% of the documents; min_df is
# given as a (tiny) proportion of documents rather than an absolute count.
# NOTE(review): tv_train_features / tv_test_features are not referenced by the
# later cells visible here, which vectorise inside their own pipelines.
tfidf = TfidfVectorizer(use_idf=True, max_df = 0.6, min_df=0.00001)
tv_train_features = tfidf.fit_transform(X_train.astype('U'))

tv_test_features = tfidf.transform(X_test.astype('U'))

print('Features shape =>', f'\n train feature shape is: {tv_train_features.shape}', f'\n test features shape is: {tv_test_features.shape}')

# Classification

In [None]:
# TF-IDF vectoriser followed by a linear SVM, tuned as one pipeline so the
# vectoriser is re-fitted inside every cross-validation fold.
svm_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                        ('svm', LinearSVC(random_state=1))])

# Compare unigrams against unigrams + bigrams. The grid used to list (1, 1)
# twice, which only repeated the same fits and never tried bigrams.
param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)],
              'svm__C': [1e-5, 1e-4, 1e-2, 1e-1, 1]
}

# 5-fold cross-validated search over every combination in param_grid.
grid_svm = GridSearchCV(svm_pipeline, param_grid=param_grid, cv=5, verbose=2)
# fit() returns the fitted search object, so gs_svm and grid_svm are the same object.
gs_svm = grid_svm.fit(X_train.astype('U'), X_train_names)

In [None]:
# Full parameter set of the best pipeline found by the search.
gs_svm.best_estimator_.get_params()

In [None]:
# Score the best pipeline on the 25% hold-out split.
best_svm_score = gs_svm.score(X_test.astype('U'), X_test_names)
# The value is scaled to a percentage, so print the '%' unit, as the
# MultinomialNB report does.
print(f'Score of the LinearSVM model is: {best_svm_score * 100}%')

**Multinomial Naive Bayes Classification**

In [None]:
# TF-IDF vectoriser followed by Multinomial Naive Bayes, tuned as one pipeline.
mnb_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                        ('mnb', MultinomialNB())])

# Search over n-gram range and the smoothing strength alpha.
param_grid_mnb = {'tfidf__ngram_range': [(1, 1), (1, 2)],
              'mnb__alpha': [1e-5, 1e-4, 1e-2, 1e-1, 1]
}

grid_mnb = GridSearchCV(mnb_pipeline, param_grid=param_grid_mnb, cv=5, verbose=2)
# Fit the MultinomialNB search itself. This used to call grid_svm.fit, which
# left grid_mnb unfitted and made gs_mnb just another handle on the SVM
# search, so every "MNB" result downstream was really the SVM's.
gs_mnb = grid_mnb.fit(X_train, X_train_names)

In [None]:
# Score gs_mnb on the 25% hold-out split and print it as a percentage.
best_score_mnb = gs_mnb.score(X_test, X_test_names)
print(f'Score of the MultinomialNB model is: {best_score_mnb * 100}%')

# Evaluation

In [None]:
# Apply the same clean_text preprocessing to the separate test file so it
# matches what the models were trained on.
test['cleaned_text'] = test['text'].apply(clean_text)

**LinearSVM Classifier**

In [None]:
# Tabulate the cross-validation outcome of every parameter combination in the
# search, best-ranked first.
svm_results = gs_svm.cv_results_

# Widen the column display so the params dictionaries are not cut off.
pd.set_option('display.max_colwidth', 100)

results_df = (
    pd.DataFrame({
        'rank': svm_results['rank_test_score'],
        'params': svm_results['params'],
        'cv score (mean)': svm_results['mean_test_score'],
        'cv score (std)': svm_results['std_test_score'],
    })
    .sort_values(by='rank')
)
results_df

In [None]:
# Predicted emotion for every document in the separate test file.
svm_predict = gs_svm.predict(test['cleaned_text'])

# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2 (ConfusionMatrixDisplay.from_estimator replaces it); it is kept here to
# match the import at the top of the notebook.

# Raw counts. The axes are created explicitly and handed to the plotter: with
# ax=None it opens its own figure, so the earlier plt.figure(figsize=...) call
# only produced an empty figure and the size was never applied.
fig, ax = plt.subplots(figsize=(16, 10))
plot_confusion_matrix(estimator=gs_svm, X=test['cleaned_text'], y_true=test['emotion'],
                      xticks_rotation='vertical', cmap='Greys', ax=ax, include_values=True)
ax.set_title('Test-set confusion matrix (counts)')
plt.show()

# Row-normalised view (each true class sums to 1), which is easier to read
# when classes differ in size. The second plot used to pass the model's own
# predictions as y_true, which can only ever give a perfect diagonal.
fig, ax = plt.subplots(figsize=(16, 10))
plot_confusion_matrix(estimator=gs_svm, X=test['cleaned_text'], y_true=test['emotion'],
                      normalize='true', xticks_rotation='vertical', cmap='Greys', ax=ax,
                      include_values=True)
ax.set_title('Test-set confusion matrix (row-normalised)')
plt.show()

In [None]:
# Per-class precision, recall and F1 of svm_predict against the test labels.
print(classification_report(test['emotion'], svm_predict))

**MultinomialNB Classification**

In [None]:
# Tabulate the cross-validation outcome of every parameter combination held by
# gs_mnb, best-ranked first.
mnb_results = gs_mnb.cv_results_

# Widen the column display so the params dictionaries are not cut off.
pd.set_option('display.max_colwidth', 100)

results_df_mnb = (
    pd.DataFrame({
        'rank': mnb_results['rank_test_score'],
        'params': mnb_results['params'],
        'mnb score (mean)': mnb_results['mean_test_score'],
        'mnb score (std)': mnb_results['std_test_score'],
    })
    .sort_values(by='rank')
)
results_df_mnb

In [None]:
# Predicted emotion for every document in the separate test file.
mnb_predict = gs_mnb.predict(test['cleaned_text'])

# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2 (ConfusionMatrixDisplay.from_estimator replaces it); it is kept here to
# match the import at the top of the notebook.

# Raw counts. The axes are created explicitly and handed to the plotter: with
# ax=None it opens its own figure, so the earlier plt.figure(figsize=...) call
# only produced an empty figure and the size was never applied.
fig, ax = plt.subplots(figsize=(16, 10))
plot_confusion_matrix(estimator=gs_mnb, X=test['cleaned_text'], y_true=test['emotion'],
                      xticks_rotation='vertical', cmap='Greys', ax=ax, include_values=True)
ax.set_title('Test-set confusion matrix (counts)')
plt.show()

# Row-normalised view (each true class sums to 1). The second plot used to
# pass estimator=gs_svm with y_true=mnb_predict, i.e. it compared one model's
# predictions against another's instead of against the true labels.
fig, ax = plt.subplots(figsize=(16, 10))
plot_confusion_matrix(estimator=gs_mnb, X=test['cleaned_text'], y_true=test['emotion'],
                      normalize='true', xticks_rotation='vertical', cmap='Greys', ax=ax,
                      include_values=True)
ax.set_title('Test-set confusion matrix (row-normalised)')
plt.show()

In [None]:
# Per-class precision, recall and F1 of mnb_predict against the test labels.
print(classification_report(test['emotion'], mnb_predict))