In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Dataset

In [2]:
# Every split is a header-less, ';'-separated text file holding one
# "sentence;emotion" record per line, so all three share the same read options.
_read_options = dict(names=['sentence', 'emotion'], header=None, sep=';')
train_data = pd.read_csv('../input/emotions-dataset-for-nlp/train.txt', **_read_options)
test_data = pd.read_csv('../input/emotions-dataset-for-nlp/test.txt', **_read_options)
val_data = pd.read_csv('../input/emotions-dataset-for-nlp/val.txt', **_read_options)

# Label each shape with its split so the three output lines can be told apart.
print('Train data:', train_data.shape)
print('Test data:', test_data.shape)
print('Validation data:', val_data.shape)

In [3]:
# Count missing values per column in each split, then report them.
train_nulls = train_data.isna().sum()
test_nulls = test_data.isna().sum()
val_nulls = val_data.isna().sum()

print('Train data Null Check:\n', train_nulls, '\n')
print('Test data Null Check:\n', test_nulls, '\n')
print('Validation data Null Check:\n', val_nulls)


In [4]:
# Class balance of the training split: number of sentences per emotion label.
train_data['emotion'].value_counts()

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
# Sentence length is measured in characters (len of the raw string), not in tokens.
train_data['length'] = train_data['sentence'].map(len)

# One density curve of sentence length per emotion.
plt.figure(figsize=(10, 7))
sns.kdeplot(data=train_data, x='length', hue='emotion')
plt.show()

In [6]:
# Longest training sentence in characters (`length` holds len() of each sentence string).
train_data['length'].max()

# Data Preprocessing

In [7]:
# Encode target labels with value between 0 and n_classes-1.
from sklearn import preprocessing
# Learn the emotion -> integer mapping from the training labels only and reuse that
# single mapping for the other splits. Refitting per split would silently assign
# different ids to the same emotion whenever a split is missing a class, and would
# leave `le` describing the last split instead of the training data.
le = preprocessing.LabelEncoder()
train_data['emotion'] = le.fit_transform(train_data['emotion'])
# transform() raises ValueError on a label never seen in training, which is the
# desired loud failure for an inconsistent split.
test_data['emotion'] = le.transform(test_data['emotion'])
val_data['emotion'] = le.transform(val_data['emotion'])
train_data.head()

In [9]:
# Preview the first rows of the test split.
test_data.head()

In [10]:
import re
import nltk
# Download the NLTK resources this notebook asks for.
# NOTE(review): only PorterStemmer is used below, which presumably does not need
# 'wordnet' — confirm before removing that download.
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Rebinds `stopwords` from the corpus module imported above to a plain set of English
# stop words; preprocessing() tests token membership against this set.
stopwords = set(nltk.corpus.stopwords.words('english'))
import warnings
# Suppress all Python warnings from this point on.
warnings.filterwarnings('ignore')

In [11]:
from tqdm import tqdm
from bs4 import BeautifulSoup
import string
# Number of rows in the embedding table (passed as the Embedding layer's input size).
vocab_size= 100000
# Fixed sequence length: encode() pads to this length and the Embedding layer is
# declared with it as input_length.
len_sentence = 150

# Ordered (pattern, replacement) rules used by decontracted(). Every pattern is
# anchored so that it can only match a whole token, never the inside of a longer
# word (e.g. "cant" must not fire inside "significant", "im" not inside "him").
_CONTRACTION_RULES = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # contractions written as one word, apostrophe dropped
        (r"\bwont\b", "will not"),
        (r"\bwouldnt\b", "would not"),
        (r"\bshouldnt\b", "should not"),
        (r"\bcouldnt\b", "could not"),
        (r"\bcudnt\b", "could not"),
        (r"\bcant\b", "can not"),
        (r"\bdont\b", "do not"),
        (r"\bdoesnt\b", "does not"),
        (r"\bdidnt\b", "did not"),
        (r"\bwasnt\b", "was not"),
        (r"\bwerent\b", "were not"),
        (r"\bhavent\b", "have not"),
        (r"\bhadnt\b", "had not"),
        (r"\bdunno\b", "do not know"),
        (r"\bive\b", "i have"),
        (r"\bim\b", "i am"),
        (r"\bi m\b", "i am"),
        # contractions whose apostrophe became a space; the irregular stems must be
        # handled before the generic "<stem>n t" rule ("can t" is not "ca not")
        (r"\bwon t\b", "will not"),
        (r"\bcan t\b", "can not"),
        (r"(?<=\w)n t\b", " not"),
        # detached suffix tokens: must follow "<word><space>" and end at a word boundary
        (r"(?<=\w )s\b", "is"),
        (r"(?<=\w )d\b", "would"),
        (r"(?<=\w )ll\b", "will"),
        (r"(?<=\w )w\b", "with"),
    )
]


def decontracted(phrase):
    """
    Expand apostrophe-less English contractions in ``phrase`` into full words.

    The rules target lowercase text in which apostrophes were either dropped
    ("cant", "im") or replaced by a space ("didn t", "it s"). Patterns are
    case-sensitive and are applied in the order listed in ``_CONTRACTION_RULES``.

    Parameters
    ----------
    phrase : str
        Text to expand.

    Returns
    -------
    str
        The text with every matching contraction replaced; text without
        contractions is returned unchanged.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = pattern.sub(replacement, phrase)
    return phrase

def preprocessing(df):
    """
    Clean every sentence in ``df['sentence']`` and return the cleaned strings.

    Steps, in order: lowercase, expand contractions, strip HTML markup, remove
    URLs and e-mail addresses, replace every remaining non-alphabetic character
    with a space, drop stop words, and Porter-stem the surviving tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``sentence`` column of strings.

    Returns
    -------
    list of str
        One cleaned, space-joined string per input row, in row order.
    """
    cleaned_review = []
    stemmer = PorterStemmer()

    for review_text in tqdm(df['sentence']):

        # The contraction rules and the stop-word set are lowercase, so normalise
        # case first or capitalised tokens would slip past both.
        review_text = decontracted(review_text.lower())

        # remove html tags
        review_text = BeautifulSoup(review_text, 'lxml').get_text().strip()

        # URLs and e-mail addresses have to be removed while ':', '/', '.' and '@'
        # are still present; once non-alphabetic characters are stripped these
        # patterns can no longer match anything.
        review_text = re.sub(r'https?://\S+|www\.\S+', ' ', review_text)
        # Unanchored so an address is removed anywhere in the sentence, not only
        # when the whole sentence is a single address.
        review_text = re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", ' ', review_text)

        # Everything that is not a letter (digits, punctuation, symbols) becomes a
        # space; this also covers all of string.punctuation.
        review_text = re.sub("[^a-zA-Z]", " ", review_text)

        # NOTE(review): the stop-word list appears to include negations such as
        # "not" — confirm that dropping them is intended for emotion classification.
        tokens = [stemmer.stem(word) for word in review_text.split() if word not in stopwords]
        cleaned_review.append(" ".join(tokens))

    return cleaned_review

In [12]:
# Run the shared cleaning pipeline on each split and store the result next to the raw text.
for split_frame in (train_data, test_data, val_data):
    split_frame['cleaned_data'] = preprocessing(split_frame)

In [13]:
# Spot-check the cleaned text of the test row labelled 0.
test_data.loc[0, 'cleaned_data']

In [14]:
def encode(text, num_words=None, max_len=None):
    """
    Turn cleaned sentences into fixed-length integer sequences for the model.

    Each sentence is passed to Keras ``one_hot``, which maps every word to an
    integer index via hashing (so distinct words can share an index), and the
    resulting lists are left-padded with zeros to a common length.

    Parameters
    ----------
    text : pandas.Series
        Cleaned sentences, one string per element.
    num_words : int, optional
        Size of the hashing space handed to ``one_hot``. Defaults to the
        notebook-level ``vocab_size`` so the indices always fit the Embedding
        layer that is built from the same constant.
    max_len : int, optional
        Length of every output sequence. Defaults to ``len_sentence``.

    Returns
    -------
    numpy.ndarray
        Integer array with one row per sentence and ``max_len`` columns.
    """
    if num_words is None:
        num_words = vocab_size
    if max_len is None:
        max_len = len_sentence

    hashed_sentences = [one_hot(input_text=sentence, n=num_words) for sentence in text]
    # padding="pre" puts the zeros in front so the real tokens sit at the end of each row.
    padded = pad_sequences(sequences=hashed_sentences, maxlen=max_len, padding="pre")
    print(text.shape)
    return padded

In [15]:
# Encode the cleaned text of each split (train, then validation, then test).
x_train, x_val, x_test = [
    encode(split_frame['cleaned_data'])
    for split_frame in (train_data, val_data, test_data)
]

In [16]:
# labels
# Integer emotion labels for each split.
y_train, y_val, y_test = (
    train_data['emotion'],
    val_data['emotion'],
    test_data['emotion'],
)

In [17]:
from sklearn.preprocessing import OneHotEncoder 
# What is one-hot encoding?
# One-hot encoding represents each value of a categorical variable as a binary vector.
# Fit the one-hot encoder on the training labels only and reuse it for the other
# splits, so every split gets the same columns in the same order even if one of
# them happens to lack a class.
# The labels are read from the data frames rather than from y_* themselves, which
# keeps this cell safe to re-run (it no longer feeds its own output back in).
onehot_encoder = OneHotEncoder()

y_train = onehot_encoder.fit_transform(
    train_data['emotion'].to_numpy().reshape(-1, 1)
).toarray()
print(y_train)

y_val = onehot_encoder.transform(
    val_data['emotion'].to_numpy().reshape(-1, 1)
).toarray()
print(y_val)

y_test = onehot_encoder.transform(
    test_data['emotion'].to_numpy().reshape(-1, 1)
).toarray()
print(y_test)

## Building a model

In [18]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [27]:
#building the neural network

# Network: Embedding -> Flatten -> Dense(64, sigmoid) -> Dense(6, softmax).
model = tf.keras.Sequential()

# Maps each of the len_sentence token ids to a 150-dimensional vector.
model.add(
    tf.keras.layers.Embedding(vocab_size, output_dim=150, input_length=len_sentence)
)
# Collapse the per-token vectors into one flat feature vector per sentence.
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(64, activation='sigmoid'))
# Six output units with softmax to match the one-hot encoded labels.
model.add(tf.keras.layers.Dense(6, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [28]:
# Stop training once validation loss has failed to improve by at least 0.01 for
# two consecutive epochs.
callbacks = [
    keras.callbacks.EarlyStopping(
        monitor="val_loss",
        min_delta=1e-2,
        patience=2,
        verbose=1,
        # Without this the model keeps the weights of the last epoch run, i.e. the
        # ones from `patience` epochs after the best validation loss; roll back to
        # the best epoch so later evaluation uses the best model seen.
        restore_best_weights=True,
    )
]

In [29]:
# Upper bound on epochs; the early-stopping callback may end training sooner.
num_epoch = 15
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=num_epoch,
    batch_size=64,
    callbacks=callbacks,
    verbose=1,
)

## Model Evaluation

In [30]:
# Score the trained model on the held-out test split; the model was compiled with
# an accuracy metric, so the result holds the loss followed by accuracy.
results = model.evaluate(x=x_test, y=y_test)
print(results)

In [31]:
# Per-epoch curves recorded by model.fit.
history_dict = history.history
acc, val_acc = history_dict['accuracy'], history_dict['val_accuracy']
loss, val_loss = history_dict['loss'], history_dict['val_loss']

# Epochs are numbered from 1 on the x-axis.
epochs = range(1, len(acc) + 1)

ax = plt.gca()
ax.plot(epochs, loss, 'bo', label='Training loss')        # blue dots
ax.plot(epochs, val_loss, 'b', label='Validation loss')   # solid blue line
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()

plt.show()

In [32]:
# Start from an empty figure, then draw the accuracy curves on its axes.
plt.clf()

ax = plt.gca()
ax.plot(epochs, acc, 'bo', label='Training acc')        # blue dots
ax.plot(epochs, val_acc, 'b', label='Validation acc')   # solid blue line
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()

plt.show()