# Assignment 2: Emotion Classification

In [41]:
#all dependencies

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
nltk.download('stopwords')

import tensorflow as tf

from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /Users/ygao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
#constants
DATA_LOCATION = '../data/go_emotions_dataset.csv'
CLEAN_DATA_LOCATION = '../data/'
MODEL_LOCATION = '../model/'
RANDOM_SEED = 297
CATEGORIES = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
              'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 
              'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 
              'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']

# Data preparation

We load the data from local source, and split it into train validation and test sets. 
Split corpus and target.

In [38]:
#load dataset
def load_data(filename):
    df = pd.read_csv(filename)
    df_raw_data = df.drop(columns=['id', 'example_very_unclear']) #text+categories
    category_columns = df_raw_data.columns[1:29] #extract categories
    df_data = pd.DataFrame({ #creating new df with 'text' and 'target
        'text': df_raw_data['text'],
        'target': df_raw_data[category_columns].apply(lambda row: row.values.tolist(), axis=1)
    })
    return df_data

df = load_data(DATA_LOCATION)

In [39]:
#split dataset into train, validation, test (70% 15% 15%)
#train: 147857 validation/test: 31684
df_train, df_temp = train_test_split(df, test_size=0.3, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=RANDOM_SEED)
#save the split data frames to csv just in case
df_train.to_csv(CLEAN_DATA_LOCATION+'raw_train.csv', index=False)
df_val.to_csv(CLEAN_DATA_LOCATION+'raw_val.csv', index=False)
df_test.to_csv(CLEAN_DATA_LOCATION+'raw_test.csv', index=False)

We have a closer look at the training set to see if the data is balanced. 

In [None]:
print("Possible sentiments are", np.unique(df_train['target']), "of", len(np.unique(df_train['target'])),"sentiments")
print("The number of posts for training is", len(df_train))

#visualize distribution
df_train_visual = pd.DataFrame(df_train['target'].tolist(), columns=CATEGORIES)
plt.figure(figsize=(15,6))
sns.barplot(data=df_train_visual, estimator=sum, palette='viridis')
plt.title('distribution of categories in training data')
plt.show()

#split into corpus and target
corpus_train = df_train['text']
target_train = df_train['target']

track = pd.DataFrame([corpus_train[5]], columns=["raw text example"]) #use to track progress
track.head()

Possible sentiments are [list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])
 list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])
 ...
 list([1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 list([1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 list([1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])] of 1701 sentiments
The number of posts for training is 147857


# Preprocessing