# EDA : Emotions Dataset

## Libraries 

In [2]:
import numpy  as np
import pandas as pd
import json

import datetime
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

import re
import nltk.corpus
from unidecode                        import unidecode
from nltk.tokenize                    import word_tokenize
from nltk                             import SnowballStemmer
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.preprocessing            import normalize

from sklearn import cluster

import matplotlib.pyplot  as plt
import matplotlib.cm      as cm
import seaborn            as sns
from sklearn.metrics                  import silhouette_samples, silhouette_score
from wordcloud                        import WordCloud

import folium
from branca.element import Figure
from sklearn.cluster import KMeans

In [3]:
emo_df = pd.read_csv('../raw_data/emotion_data.csv')

## Entire training set size

In [4]:
emo_df.shape

(61271, 2)

## Classes balance

In [5]:
emo_df['Emotion'].value_counts()

anger      15872
joy        14168
worry      11786
neutral    10212
sad         9233
Name: Emotion, dtype: int64

## Clean_data

In [6]:
def clean_data(data):
    """
    clean and preprocess data
    """
    # Lowercase text
    data['clean_text'] = data['Text'].apply(
        lambda x: x.lower()
        )
    # Strip whitespace
    data['clean_text'] = data['clean_text'].apply(
        lambda x: x.strip()
        )
    # Remove numbers
    data['clean_text'] = data['clean_text'].apply(
        lambda x: ''.join(let for let in x if not let.isdigit())
        )
    # Remove punctuation
    data['clean_text'] = data['clean_text'].apply(
        lambda x: ''.join(let for let in x if not let in string.punctuation)
        )
    # Tokenization with nltk
    data['clean_text'] = data['clean_text'].apply(
        lambda x: word_tokenize(x)
    )

    # Lemmatizing with nltk
    lemmatizer = WordNetLemmatizer()
    data['clean_text'] = data['clean_text'].apply(
        lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x)
        )

    # Return data
    return data

In [7]:
clean_data(emo_df)

Unnamed: 0,Emotion,Text,clean_text
0,sad,Layin n bed with a headache ughhhh...waitin o...,layin n bed with a headache ughhhhwaitin on yo...
1,sad,Funeral ceremony...gloomy friday...,funeral ceremonygloomy friday
2,joy,wants to hang out with friends SOON!,want to hang out with friend soon
3,worry,Re-pinging @ghostridah14: why didn't you go to...,repinging ghostridah why didnt you go to prom ...
4,sad,"I should be sleep, but im not! thinking about ...",i should be sleep but im not thinking about an...
5,worry,Hmmm. http://www.djhero.com/ is down,hmmm httpwwwdjherocom is down
6,sad,@charviray Charlene my love. I miss you,charviray charlene my love i miss you
7,sad,@kelcouch I'm sorry at least it's Friday?,kelcouch im sorry at least it friday
8,worry,Choked on her retainers,choked on her retainer
9,sad,Ugh! I have to beat this stupid song to get to...,ugh i have to beat this stupid song to get to ...


## Vectorize + 51 626 independent words

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vectorizer = TfidfVectorizer()

X = tf_idf_vectorizer.fit_transform(emo_df.clean_text)

X.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
pd.DataFrame(X.toarray(),columns = tf_idf_vectorizer.get_feature_names())

Unnamed: 0,aa,aaa,aaaa,aaaaaaaa,aaaaaaaaaaa,aaaaaaaaaahhhhhhhh,aaaaaaaaaamazing,aaaaaaaafternoon,aaaaaaaahhhhhhhh,aaaaaah,...,½ureo,½ve,½we,½who,½why,½whyyy,½y,½you,½z,½ï
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Vectorize with max features to see which words are the most used 

In [8]:
tf_idf_vectorizer2 = TfidfVectorizer(max_features = 20)

X2 = tf_idf_vectorizer2.fit_transform(emo_df.clean_text)

X2.toarray()

pd.DataFrame(X2.toarray(),columns = tf_idf_vectorizer2.get_feature_names())

Unnamed: 0,and,at,for,have,im,in,is,it,me,my,not,of,on,so,that,the,to,wa,with,you
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.697495,0.000000,0.000000,0.000000,0.000000,0.000000,0.716589,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.535400,0.000000,0.844599,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.844923,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.347794,0.000000,0.000000,0.406380
4,0.000000,0.000000,0.000000,0.000000,0.595366,0.000000,0.000000,0.000000,0.534605,0.000000,0.599780,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.720691,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.693257
7,0.000000,0.624166,0.000000,0.000000,0.628975,0.000000,0.000000,0.463473,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,0.000000,0.000000,0.000000,0.437930,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.286128,0.852261,0.000000,0.000000,0.000000


In [9]:
most_famous_words = tf_idf_vectorizer2.get_feature_names()

# wordcloud based on sentiment 

In [10]:
emo_df

Unnamed: 0,Emotion,Text,clean_text
0,sad,Layin n bed with a headache ughhhh...waitin o...,layin n bed with a headache ughhhhwaitin on yo...
1,sad,Funeral ceremony...gloomy friday...,funeral ceremonygloomy friday
2,joy,wants to hang out with friends SOON!,want to hang out with friend soon
3,worry,Re-pinging @ghostridah14: why didn't you go to...,repinging ghostridah why didnt you go to prom ...
4,sad,"I should be sleep, but im not! thinking about ...",i should be sleep but im not thinking about an...
5,worry,Hmmm. http://www.djhero.com/ is down,hmmm httpwwwdjherocom is down
6,sad,@charviray Charlene my love. I miss you,charviray charlene my love i miss you
7,sad,@kelcouch I'm sorry at least it's Friday?,kelcouch im sorry at least it friday
8,worry,Choked on her retainers,choked on her retainer
9,sad,Ugh! I have to beat this stupid song to get to...,ugh i have to beat this stupid song to get to ...


In [13]:
lda_model = LatentDirichletAllocation(n_components=5).fit(X)

NameError: name 'LatentDirichletAllocation' is not defined

In [None]:
# remove stopwords 
# lematize

# Tfdf 
## compare to see which words are most important in both 
# LDA - unsupervised - seeing if algo finds same thing as feelings 


# then do cloudwords
