In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
os.listdir('data/')

['sample_submission.csv',
 'test_data.csv',
 'text_emotion.csv',
 'train_data.csv']

In [4]:
df = pd.read_csv('data/text_emotion.csv')
print (df.shape)
df.head()

(40000, 4)


Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


Let's filter out the columns we require

In [5]:
cols = ['sentiment', 'content']
df = df[cols]

We will now run various feature extraction tools to get a good idea about our data.

### 1. Text Preprocessing
#### 1.1. Remove Stop Words and Punctuation

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

In [7]:
stop_words = set(stopwords.words('english')) 

In [8]:
def convert_list_to_string(s): 
  
    # initialization of string to "" 
    new = "" 
  
    # traverse in the string  
    for x in s: 
        new += x +' ' 
  
    # return string  
    return new.lower() # converting string to lowercase

In [9]:
def remove_stopwords(sentence):

    word_tokens = word_tokenize(sentence) 

    filtered_sentence = [w for w in word_tokens if not w in stop_words] 

    filtered_sentence = [] 

    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w) 
            
    return convert_list_to_string(filtered_sentence)

In [10]:
from nltk.tokenize import RegexpTokenizer

def remove_tokens(sentence):

    tokenizer = RegexpTokenizer(r'\w+')
    return convert_list_to_string(tokenizer.tokenize(sentence))

In [11]:
import pandas as pd

#extract lemma
def extract_lemma(sent):
    parsed_text = {'word':[], 'lemma':[]}
    #for sent in doc.sentences:
    for wrd in sent:
        #extract text and lemma
        parsed_text['word'].append(wrd.text)
        parsed_text['lemma'].append(wrd.lemma)
    #return a dataframe
    return pd.DataFrame(parsed_text)

In [12]:
from nltk.stem import WordNetLemmatizer 
  
def extract_lemma(sent):
    lemmatizer = WordNetLemmatizer() 
    word_tokens = word_tokenize(sent) 
    
    lemmatized_sentence = []
    for word in word_tokens:
        lemmatized_sentence.append(lemmatizer.lemmatize(word))
    
    return convert_list_to_string(lemmatized_sentence)

#### 1.2. Driver Function to Process text

In [13]:
def process_text(sentence):
    return extract_lemma(remove_tokens(remove_stopwords(sentence)))

In [14]:
process_text(df['content'].iloc[1])

'layin n bed headache ughhhh waitin call '

### 2. Feature extraction

Setting up Vader NLP.
#### 2.1. Vader Sentiment Feature Extraction

In [15]:
#!pip install requests

In [16]:
#!pip install vaderSentiment

In [17]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [18]:
def sentiment_analyzer_scores(sentence):
    score = analyser.polarity_scores(sentence)
    #print("{:-<40} {}".format(sentence, str(score)))
    return score

In [19]:
sentiment_analyzer_scores(df['content'].iloc[0])

{'neg': 0.2, 'neu': 0.8, 'pos': 0.0, 'compound': -0.5423}

#### 2.2. Textblob NLP Feature Extraction

In [20]:
#!pip install textblob

In [21]:
from textblob import TextBlob
import nltk
#nltk.download('brown')

In [22]:
def get_textblob_features(sentence):
    
    blob = TextBlob(process_text(sentence))
    
    textblob_dict = {}
    
    ## Number of noun phrases in the sentence
    textblob_dict['Number of Noun Phrases'] = len(list(blob.noun_phrases))
    
    ## Number of POS tags in the sentence
    textblob_dict['Number of POS tags'] = len(list(blob.pos_tags))
    
    ## Polarity of the sentence
    textblob_dict['Sentence Polarity'] = blob.sentiment_assessments.polarity
    
    ## Polarity of the sentence
    textblob_dict['Sentence Subjectivity'] = blob.sentiment_assessments.polarity
    
    ## Number of words in the processed sentence
    textblob_dict['Number of words'] = len(list(blob.words))
    
    ## Language of the sentence
    textblob_dict['Language Detected'] = blob.detect_language()
    
    return textblob_dict

In [24]:
#get_textblob_features(df['content'].iloc[1002])

#### 2.3. spaCy Feature Extraction
Ref. https://spacy.io/usage/spacy-101

In [25]:
import spacy

spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

In [26]:
def get_entities(sentence):
    doc = nlp(sentence)
    
    entities_dict = {}
    for ent in doc.ents:
        entities_dict[ent.label_] =  ent.text
        
    return entities_dict

#### 2.4. afinn Sentiment score

In [27]:
#!pip install afinn

In [28]:
from afinn import Afinn
afinn = Afinn()

def afinn_score(sentence):
    return afinn.score(sentence)

#### 2.5. TF-IDF Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer()
x = v.fit_transform(df['processed_sentence'])

In [32]:
tf_idf_array = x.toarray()

In [81]:
tf_idf_df = pd.DataFrame(tf_idf_array)[cols]

In [86]:
#tf_idf_df = tf_idf_df.reset_index()
tf_idf_df.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,990,991,992,993,994,995,996,997,998,999
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
#df = df.reset_index()
df.head()

Unnamed: 0,index,sentiment,content,processed_sentence
0,0,empty,@tiffanylue i know i was listenin to bad habi...,tiffanylue know listenin bad habit earlier sta...
1,1,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed headache ughhhh waitin call
2,2,sadness,Funeral ceremony...gloomy friday...,funeral ceremony gloomy friday
3,3,enthusiasm,wants to hang out with friends SOON!,want hang friend soon
4,4,neutral,@dannycastillo We want to trade with someone w...,dannycastillo we want trade someone houston ti...


In [None]:
result = df.merge(tf_idf_df, on='index')

### 3. Preparing Training Data

In [90]:
df.head()

Unnamed: 0,index,sentiment,content,processed_sentence
0,0,empty,@tiffanylue i know i was listenin to bad habi...,tiffanylue know listenin bad habit earlier sta...
1,1,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed headache ughhhh waitin call
2,2,sadness,Funeral ceremony...gloomy friday...,funeral ceremony gloomy friday
3,3,enthusiasm,wants to hang out with friends SOON!,want hang friend soon
4,4,neutral,@dannycastillo We want to trade with someone w...,dannycastillo we want trade someone houston ti...


#### 3.1. Processing the sentence

In [30]:
## Processing the text for NLP
df['processed_sentence'] = df['content'].apply(lambda x: process_text(x))

#### 3.2. Vader Sentiment Extraction

In [91]:
## Vader Sentiment Feature Extraction

df['vader_neg'] = df['processed_sentence'].apply(lambda x: sentiment_analyzer_scores(x)['neg'])
df['vader_neu'] = df['processed_sentence'].apply(lambda x: sentiment_analyzer_scores(x)['neu'])
df['vader_pos'] = df['processed_sentence'].apply(lambda x: sentiment_analyzer_scores(x)['pos'])
df['vader_compound'] = df['processed_sentence'].apply(lambda x: sentiment_analyzer_scores(x)['compound'])

#### 3.3. TextBlob Feature Extraction

In [37]:
get_textblob_features(df['content'].iloc[1002])

{'Number of Noun Phrases': 1,
 'Number of POS tags': 6,
 'Sentence Polarity': -0.1,
 'Sentence Subjectivity': -0.1,
 'Number of words': 6,
 'Language Detected': 'en'}

In [40]:
## TextBlob Feature Extraction
'''
#try:
df['Number of Noun Phrases'] = df['processed_sentence'].apply(lambda x: get_textblob_features(x)['Number of Noun Phrases'])
df['Number of POS tags'] = df['processed_sentence'].apply(lambda x: get_textblob_features(x)['Number of POS tags'])
df['Sentence Polarity'] = df['processed_sentence'].apply(lambda x: get_textblob_features(x)['Sentence Polarity'])
df['Sentence Subjectivity'] = df['processed_sentence'].apply(lambda x: get_textblob_features(x)['Sentence Subjectivity'])
df['Number of words'] = df['processed_sentence'].apply(lambda x: get_textblob_features(x)['Number of words'])
#except:
#    pass
'''

"\n#try:\ndf['Number of Noun Phrases'] = df['processed_sentence'].apply(lambda x: get_textblob_features(x)['Number of Noun Phrases'])\ndf['Number of POS tags'] = df['processed_sentence'].apply(lambda x: get_textblob_features(x)['Number of POS tags'])\ndf['Sentence Polarity'] = df['processed_sentence'].apply(lambda x: get_textblob_features(x)['Sentence Polarity'])\ndf['Sentence Subjectivity'] = df['processed_sentence'].apply(lambda x: get_textblob_features(x)['Sentence Subjectivity'])\ndf['Number of words'] = df['processed_sentence'].apply(lambda x: get_textblob_features(x)['Number of words'])\n#except:\n#    pass\n"

TextBlob accepts strings with minimum 3 characters.

In [39]:
df.head()

Unnamed: 0,sentiment,content,processed_sentence,vader_neg,vader_neu,vader_pos,vader_compound
0,empty,@tiffanylue i know i was listenin to bad habi...,tiffanylue know listenin bad habit earlier sta...,0.304,0.696,0.0,-0.5423
1,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed headache ughhhh waitin call,0.0,1.0,0.0,0.0
2,sadness,Funeral ceremony...gloomy friday...,funeral ceremony gloomy friday,0.672,0.328,0.0,-0.4767
3,enthusiasm,wants to hang out with friends SOON!,want hang friend soon,0.0,0.308,0.692,0.5423
4,neutral,@dannycastillo We want to trade with someone w...,dannycastillo we want trade someone houston ti...,0.0,0.843,0.157,0.0772


### 4. Preparing Data for Training
#### 4.1. Encoding String/ Categorical Data

In [92]:
df.head()

Unnamed: 0,index,sentiment,content,processed_sentence,vader_neg,vader_neu,vader_pos,vader_compound
0,0,empty,@tiffanylue i know i was listenin to bad habi...,tiffanylue know listenin bad habit earlier sta...,0.304,0.696,0.0,-0.5423
1,1,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed headache ughhhh waitin call,0.0,1.0,0.0,0.0
2,2,sadness,Funeral ceremony...gloomy friday...,funeral ceremony gloomy friday,0.672,0.328,0.0,-0.4767
3,3,enthusiasm,wants to hang out with friends SOON!,want hang friend soon,0.0,0.308,0.692,0.5423
4,4,neutral,@dannycastillo We want to trade with someone w...,dannycastillo we want trade someone houston ti...,0.0,0.843,0.157,0.0772


In [102]:
df.columns.values

array(['index', 'sentiment', 'content', 'processed_sentence', 'vader_neg',
       'vader_neu', 'vader_pos', 'vader_compound', 'Encoded Targets'],
      dtype=object)

In [93]:
df['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [94]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [95]:
le.fit(df['sentiment'])

LabelEncoder()

In [97]:
le.classes_

array(['anger', 'boredom', 'empty', 'enthusiasm', 'fun', 'happiness',
       'hate', 'love', 'neutral', 'relief', 'sadness', 'surprise',
       'worry'], dtype=object)

In [99]:
df['Encoded Targets'] = le.transform(df['sentiment'])

In [103]:
df['Encoded Targets'].unique()

array([ 2, 10,  3,  8, 12, 11,  7,  4,  6,  5,  1,  9,  0], dtype=int64)

#### 4.2. Defining Dependent and Independent Variables

In [104]:
vader_features = ['vader_neg', 'vader_neu', 'vader_pos', 'vader_compound']
target = ['Encoded Targets']

In [105]:
X = df[vader_features]
y = df[target]

### 5. Start the Machine Learning Classification

In [106]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [107]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [108]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((30000, 4), (10000, 4), (30000, 1), (10000, 1))

In [109]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier()

In [110]:
rf_clf.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [111]:
rf_clf.score(X_test, y_test)

0.2711