In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
os.listdir('data/')

['sample_submission.csv',
 'test_data.csv',
 'text_emotion.csv',
 'train_data.csv']

In [3]:
df = pd.read_csv('data/text_emotion.csv')
print (df.shape)
df.head()

(40000, 4)


Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


Let's filter out the columns we require

In [4]:
cols = ['sentiment', 'content']
df = df[cols]

We will now run various feature extraction tools to get a good idea about our data.

### 1. Text Preprocessing
#### 1.1. Remove Stop Words and Punctuation

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

In [6]:
stop_words = set(stopwords.words('english')) 

In [7]:
def convert_list_to_string(s): 
  
    # initialization of string to "" 
    new = "" 
  
    # traverse in the string  
    for x in s: 
        new += x +' ' 
  
    # return string  
    return new.lower() # converting string to lowercase

In [8]:
def remove_stopwords(sentence):

    word_tokens = word_tokenize(sentence) 

    filtered_sentence = [w for w in word_tokens if not w in stop_words] 

    filtered_sentence = [] 

    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w) 
            
    return convert_list_to_string(filtered_sentence)

In [9]:
from nltk.tokenize import RegexpTokenizer

def remove_tokens(sentence):

    tokenizer = RegexpTokenizer(r'\w+')
    return convert_list_to_string(tokenizer.tokenize(sentence))

In [10]:
import pandas as pd

#extract lemma
def extract_lemma(sent):
    parsed_text = {'word':[], 'lemma':[]}
    #for sent in doc.sentences:
    for wrd in sent:
        #extract text and lemma
        parsed_text['word'].append(wrd.text)
        parsed_text['lemma'].append(wrd.lemma)
    #return a dataframe
    return pd.DataFrame(parsed_text)

In [11]:
from nltk.stem import WordNetLemmatizer 
  
def extract_lemma(sent):
    lemmatizer = WordNetLemmatizer() 
    word_tokens = word_tokenize(sent) 
    
    lemmatized_sentence = []
    for word in word_tokens:
        lemmatized_sentence.append(lemmatizer.lemmatize(word))
    
    return convert_list_to_string(lemmatized_sentence)

#### 1.2. Driver Function to Process text

In [12]:
def process_text(sentence):
    return extract_lemma(remove_tokens(remove_stopwords(sentence)))

In [13]:
process_text(df['content'].iloc[1])

'layin n bed headache ughhhh waitin call '

### 2. Feature extraction

Setting up Vader NLP.
#### 2.1. Vader Sentiment Feature Extraction

In [14]:
#!pip install requests

In [15]:
#!pip install vaderSentiment

In [16]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [17]:
def sentiment_analyzer_scores(sentence):
    score = analyser.polarity_scores(sentence)
    #print("{:-<40} {}".format(sentence, str(score)))
    return score

In [18]:
sentiment_analyzer_scores(df['content'].iloc[0])

{'neg': 0.2, 'neu': 0.8, 'pos': 0.0, 'compound': -0.5423}

#### 2.2. Textblob NLP Feature Extraction

In [19]:
#!pip install textblob

In [20]:
from textblob import TextBlob
import nltk
#nltk.download('brown')

In [21]:
def get_textblob_features(sentence):
    
    blob = TextBlob(process_text(sentence))
    
    textblob_dict = {}
    
    ## Number of noun phrases in the sentence
    textblob_dict['Number of Noun Phrases'] = len(list(blob.noun_phrases))
    
    ## Number of POS tags in the sentence
    textblob_dict['Number of POS tags'] = len(list(blob.pos_tags))
    
    ## Polarity of the sentence
    textblob_dict['Sentence Polarity'] = blob.sentiment_assessments.polarity
    
    ## Polarity of the sentence
    textblob_dict['Sentence Subjectivity'] = blob.sentiment_assessments.polarity
    
    ## Number of words in the processed sentence
    textblob_dict['Number of words'] = len(list(blob.words))
    
    ## Language of the sentence
    textblob_dict['Language Detected'] = blob.detect_language()
    
    return textblob_dict

In [22]:
get_textblob_features(df['content'].iloc[1002])

{'Number of Noun Phrases': 1,
 'Number of POS tags': 6,
 'Sentence Polarity': -0.1,
 'Sentence Subjectivity': -0.1,
 'Number of words': 6,
 'Language Detected': 'en'}

#### 2.3. spaCy Feature Extraction
Ref. https://spacy.io/usage/spacy-101

In [23]:
import spacy

spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

In [24]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [26]:
def get_entities(sentence):
    doc = nlp(sentence)
    
    entities_dict = {}
    for ent in doc.ents:
        entities_dict[ent.label_] =  ent.text
        
    return entities_dict

In [27]:
get_entities("Apple is looking at buying U.K. startup for $1 billion")

{'ORG': 'Apple', 'GPE': 'U.K.', 'MONEY': '$1 billion'}

#### 2.4. afinn Sentiment score

In [44]:
#!pip install afinn

In [46]:
from afinn import Afinn
afinn = Afinn()

def afinn_score(sentence):
    return afinn.score(sentence)

#### 2.5. SentiWordNet

In [69]:
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.stem import PorterStemmer

def penn_to_wn(tag):

    #Convert between the PennTreebank tags to simple Wordnet tags

    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def get_sentiment(word,tag):
    #returns list of pos neg and objective score. But returns empty list if not present in senti wordnet.


    wn_tag = penn_to_wn(tag)
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
        return []

    lemma = lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        return []

    synsets = wn.synsets(word, pos=wn_tag)

    if not synsets:
        return []

    # Take the first sense, the most common
    synset = synsets[0]
    swn_synset = swn.senti_synset(synset.name())

    return [swn_synset.pos_score(),swn_synset.neg_score(),swn_synset.obj_score()]


ps = PorterStemmer()
words_data = ['this','movie','is','wonderful']
words_data = [ps.stem(x) for x in words_data]

pos_val = nltk.pos_tag(words_data)
senti_val=[ get_sentiment(x,y) for (x,y) in pos_val]

In [70]:
pos_val

[('thi', 'NN'), ('movi', 'NN'), ('is', 'VBZ'), ('wonder', 'JJR')]

### 3. Preparing Training Data

In [36]:
df.head()

Unnamed: 0,sentiment,content,processed_sentence,vader_neg,vader_neu,vader_pos,vader_compound
0,empty,@tiffanylue i know i was listenin to bad habi...,tiffanylue know listenin bad habit earlier sta...,0.304,0.696,0.0,-0.5423
1,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed headache ughhhh waitin call,0.0,1.0,0.0,0.0
2,sadness,Funeral ceremony...gloomy friday...,funeral ceremony gloomy friday,0.672,0.328,0.0,-0.4767
3,enthusiasm,wants to hang out with friends SOON!,want hang friend soon,0.0,0.308,0.692,0.5423
4,neutral,@dannycastillo We want to trade with someone w...,dannycastillo we want trade someone houston ti...,0.0,0.843,0.157,0.0772


#### 3.1. Processing the sentence

In [28]:
## Processing the text for NLP
df['processed_sentence'] = df['content'].apply(lambda x: process_text_text(x))

#### 3.2. Vader Sentiment Extraction

In [35]:
## Vader Sentiment Feature Extraction

df['vader_neg'] = df['processed_sentence'].apply(lambda x: sentiment_analyzer_scores(x)['neg'])
df['vader_neu'] = df['processed_sentence'].apply(lambda x: sentiment_analyzer_scores(x)['neu'])
df['vader_pos'] = df['processed_sentence'].apply(lambda x: sentiment_analyzer_scores(x)['pos'])
df['vader_compound'] = df['processed_sentence'].apply(lambda x: sentiment_analyzer_scores(x)['compound'])

#### 3.3. TextBlob Feature Extraction

In [37]:
get_textblob_features(df['content'].iloc[1002])

{'Number of Noun Phrases': 1,
 'Number of POS tags': 6,
 'Sentence Polarity': -0.1,
 'Sentence Subjectivity': -0.1,
 'Number of words': 6,
 'Language Detected': 'en'}

In [None]:
## TextBlob Feature Extraction

#try:
df['Number of Noun Phrases'] = df['processed_sentence'].apply(lambda x: get_textblob_features(x)['Number of Noun Phrases'])
df['Number of POS tags'] = df['processed_sentence'].apply(lambda x: get_textblob_features(x)['Number of POS tags'])
df['Sentence Polarity'] = df['processed_sentence'].apply(lambda x: get_textblob_features(x)['Sentence Polarity'])
df['Sentence Subjectivity'] = df['processed_sentence'].apply(lambda x: get_textblob_features(x)['Sentence Subjectivity'])
df['Number of words'] = df['processed_sentence'].apply(lambda x: get_textblob_features(x)['Number of words'])
#except:
#    pass

In [41]:
df.head()

Unnamed: 0,sentiment,content,processed_sentence,vader_neg,vader_neu,vader_pos,vader_compound
0,empty,@tiffanylue i know i was listenin to bad habi...,tiffanylue know listenin bad habit earlier sta...,0.304,0.696,0.0,-0.5423
1,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed headache ughhhh waitin call,0.0,1.0,0.0,0.0
2,sadness,Funeral ceremony...gloomy friday...,funeral ceremony gloomy friday,0.672,0.328,0.0,-0.4767
3,enthusiasm,wants to hang out with friends SOON!,want hang friend soon,0.0,0.308,0.692,0.5423
4,neutral,@dannycastillo We want to trade with someone w...,dannycastillo we want trade someone houston ti...,0.0,0.843,0.157,0.0772
