# Sentiment Analysis Tweets

## Extract Data

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


#Train Data
trainDf =  pd.read_csv('Train.txt', sep='	', names=["id","text","emotion","intensity"], engine='python')
trainDf=trainDf.drop("id",axis=1)
print("Initial Shape of train data:",trainDf.shape)

AngerDf = trainDf[trainDf["emotion"]=="anger"].drop("emotion",axis=1)
AngerDf = AngerDf[AngerDf["intensity"]>=0.35]
AngerDf["emotion"] = 0

FearDf = trainDf[trainDf["emotion"]=="fear"].drop("emotion",axis=1)
FearDf = FearDf[FearDf["intensity"]>=0.4]
FearDf["emotion"] = 1

JoyDf = trainDf[trainDf["emotion"]=="joy"].drop("emotion",axis=1)
JoyDf = JoyDf[JoyDf["intensity"]>=0.292]
JoyDf["emotion"] = 2

SadDf = trainDf[trainDf["emotion"]=="sadness"].drop("emotion",axis=1)
SadDf = SadDf[SadDf["intensity"]>=0.263]
SadDf["emotion"] = 3

trainDf = pd.concat([AngerDf,FearDf,JoyDf,SadDf],ignore_index=True)
print("Final shape of train dataset:",trainDf.shape)
print(trainDf.head(10))
print()


#Cross Validation Data
crossValDf =  pd.read_csv('CrossValidate.txt', sep='	', names=["id","text","emotion","intensity"], engine='python')
crossValDf=crossValDf.drop("id",axis=1)
print("Initial Shape of CV data:",crossValDf.shape)


AngerDf = crossValDf[crossValDf["emotion"]=="anger"].drop("emotion",axis=1)
AngerDf = AngerDf[AngerDf["intensity"]>0.271]
AngerDf["emotion"] = 0

FearDf = crossValDf[crossValDf["emotion"]=="fear"].drop("emotion",axis=1)
FearDf = FearDf[FearDf["intensity"]>=0.250]
FearDf["emotion"] = 1

JoyDf = crossValDf[crossValDf["emotion"]=="joy"].drop("emotion",axis=1)
JoyDf = JoyDf[JoyDf["intensity"]>=0.292]
JoyDf["emotion"] = 2

SadDf = crossValDf[crossValDf["emotion"]=="sadness"].drop("emotion",axis=1)
SadDf = SadDf[SadDf["intensity"]>=0.333]
SadDf["emotion"] = 3

crossValDf = pd.concat([AngerDf,FearDf,JoyDf,SadDf],ignore_index=True)
print("Final shape of CV data:",crossValDf.shape)
print(crossValDf.head(10))
print()


#Test Data
testDf =  pd.read_csv('Test.txt', sep='	', names=["id","text","emotion","intensity"], engine='python')
testDf=testDf.drop("id",axis=1)
print("Initial Shape of Test data:",testDf.shape)


AngerDf = testDf[testDf["emotion"]=="anger"].drop("emotion",axis=1)
AngerDf = AngerDf[AngerDf["intensity"]>=0.400]
AngerDf["emotion"] = 0

FearDf = testDf[testDf["emotion"]=="fear"].drop("emotion",axis=1)
FearDf = FearDf[FearDf["intensity"]>=0.438]
FearDf["emotion"] = 1

JoyDf = testDf[testDf["emotion"]=="joy"].drop("emotion",axis=1)
JoyDf = JoyDf[JoyDf["intensity"]>=0.400]
JoyDf["emotion"] = 2

SadDf = testDf[testDf["emotion"]=="sadness"].drop("emotion",axis=1)
SadDf = SadDf[SadDf["intensity"]>=0.354]
SadDf["emotion"] = 3

testDf = pd.concat([AngerDf,FearDf,JoyDf,SadDf],ignore_index=True)
print("Final shape of Test data:",testDf.shape)
print(testDf.head(10))

Initial Shape of train data: (3317, 3)
Final shape of train dataset: (2840, 3)
                                                text  intensity  emotion
0  How the fu*k! Who the heck! moved my fridge!.....      0.938        0
1  So my Indian Uber driver just called someone t...      0.896        0
2  @DPD_UK I asked for my parcel to be delivered ...      0.896        0
3  so ef whichever butt wipe pulled the fire alar...      0.896        0
4  Don't join @BTCare they put the phone down on ...      0.896        0
5                                My blood is boiling      0.875        0
6  When you've still got a whole season of Wentwo...      0.875        0
7  @bt_uk why does tracking show my equipment del...      0.875        0
8  @TeamShanny legit why i am so furious with him...      0.875        0
9  How is it suppose to work if you do that? Wtf ...      0.875        0

Initial Shape of CV data: (347, 3)
Final shape of CV data: (296, 3)
                                                t

In [57]:
data_train= trainDf.drop(["intensity"],axis=1)
data_cv   = crossValDf.drop(["intensity"],axis=1)
data_test = testDf.drop(["intensity"],axis=1)

# emotion=0 means anger
# emotion=1 means fear
# emotion=2 means joy
# emotion=3 means sadness

## Text Cleaning

In [58]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

#Get set of english stop words and prepare stemmer
stop=set(stopwords.words("english"))
sno = nltk.stem.SnowballStemmer("english")

#Train data cleaning
train_text=data_train["text"]
cleaned_text=[]
for line in train_text:
    #Removing tags(ex-@abhishek is a name and not needed)
    tags = re.compile("^@[a-zA-Z_]*")
    line = re.sub(tags," ",line)
    #Replacing # and * with a ""
    hashtags = re.compile("#|\*")
    line = re.sub(hashtags,"",line)
    #Replacing all other characters with a space
    extraCharacters = re.compile("[^a-zA-Z]")
    line = re.sub(extraCharacters," ",line)
    filtered_words=[]
    
    #Conversion of line to array of words
    for word in line.split():
        word=word.lower()
        if(word not in stop):
            word = sno.stem(word)
            filtered_words.append(word)
    cleaned_text.append(filtered_words)
            
data_train["text"]=cleaned_text
print(data_train.head(10))
print()

#Cross Validate data cleaning
cv_text=data_cv["text"][0:]
cleaned_text=[]
for line in cv_text:
    #Removing tags(ex-@abhishek is a name and not needed)
    tags = re.compile("^@[a-zA-Z_]*")
    line = re.sub(tags," ",line)
    #Replacing # and * with "" 
    hashtags = re.compile("#|\*")
    line = re.sub(hashtags,"",line)
    #Replacing all other characters with a space
    extraCharacters = re.compile("[^a-zA-Z]")
    line = re.sub(extraCharacters," ",line)
    
    #Conversion of line to array of words
    filtered_words=[]
    for word in line.split():
        word=word.lower()
        if(word not in stop):
            word = sno.stem(word)
            filtered_words.append(word)
    cleaned_text.append(filtered_words)
            
data_cv["text"]=cleaned_text
print(data_cv.head(10))
print()

#Test data cleaning
test_text=data_test["text"][0:]
cleaned_text=[]
for line in test_text:
    #Removing tags(ex-@abhishek is a name and not needed)
    tags = re.compile("^@[a-zA-Z_]*")
    line = re.sub(tags," ",line)
    #Replacing hash and * with ""
    hashtags = re.compile("#|\*")
    line = re.sub(hashtags,"",line)
    #Replacing all other characters with a space
    extraCharacters = re.compile("[^a-zA-Z]")
    line = re.sub(extraCharacters," ",line)
    
    #Conversion of line to array of words
    filtered_words=[]
    for word in line.split():
        word=word.lower()
        if(word not in stop):
            word = sno.stem(word)
            filtered_words.append(word)
    cleaned_text.append(filtered_words)
            
data_test["text"]=cleaned_text
print(data_test.head(10))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                text  emotion
0  [fuk, heck, move, fridg, knock, landlord, door...        0
1  [indian, uber, driver, call, someon, n, word, ...        0
2  [ask, parcel, deliv, pick, store, address, fum...        0
3  [ef, whichev, butt, wipe, pull, fire, alarm, d...        0
4  [join, btcare, put, phone, talk, rude, take, m...        0
5                                      [blood, boil]        0
6  [still, got, whole, season, wentworth, watch, ...        0
7  [track, show, equip, deliv, servic, sudden, de...        0
8               [legit, furious, peopl, fuck, idiot]        0
9             [suppos, work, wtf, dude, thank, piss]        0

                                                text  emotion
0                   [pls, dont, insult, word, molna]        0
1        [would, almost, took, offens, actual, snap]        0
2  [rutger, game, abomin, affront, god, man, must...        0
3                [lisa, ask, start, rage, call, heh]        0
4  [som

## Text Vectorization(Word2Vec)

In [59]:
import gensim
from tqdm import tqdm

#Train on text data
list_of_sent = data_train["text"]
w2v_model=gensim.models.Word2Vec(list_of_sent,min_count=5,size=300, workers=4)
w2v_words=list(w2v_model.wv.vocab)

#Vectorize text data
listof_sent_vec=[]
#tqdm is for improving speed and making progressbar
#Vectorization and normalization both going on
for sent in tqdm(list_of_sent): 
    sent_vec = np.zeros(300) 
    cnt_words =0; 
    for word in sent: 
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    listof_sent_vec.append(sent_vec)
    
Label = data_train["emotion"]
list_col=tuple(range(300))
W2v_data_train=pd.DataFrame(data=listof_sent_vec, columns=list_col)
W2v_data_train["emotion"] = Label
print(W2v_data_train.head(10))
print(W2v_data_train.shape)

100%|████████████████████████████████████████████████████████████████████████████| 2840/2840 [00:00<00:00, 5338.52it/s]


          0         1         2         3         4         5         6  \
0  0.010811 -0.002395 -0.003304 -0.013111  0.004599 -0.010065 -0.012781   
1  0.016901 -0.003309 -0.006856 -0.022324  0.007567 -0.017228 -0.021352   
2  0.008887 -0.002072 -0.003029 -0.011886  0.004473 -0.009074 -0.011522   
3  0.009685 -0.002460 -0.003401 -0.012250  0.004529 -0.009266 -0.011931   
4  0.014243 -0.003578 -0.005921 -0.018841  0.006535 -0.014779 -0.017013   
5  0.008095 -0.002186 -0.003142 -0.011746  0.004185 -0.009562 -0.009984   
6  0.019486 -0.005017 -0.007621 -0.026569  0.009730 -0.020312 -0.024442   
7  0.011808 -0.002903 -0.003815 -0.016110  0.006302 -0.012099 -0.014492   
8  0.020335 -0.005600 -0.008251 -0.028469  0.009450 -0.022262 -0.025520   
9  0.011424 -0.003358 -0.003669 -0.014964  0.005508 -0.011640 -0.013327   

          7         8         9  ...       291       292       293       294  \
0 -0.004045  0.006342 -0.007068  ... -0.008057 -0.001864 -0.002956 -0.017881   
1 -0.006573  0

In [60]:
#Vectorize Cross Validate
list_of_sent= data_cv["text"]
listof_sent_vec=[]

for sent in tqdm(list_of_sent): 
    sent_vec = np.zeros(300) 
    cnt_words =0; 
    for word in sent: 
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    listof_sent_vec.append(sent_vec)
    
Label = data_cv["emotion"]
list_col=tuple(range(300))
W2v_data_cv=pd.DataFrame(data=listof_sent_vec, columns=list_col)
W2v_data_cv["emotion"] = Label
print(W2v_data_cv.head(10))
print(W2v_data_cv.shape)


#test
list_of_sent= data_test["text"]
listof_sent_vec=[]

for sent in tqdm(list_of_sent): 
    sent_vec = np.zeros(300) 
    cnt_words =0; 
    for word in sent: 
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    listof_sent_vec.append(sent_vec)
    
Label = data_test["emotion"]
list_col=tuple(range(300))
W2v_data_test=pd.DataFrame(data=listof_sent_vec, columns=list_col)
W2v_data_test["emotion"] = Label
print(W2v_data_test.head(10))
print(W2v_data_test.shape)

100%|██████████████████████████████████████████████████████████████████████████████| 296/296 [00:00<00:00, 6165.12it/s]


          0         1         2         3         4         5         6  \
0  0.008337 -0.000875 -0.003654 -0.012183  0.003616 -0.009058 -0.011530   
1  0.012515 -0.002597 -0.005005 -0.015760  0.005453 -0.012741 -0.015162   
2  0.016371 -0.003254 -0.005998 -0.022280  0.007538 -0.016211 -0.019662   
3  0.021432 -0.004745 -0.008489 -0.027819  0.009653 -0.021888 -0.025948   
4  0.022636 -0.005382 -0.008728 -0.030269  0.010977 -0.022794 -0.028101   
5  0.022878 -0.005374 -0.008810 -0.030510  0.010978 -0.023132 -0.028477   
6  0.016644 -0.004017 -0.006443 -0.022135  0.007325 -0.017237 -0.020397   
7  0.013912 -0.003019 -0.004869 -0.018542  0.006737 -0.014772 -0.017602   
8  0.014304 -0.003282 -0.005020 -0.018672  0.006891 -0.015207 -0.018162   
9  0.016041 -0.003655 -0.005263 -0.020422  0.006930 -0.016967 -0.020403   

          7         8         9  ...       291       292       293       294  \
0 -0.004037  0.005434 -0.005804  ... -0.008315 -0.001646 -0.001818 -0.016432   
1 -0.004520  0

100%|██████████████████████████████████████████████████████████████████████████████| 998/998 [00:00<00:00, 4352.72it/s]


          0         1         2         3         4         5         6  \
0  0.017389 -0.003800 -0.006450 -0.023239  0.007974 -0.017850 -0.021276   
1  0.010864 -0.002100 -0.003431 -0.014579  0.006130 -0.011176 -0.013766   
2  0.011196 -0.002033 -0.003909 -0.015354  0.006477 -0.011564 -0.014669   
3  0.008719 -0.002118 -0.003537 -0.012290  0.003580 -0.009142 -0.011427   
4  0.017617 -0.004084 -0.007694 -0.023714  0.008318 -0.017596 -0.022531   
5  0.005844 -0.002830 -0.001263 -0.008921  0.003466 -0.007650 -0.008355   
6  0.015716 -0.003962 -0.006178 -0.021699  0.008709 -0.015502 -0.020882   
7  0.014414 -0.003580 -0.006042 -0.020682  0.008283 -0.014645 -0.019650   
8  0.015586 -0.002803 -0.006083 -0.021088  0.007486 -0.015192 -0.017898   
9  0.015006 -0.003439 -0.005608 -0.021378  0.007912 -0.016516 -0.019343   

          7         8         9  ...       291       292       293       294  \
0 -0.006924  0.010780 -0.010771  ... -0.014549 -0.003471 -0.004094 -0.030183   
1 -0.004604  0