# Sentiment Analysis of Tweets

## Extract Data

In [119]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


#Emotion name -> integer class label (anger=0, fear=1, joy=2, sadness=3)
EMOTION_CODES = {"anger": 0, "fear": 1, "joy": 2, "sadness": 3}
COLUMN_NAMES = ["id", "text", "emotion", "intensity"]


def read_tweets(path):
    """Read one tab-separated tweet file and drop the unused ``id`` column.

    The separator is spelled ``"\\t"`` rather than a literal tab character so
    that it stays visible and survives editors that convert tabs to spaces.
    """
    return pd.read_csv(path, sep="\t", names=COLUMN_NAMES, engine="python").drop("id", axis=1)


def select_by_intensity(frame, min_intensity, strict=()):
    """Split ``frame`` by emotion and keep only rows above an intensity cut-off.

    Parameters
    ----------
    frame : DataFrame
        Must contain ``text``, ``emotion`` (emotion name) and ``intensity``.
    min_intensity : dict
        Emotion name -> intensity cut-off for that emotion.
    strict : collection of str, optional
        Emotion names whose cut-off is exclusive (``>``). Every other emotion
        uses an inclusive cut-off (``>=``).

    Returns
    -------
    list of DataFrame
        One frame per emotion in anger, fear, joy, sadness order, each with the
        columns ``text``, ``intensity`` and the integer ``emotion`` code.
    """
    selected = []
    for emotion, code in EMOTION_CODES.items():
        rows = frame[frame["emotion"] == emotion].drop("emotion", axis=1)
        if emotion in strict:
            keep = rows["intensity"] > min_intensity[emotion]
        else:
            keep = rows["intensity"] >= min_intensity[emotion]
        # .assign builds a new frame instead of writing into a filtered slice,
        # which is what can trigger pandas' SettingWithCopyWarning.
        selected.append(rows[keep].assign(emotion=code))
    return selected


#Train Data
trainDf = read_tweets('Train.txt')
print("Initial Shape of train data:",trainDf.shape)

AngerDf, FearDf, JoyDf, SadDf = select_by_intensity(
    trainDf, {"anger": 0.35, "fear": 0.4, "joy": 0.292, "sadness": 0.263}
)
trainDf = pd.concat([AngerDf,FearDf,JoyDf,SadDf],ignore_index=True)
print("Final shape of train dataset:",trainDf.shape)
print(trainDf.head(10))
print()


#Cross Validation Data
crossValDf = read_tweets('CrossValidate.txt')
print("Initial Shape of CV data:",crossValDf.shape)

# NOTE(review): anger is the only cut-off in the notebook that is exclusive
# (> 0.271); every other one is inclusive. Kept as-is - confirm it is intended.
AngerDf, FearDf, JoyDf, SadDf = select_by_intensity(
    crossValDf,
    {"anger": 0.271, "fear": 0.250, "joy": 0.292, "sadness": 0.333},
    strict={"anger"},
)
crossValDf = pd.concat([AngerDf,FearDf,JoyDf,SadDf],ignore_index=True)
print("Final shape of CV data:",crossValDf.shape)
print(crossValDf.head(10))
print()


#Test Data
testDf = read_tweets('Test.txt')
print("Initial Shape of Test data:",testDf.shape)

AngerDf, FearDf, JoyDf, SadDf = select_by_intensity(
    testDf, {"anger": 0.400, "fear": 0.438, "joy": 0.400, "sadness": 0.354}
)
testDf = pd.concat([AngerDf,FearDf,JoyDf,SadDf],ignore_index=True)
print("Final shape of Test data:",testDf.shape)
print(testDf.head(10))

Initial Shape of train data: (3317, 3)
Final shape of train dataset: (2840, 3)
                                                text  intensity  emotion
0  How the fu*k! Who the heck! moved my fridge!.....      0.938        0
1  So my Indian Uber driver just called someone t...      0.896        0
2  @DPD_UK I asked for my parcel to be delivered ...      0.896        0
3  so ef whichever butt wipe pulled the fire alar...      0.896        0
4  Don't join @BTCare they put the phone down on ...      0.896        0
5                                My blood is boiling      0.875        0
6  When you've still got a whole season of Wentwo...      0.875        0
7  @bt_uk why does tracking show my equipment del...      0.875        0
8  @TeamShanny legit why i am so furious with him...      0.875        0
9  How is it suppose to work if you do that? Wtf ...      0.875        0

Initial Shape of CV data: (347, 3)
Final shape of CV data: (296, 3)
                                                t

In [120]:
# From here on only the tweet text and its emotion label are used,
# so the intensity column is dropped from every split.
data_train, data_cv, data_test = (
    frame.drop(columns=["intensity"]) for frame in (trainDf, crossValDf, testDf)
)

# Label encoding of the "emotion" column:
#   0 -> anger
#   1 -> fear
#   2 -> joy
#   3 -> sadness

## Text Cleaning

In [121]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

#Get set of english stop words and prepare stemmer
#Stop words for bag of words are different because we will use bigrams:
#"not" and "no" are kept there so that negations survive into the text
stop=set(stopwords.words("english"))
stop_bow = set(stopwords.words("english"))
stop_bow.discard("not")
stop_bow.discard("no")
sno = nltk.stem.SnowballStemmer("english")

#Patterns are compiled once instead of once per tweet; raw strings avoid the
#invalid "\*" escape-sequence warning.
#NOTE(review): "^" anchors the tag pattern to the start of the tweet, so only a
#leading @mention is stripped; a mention later in the text survives as a plain
#word (e.g. "btcare" in the train output). Confirm whether that is intended.
tags = re.compile(r"^@[a-zA-Z_]*")
hashtags = re.compile(r"#|\*")
extraCharacters = re.compile(r"[^a-zA-Z]")


def clean_tweets(texts):
    """Clean raw tweets for both Word2Vec and bag-of-words.

    Each tweet has a leading @tag removed, "#" and "*" deleted and every other
    non-letter replaced by a space. The remaining words are lower-cased,
    filtered against the stop-word sets and stemmed exactly once.

    The same rules are applied to every split, so the bag-of-words text of the
    cross-validation and test data keeps "not"/"no" just like the train data.

    Parameters
    ----------
    texts : iterable of str
        Raw tweet texts.

    Returns
    -------
    (list of list of str, list of str)
        Per tweet: the stemmed tokens without ``stop`` words (Word2Vec input),
        and a string of stemmed words without ``stop_bow`` words, each word
        preceded by a single space (bag-of-words input).
    """
    token_lists = []
    bow_strings = []
    for line in texts:
        #Removing tags(ex-@abhishek is a name and not needed)
        line = tags.sub(" ", line)
        #Replacing # and * with ""
        line = hashtags.sub("", line)
        #Replacing all other characters with a space
        line = extraCharacters.sub(" ", line)

        filtered_words = []
        filtered_words_bow = []
        for word in line.split():
            word = word.lower()
            # stop_bow is a subset of stop, so such a word is dropped from both
            if word in stop_bow:
                continue
            # Decide on the un-stemmed word, then stem a single time
            stem = sno.stem(word)
            filtered_words_bow.append(stem)
            if word not in stop:
                filtered_words.append(stem)
        token_lists.append(filtered_words)
        bow_strings.append("".join(" " + stem for stem in filtered_words_bow))
    return token_lists, bow_strings


#NOTE: the "text" column of data_train / data_cv / data_test is overwritten in
#place below, so re-running this cell needs the previous cell re-run first.

#Train data cleaning
train_text=data_train["text"]
cleaned_text, cleaned_text_bow = clean_tweets(train_text)

data_train_bow = pd.DataFrame(data=cleaned_text_bow,columns=["text"])
data_train_bow["emotion"] = data_train["emotion"]
data_train["text"]=cleaned_text
print(data_train.head(10))
print()
print(data_train_bow.head(10))
print()

#Cross Validate data cleaning
cv_text=data_cv["text"][0:]
cleaned_text, cleaned_text_bow = clean_tweets(cv_text)

data_cv_bow = pd.DataFrame(data=cleaned_text_bow, columns=["text"])
data_cv_bow["emotion"]=data_cv["emotion"]
data_cv["text"]=cleaned_text
print(data_cv.head(10))
print()
print(data_cv_bow.head(10))
print()

#Test data cleaning
test_text=data_test["text"][0:]
cleaned_text, cleaned_text_bow = clean_tweets(test_text)

data_test_bow = pd.DataFrame(data=cleaned_text_bow, columns=["text"])
data_test_bow["emotion"]=data_test["emotion"]
data_test["text"]=cleaned_text
print(data_test.head(10))
print(data_test_bow.head(10))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                text  emotion
0  [fuk, heck, move, fridg, knock, landlord, door...        0
1  [indian, uber, driver, call, someon, n, word, ...        0
2  [ask, parcel, deliv, pick, store, address, fum...        0
3  [ef, whichev, butt, wipe, pull, fire, alarm, d...        0
4  [join, btcare, put, phone, talk, rude, take, m...        0
5                                      [blood, boil]        0
6  [still, got, whole, season, wentworth, watch, ...        0
7  [track, show, equip, deliv, servic, sudden, de...        0
8               [legit, furious, peopl, fuck, idiot]        0
9             [suppos, work, wtf, dude, thank, piss]        0

                                                text  emotion
0   fuk heck move fridg knock landlord door angri...        0
1   indian uber driver call someon n word move ve...        0
2   ask parcel deliv pick store not address fume ...        0
3   ef whichev butt wipe pull fire alarm davi bc ...        0
4   joi

## Text Vectorization (Word2Vec)

In [122]:
import gensim
from tqdm import tqdm

#Train Word2Vec on the tokenised training tweets
#NOTE: `size=` and `wv.vocab` are the gensim 3.x spellings; gensim 4 renamed
#them to `vector_size=` and `wv.key_to_index`.
W2V_DIM = 300
list_of_sent = data_train["text"]
w2v_model=gensim.models.Word2Vec(list_of_sent,min_count=5,size=W2V_DIM, workers=4)
w2v_words=list(w2v_model.wv.vocab)
#Set copy of the vocabulary: constant-time membership tests instead of
#scanning the whole list for every token
w2v_vocab = set(w2v_words)

#Vectorize text data: each tweet becomes the mean of the vectors of its
#in-vocabulary words (tqdm only draws the progress bar)
listof_sent_vec=[]
for sent in tqdm(list_of_sent):
    sent_vec = np.zeros(W2V_DIM)
    cnt_words = 0
    for word in sent:
        if word in w2v_vocab:
            sent_vec += w2v_model.wv[word]
            cnt_words += 1
    #A tweet with no in-vocabulary word keeps the all-zero vector
    if cnt_words != 0:
        sent_vec /= cnt_words
    listof_sent_vec.append(sent_vec)

Label = data_train["emotion"]
list_col=tuple(range(W2V_DIM))
W2v_data_train=pd.DataFrame(data=listof_sent_vec, columns=list_col)
W2v_data_train["emotion"] = Label
print(W2v_data_train.head(10))
print(W2v_data_train.shape)

100%|████████████████████████████████████████████████████████████████████████████| 2840/2840 [00:00<00:00, 5020.08it/s]


          0         1         2         3         4         5         6  \
0  0.010851 -0.003688 -0.001901 -0.011896  0.003534 -0.009506 -0.012363   
1  0.016236 -0.005182 -0.004301 -0.019407  0.005657 -0.015649 -0.019823   
2  0.008532 -0.003147 -0.001670 -0.010332  0.003441 -0.008199 -0.010699   
3  0.009729 -0.003736 -0.002111 -0.011186  0.003618 -0.008811 -0.011629   
4  0.014149 -0.005358 -0.003964 -0.016960  0.005082 -0.013828 -0.016283   
5  0.008384 -0.003335 -0.002157 -0.011122  0.003437 -0.009354 -0.009939   
6  0.020037 -0.007843 -0.004988 -0.024805  0.007998 -0.019729 -0.024252   
7  0.011678 -0.004402 -0.002129 -0.014377  0.004995 -0.011226 -0.013769   
8  0.021942 -0.008933 -0.005779 -0.027797  0.007854 -0.022617 -0.026581   
9  0.011288 -0.004778 -0.002088 -0.013435  0.004394 -0.010876 -0.012735   

          7         8         9  ...       291       292       293       294  \
0 -0.003398  0.006141 -0.006827  ... -0.006243 -0.003452 -0.003920 -0.019083   
1 -0.005349  0

In [123]:
def average_word_vectors(sentences, model, vocab, dim=300):
    """Return one averaged word vector per tokenised sentence.

    Parameters
    ----------
    sentences : iterable of list of str
        Tokenised sentences.
    model : trained Word2Vec model
        Word vectors are read from ``model.wv[word]``.
    vocab : set of str
        Words that have a vector; any other word is skipped.
    dim : int
        Length of a word vector.

    Returns
    -------
    list of numpy.ndarray
        For each sentence the mean of the vectors of its in-vocabulary words,
        or the all-zero vector when none of its words is in ``vocab``.
    """
    vectors = []
    for sent in tqdm(sentences):
        sent_vec = np.zeros(dim)
        cnt_words = 0
        for word in sent:
            if word in vocab:
                sent_vec += model.wv[word]
                cnt_words += 1
        if cnt_words != 0:
            sent_vec /= cnt_words
        vectors.append(sent_vec)
    return vectors


#Set copy of the trained vocabulary: constant-time membership tests instead of
#scanning the w2v_words list for every token
w2v_lookup = set(w2v_words)
list_col=tuple(range(300))

#Vectorize Cross Validate
list_of_sent= data_cv["text"]
listof_sent_vec = average_word_vectors(list_of_sent, w2v_model, w2v_lookup)

Label = data_cv["emotion"]
W2v_data_cv=pd.DataFrame(data=listof_sent_vec, columns=list_col)
W2v_data_cv["emotion"] = Label
print(W2v_data_cv.head(10))
print(W2v_data_cv.shape)


#test
list_of_sent= data_test["text"]
listof_sent_vec = average_word_vectors(list_of_sent, w2v_model, w2v_lookup)

Label = data_test["emotion"]
W2v_data_test=pd.DataFrame(data=listof_sent_vec, columns=list_col)
W2v_data_test["emotion"] = Label
print(W2v_data_test.head(10))
print(W2v_data_test.shape)

100%|██████████████████████████████████████████████████████████████████████████████| 296/296 [00:00<00:00, 5284.63it/s]


          0         1         2         3         4         5         6  \
0  0.007539 -0.001739 -0.002184 -0.010123  0.002469 -0.007762 -0.010161   
1  0.013271 -0.004445 -0.003466 -0.015164  0.004492 -0.012766 -0.015528   
2  0.016365 -0.005355 -0.003755 -0.020174  0.005800 -0.015246 -0.018939   
3  0.021701 -0.007591 -0.005601 -0.025561  0.007609 -0.021019 -0.025437   
4  0.023080 -0.008594 -0.005644 -0.028017  0.008958 -0.021951 -0.027621   
5  0.023248 -0.008580 -0.005676 -0.028127  0.008905 -0.022185 -0.027888   
6  0.016402 -0.006142 -0.004028 -0.019758  0.005558 -0.016065 -0.019404   
7  0.013988 -0.004883 -0.002979 -0.016873  0.005327 -0.014061 -0.017099   
8  0.014440 -0.005219 -0.003109 -0.017067  0.005494 -0.014584 -0.017756   
9  0.016108 -0.005761 -0.003019 -0.018608  0.005366 -0.016218 -0.019776   

          7         8         9  ...       291       292       293       294  \
0 -0.003265  0.004782 -0.005060  ... -0.006096 -0.002846 -0.002391 -0.016000   
1 -0.004029  0

100%|██████████████████████████████████████████████████████████████████████████████| 998/998 [00:00<00:00, 5431.05it/s]


          0         1         2         3         4         5         6  \
0  0.016952 -0.005859 -0.003993 -0.020428  0.005976 -0.016370 -0.019936   
1  0.010692 -0.003514 -0.001827 -0.013096  0.005031 -0.010432 -0.013138   
2  0.011080 -0.003557 -0.002226 -0.013910  0.005364 -0.010874 -0.014093   
3  0.008669 -0.003210 -0.002274 -0.011089  0.002638 -0.008562 -0.010944   
4  0.018090 -0.006539 -0.005328 -0.022068  0.006712 -0.017034 -0.022339   
5  0.005584 -0.003517 -0.000309 -0.007823  0.002723 -0.006999 -0.007717   
6  0.015865 -0.006010 -0.004033 -0.019874  0.007076 -0.014708 -0.020321   
7  0.014795 -0.005594 -0.004093 -0.019207  0.006835 -0.014104 -0.019387   
8  0.016857 -0.005269 -0.004303 -0.020720  0.006373 -0.015609 -0.018732   
9  0.016815 -0.006123 -0.003878 -0.021644  0.007037 -0.017383 -0.020802   

          7         8         9  ...       291       292       293       294  \
0 -0.005693  0.010089 -0.010066  ... -0.011036 -0.006037 -0.005523 -0.031348   
1 -0.003879  0

In [124]:
# Split each Word2Vec frame into a feature matrix X and a label vector y
X_train_w2v = W2v_data_train.drop("emotion",axis=1).to_numpy()
y_train_w2v = W2v_data_train["emotion"].to_numpy()

X_cv_w2v = W2v_data_cv.drop("emotion",axis=1).to_numpy()
y_cv_w2v = W2v_data_cv["emotion"].to_numpy()

X_test_w2v = W2v_data_test.drop("emotion",axis=1).to_numpy()
y_test_w2v = W2v_data_test["emotion"].to_numpy()

# Report the arrays created above. The previous prints used X_train / y_train
# etc., names this notebook never defines, so they fail on a fresh kernel.
print("Final shape of Train X and y for word2vec:",X_train_w2v.shape,y_train_w2v.shape)
print("Final shape of CV X and y for word2vec:",X_cv_w2v.shape,y_cv_w2v.shape)
print("Final shape of Test X and y for word2vec:",X_test_w2v.shape,y_test_w2v.shape)

Final shape of Train X and y for word2vec: (2840, 300) (2840,)
Final shape of CV X and y for word2vec: (296, 300) (296,)
Final shape of Test X and y for word2vec: (998, 300) (998,)


## Text Vectorization (Bag of Words)

In [125]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

#Bag of words over unigrams and bigrams; the vocabulary is fitted on the
#training text only and then reused for every split
bow_model = CountVectorizer(ngram_range=(1,2))
bow_model.fit(data_train_bow["text"])


def _bow_features(frame):
    """Return (normalised CSR count matrix, label array) for one *_bow frame."""
    counts = bow_model.transform(frame["text"])
    features = normalize(counts).tocsr()
    labels = frame["emotion"].to_numpy()
    return features, labels


X_train_bow, y_train_bow = _bow_features(data_train_bow)
X_cv_bow, y_cv_bow = _bow_features(data_cv_bow)
X_test_bow, y_test_bow = _bow_features(data_test_bow)

for split_name, features, labels in (
    ("Train", X_train_bow, y_train_bow),
    ("CV", X_cv_bow, y_cv_bow),
    ("Test", X_test_bow, y_test_bow),
):
    print("Final shape of " + split_name + " X and y for bag of words:",features.shape,labels.shape)

Final shape of Train X and y for bag of words: (2840, 24681) (2840,)
Final shape of CV X and y for bag of words: (296, 24681) (296,)
Final shape of Test X and y for bag of words: (998, 24681) (998,)
