In [1]:
#Importing Libraries
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
#Connecting to drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Setup for kaggle
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
import os
#Downloading kaggle dataset
os.environ['KAGGLE_USERNAME'] = 'vamsianem' #Replace with your details
os.environ["key"]= 'e82997bbf028ff93d5ecd6f25cd8e2b9'
#api for the dataset
!kaggle datasets download kazanova/sentiment140

Downloading sentiment140.zip to /content
 90% 73.0M/80.9M [00:00<00:00, 112MB/s] 
100% 80.9M/80.9M [00:00<00:00, 101MB/s]


In [None]:
!unzip sentiment140.zip

Archive:  sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [None]:
path = 'training.1600000.processed.noemoticon.csv'

df = pd.read_csv(path, header=None, encoding='ISO-8859-1',  #Data is not in utf-8 encoding
                   names=['target', 'id', 'date', 'flag', 'user', 'text'])
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
from sklearn.model_selection import train_test_split

_, data = train_test_split(df, test_size=5000, random_state=42, stratify = df['target'].values, shuffle = True) #Taking only part of the data to reduce computation time

In [None]:
data.head()

Unnamed: 0,target,id,date,flag,user,text
776997,0,2322302594,Wed Jun 24 22:23:00 PDT 2009,NO_QUERY,RandomHajile2,@MariahCarey can i ask... whats the best thing...
477095,0,2178072207,Mon Jun 15 07:09:10 PDT 2009,NO_QUERY,dayday0012,going to breakfast for the last time in connet...
1472812,4,2065444779,Sun Jun 07 08:36:34 PDT 2009,NO_QUERY,JensMessyStudio,Heading to the in-law's for BBQ and swimming! ...
1071181,4,1966284469,Fri May 29 17:28:50 PDT 2009,NO_QUERY,MadiUhart,#myweakness My boyfriend. love him to death. T...
211837,0,1974589348,Sat May 30 13:26:36 PDT 2009,NO_QUERY,joeloverton,@lkue sucks. wanna get drunk?


In [None]:
#Resetting index
data = data.reset_index()

In [None]:
#Drop unwanted rows
data.drop(columns=['index','id','date','flag','user'],inplace=True)

( 0 = negative,  4 = positive )

In [None]:
#Checking for balance
data['target'].value_counts()

0    2500
4    2500
Name: target, dtype: int64

In [None]:
#Replacing 4 with 1
data['target'] = data['target'].replace(4,1)

In [None]:
data.tail(10)

Unnamed: 0,target,text
4990,0,"Made it to the studio, but could not work on m..."
4991,0,"For some reason my twitterena, myspace and fac..."
4992,0,@TFL_Swadley I still have to.
4993,1,Waving goodbye to Athens with much thanks to m...
4994,1,"@MommyMellie Yep, that's me!"
4995,1,@natobasso I know! I do a few every day with s...
4996,1,http://twitpic.com/6gonw - @daniellebabeyy pho...
4997,1,@jaboc haha that's nice of you
4998,1,@dancerr08 errrrrr i'm already confused. hahah...
4999,1,@hellobejoy you make me smile


#Cleaning the text

*   Remove URLs, mentions, hashtags
*   Remove Stopwords --> am, the, and etc.
*   Lemmatise --> EX: ate -> eat, better -> good etc.

##Using NLTK library

In [12]:
import re
import string

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [14]:
#Downloading Dependencies
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
def nltk_cleaning(text):
    #Using Regular Expressions method to remove URLs, Mentions and Hashtags
    text = re.sub(r'http\S+', '', text) # Remove URLs
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # Remove mentions
    text = re.sub(r'#[A-Za-z0-9]+', '', text) # Remove hashtags
    text = re.sub(r'[^a-zA-z0-9\s]','',text) # Remove non alphabets

    # Lemmatize, and remove stop words using NLTK library
    clean_text = ' '.join([lemmatizer.lemmatize(word.lower()) for word in text.split() if word.lower() not in stop_words]) 
    return clean_text

In [None]:
#Testing nltk cleaning on a sample
text = data['text'].iloc[14]
print(text)
nltk_cleaning(text)

zzzzzzzzZZZzz my poor mac  sigh. well on the brighter side, been in salamanca for a week now and pretty much loving it. viva la vida !


'zzzzzzzzzzzzz poor mac sigh well brighter side salamanca week pretty much loving viva la vida'

In [None]:
#Cleaning - NLTK with progess bar
tqdm.pandas()
data['nltk_clean_text'] = data['text'].progress_apply(nltk_cleaning)

  0%|          | 0/5000 [00:00<?, ?it/s]

In [None]:
data.head()

Unnamed: 0,target,text,nltk_clean_text
0,0,@MariahCarey can i ask... whats the best thing...,ask whats best thing tu say tu girl u like lik...
1,0,going to breakfast for the last time in connet...,going breakfast last time conneticut
2,1,Heading to the in-law's for BBQ and swimming! ...,heading inlaws bbq swimming inlaw gone till 18th
3,1,#myweakness My boyfriend. love him to death. T...,boyfriend love death man get want
4,0,@lkue sucks. wanna get drunk?,suck wanna get drunk


##Using SpaCy library

In [15]:
import spacy

# loading spacy small model
nlp = spacy.load('en_core_web_sm')

In [16]:
def spacy_cleaning(text):
    #Using Regular Expressions method to remove URLs, Mentions and Hashtags
    text = re.sub(r'http\S+', '', text) # Remove URLs
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # Remove mentions
    text = re.sub(r'#[A-Za-z]+', '', text) # Remove hashtags
    text = re.sub(r'[^a-zA-z\s]','',text) # Remove non alphabets
    # Lemmatize, and remove stop words using SpaCy library
    doc = nlp(text)
    clean_tokens = []
    for token in doc:
        if not token.is_stop and not token.is_punct and not token.is_space: #Removing Stopwords and Punctuaions 
            clean_tokens.append(token.lemma_.lower()) #Lemmatizing
    return ' '.join(clean_tokens)

In [17]:
#Testing spacy cleaning on a sample
text = data['text'].iloc[14]
print(text)
spacy_cleaning(text)

zzzzzzzzZZZzz my poor mac  sigh. well on the brighter side, been in salamanca for a week now and pretty much loving it. viva la vida !


'zzzzzzzzzzzzz poor mac sigh bright salamanca week pretty love viva la vida'

In [18]:
#Cleaning - Spacy with progress bar
tqdm.pandas()
data['spacy_clean_text'] = data['text'].progress_apply(spacy_cleaning)

  0%|          | 0/5000 [00:00<?, ?it/s]

In [19]:
data.head()

Unnamed: 0,target,text,nltk_clean_text,spacy_clean_text
0,0,@MariahCarey can i ask... whats the best thing...,ask whats best thing tu say tu girl u like lik...,ask s good thing tu tu girl u like like need h...
1,0,going to breakfast for the last time in connet...,going breakfast last time conneticut,go breakfast time conneticut
2,1,Heading to the in-law's for BBQ and swimming! ...,heading inlaws bbq swimming inlaw gone till 18th,head inlaw bbq swimming inlaw go till th
3,1,#myweakness My boyfriend. love him to death. T...,boyfriend love death man get want,boyfriend love death man get want
4,0,@lkue sucks. wanna get drunk?,suck wanna get drunk,suck wanna drunk


In [20]:
data.to_csv('/content/drive/MyDrive/sentiment140_cleanned_5k.csv', index=False)

#Using Vader Sentiment Analyzer on Spacy cleanned text

In [3]:
data = pd.read_csv('/content/drive/MyDrive/sentiment140_cleanned_5k.csv')

In [None]:
!pip install vaderSentiment

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sentiment = SentimentIntensityAnalyzer()

In [None]:
import random
n = random.randint(0, len(data['spacy_clean_text']))

text = data['spacy_clean_text'].iloc[n]

sent = sentiment.polarity_scores(text)
print(text,'\n',sent,'\t True: ', 'pos' if data['target'][n] else 'neg')

spend weekend aliyah nightmare poor thing 
 {'neg': 0.383, 'neu': 0.617, 'pos': 0.0, 'compound': -0.4767} 	 True:  neg


In [None]:
y_true = data['target'].values
vader_pred = np.ones(len(y_true)) #Setting all predictions to positive
length = len(data)

In [None]:
#Getting Negatives using vader sentiment polarity score
for i,text in tqdm(enumerate(data['spacy_clean_text'].values), total = length):
  sent = sentiment.polarity_scores(text)
  if (sent['compound']<0):                  #Changing the negative predictions only
     vader_pred[i] = 0

  0%|          | 0/5000 [00:00<?, ?it/s]

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_true,vader_pred))

              precision    recall  f1-score   support

           0       0.79      0.38      0.52      2500
           1       0.59      0.90      0.71      2500

    accuracy                           0.64      5000
   macro avg       0.69      0.64      0.62      5000
weighted avg       0.69      0.64      0.62      5000



#Bag of Words Vectorization-Based Model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)

In [None]:
text_counts = cv.fit_transform(data['spacy_clean_text'])  #Creating bag-of-words and also vectorising



In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, data['target'], test_size=0.25, random_state=5,stratify= data['target'])

##Multinomial Naive Bayes classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

In [None]:
nb_pred = MNB.predict(X_test)

print(classification_report(Y_test,nb_pred))

              precision    recall  f1-score   support

           0       0.71      0.71      0.71       625
           1       0.71      0.71      0.71       625

    accuracy                           0.71      1250
   macro avg       0.71      0.71      0.71      1250
weighted avg       0.71      0.71      0.71      1250



Better Results than Vader Sentiment Polariser

##XGBoost Classifier

In [39]:
from xgboost import XGBClassifier

xgb = XGBClassifier()

In [None]:
xgb.fit(X_train,Y_train)

In [None]:
xgb_pred = xgb.predict(X_test)

print(classification_report(Y_test,xgb_pred))

              precision    recall  f1-score   support

           0       0.75      0.61      0.67       625
           1       0.67      0.80      0.73       625

    accuracy                           0.70      1250
   macro avg       0.71      0.70      0.70      1250
weighted avg       0.71      0.70      0.70      1250



#Using Word Embedding

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [5]:
# Filter out float values and corresponding targets
text_values = []
targets = []

for text, target in zip(data['spacy_clean_text'].values, data['target'].values):
    if isinstance(text, str):
        text_values.append(str(text))
        targets.append(target)

In [6]:
# Generate embeddings using keras tokenizer
tokenizer = Tokenizer(num_words=500, split=' ')
tokenizer.fit_on_texts(text_values)
X = tokenizer.texts_to_sequences(text_values)
X = pad_sequences(X)
Y = np.array(targets)

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.25, random_state=5,stratify= Y)

###XGBoost Classifier

In [None]:
xgb_embed = XGBClassifier()

xgb_embed.fit(X_train,Y_train)

In [42]:
xgb_embed_pred = xgb_embed.predict(X_test)

print(classification_report(Y_test,xgb_embed_pred))

              precision    recall  f1-score   support

           0       0.62      0.57      0.59       621
           1       0.60      0.64      0.62       622

    accuracy                           0.61      1243
   macro avg       0.61      0.61      0.61      1243
weighted avg       0.61      0.61      0.61      1243



###Using custom model with LSTM

In [10]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout

In [11]:
#Custom Model
model = Sequential()
model.add(Embedding(500, 120, input_length = X_train.shape[1]))
model.add(Dropout(0.2))
model.add(LSTM(512, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(128, activation='LeakyReLU'))
model.add(Dense(2, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 16, 120)           60000     
                                                                 
 dropout (Dropout)           (None, 16, 120)           0         
                                                                 
 lstm (LSTM)                 (None, 512)               1296384   
                                                                 
 dense (Dense)               (None, 128)               65664     
                                                                 
 dense_1 (Dense)             (None, 2)                 258       
                                                                 
Total params: 1,422,306
Trainable params: 1,422,306
Non-trainable params: 0
_________________________________________________________________


In [12]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

early_stop = EarlyStopping(monitor ='val_accuracy',patience = 10,verbose= 1)
checkpoint = ModelCheckpoint('sentiment140_model.h5', monitor='val_accuracy', save_best_only=True)

callbacks = [early_stop,checkpoint]

In [22]:
from keras.utils import to_categorical

Y_train = to_categorical(Y_train)
Y_test = to_categorical(Y_test)

In [14]:
model.fit(X_train,Y_train, batch_size = 64, epochs = 50, validation_data=(X_test, Y_test),callbacks = callbacks)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 13: early stopping


<keras.callbacks.History at 0x7fe8bc360c10>

In [None]:
from keras.models import load_model
loaded_model = load_model('sentiment140_model.h5')

In [25]:
custom_model_pred = np.argmax(model.predict(X_test),axis=1)

from sklearn.metrics import accuracy_score
print('Accuracy: ',accuracy_score(custom_model_pred,np.argmax(Y_test,axis=1)))

Accuracy:  0.6613032984714401


#Using Roberta Pretrained Model

In [None]:
!pip install transformers

In [48]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [49]:
roberta = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(roberta)
model = AutoModelForSequenceClassification.from_pretrained(roberta)

Downloading (…)lve/main/config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [50]:
sentiments = ['negative','neutral','positive']

In [55]:
text = text_values[6]
print(text)
encoded_text = tokenizer(text, return_tensors='pt') #encoding the text
output = model(**encoded_text)  #getting output from roberta model
scores = output[0][0].detach().numpy()
result_index = np.argmax(scores)
result = sentiments[result_index]
print(result,'-->', scores[result_index])

bad net issue weds not broadcast tonight tho tune new anthem bad mixing
negative --> 2.2070968


In [52]:
def roberta_res(text):
  encoded_text = tokenizer(text, return_tensors='pt') #encoding the text
  output = model(**encoded_text)  #getting output from roberta model
  scores = output[0][0].detach().numpy()
  result_index = np.argmax(scores)
  result = 1 if (result_index>0) else 0
  return result

In [57]:
roberta_results = []
for texts in tqdm(text_values):
  roberta_results.append(roberta_res(texts))

  0%|          | 0/4972 [00:00<?, ?it/s]

In [58]:
print(classification_report(targets,roberta_results))

              precision    recall  f1-score   support

           0       0.84      0.37      0.51      2485
           1       0.60      0.93      0.73      2487

    accuracy                           0.65      4972
   macro avg       0.72      0.65      0.62      4972
weighted avg       0.72      0.65      0.62      4972



#Using Transformers pipeline

In [59]:
from transformers import pipeline

sent_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [63]:
print(text_values[6],'\n',sent_pipeline(text_values[6]))

bad net issue weds not broadcast tonight tho tune new anthem bad mixing 
 [{'label': 'NEGATIVE', 'score': 0.9993546605110168}]


In [69]:
transformers_results = []
for texts in tqdm(text_values):
  result_ = sent_pipeline(texts)
  if(result_[0]['label'] == 'POSITIVE'):
    result = 1 
  elif(result_[0]['label'] == 'NEGATIVE'):
    result = 0
  transformers_results.append(result)

  0%|          | 0/4972 [00:00<?, ?it/s]

In [71]:
print(classification_report(targets,transformers_results))

              precision    recall  f1-score   support

           0       0.65      0.75      0.70      2485
           1       0.70      0.60      0.65      2487

    accuracy                           0.67      4972
   macro avg       0.68      0.67      0.67      4972
weighted avg       0.68      0.67      0.67      4972



From all of the models, Multinomial Naive Bayes classifier is best for a custom data as it got 71% accuracy. But pretrained models also can be used, the only drawback is that they are not trained on our own data. They were trained on different websources and books.