Created By Anshul Agarwal

In [1]:
! pip install kaggle



In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Importing Twitter Sentiment Dataset

In [8]:
#Uploading the Dataset file through kaggle API key
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [9]:
#Extracting the compressed dataset
from zipfile import ZipFile
file_name = "sentiment140.zip"

with ZipFile(file_name,'r') as zip:
  zip.extractall()
  print("Done")

Done


Importing the Dependencies

In [10]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
#printing stopwords in English
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data Preprocessing

In [13]:
twitter_data = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding = 'ISO-8859-1')

In [14]:
twitter_data.shape

(1599999, 6)

In [17]:
#printing the first 5 rows of the dataset
twitter_data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [18]:
#naming the columns and reading the dataset again.
twitter_data.columns = ['target','id','date','flag','user','tweet']
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [23]:
#counting the number of missing values in the dataset
twitter_data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
tweet     0
dtype: int64

In [24]:
# Checking the distribution of target columns
twitter_data['target'].value_counts()

target
4    800000
0    799999
Name: count, dtype: int64

Convert the target '4' to '1'

In [25]:
twitter_data.replace({'target' : {4:1}},inplace = True)

In [26]:
twitter_data['target'].value_counts()

target
1    800000
0    799999
Name: count, dtype: int64

0 --> Negative Tweet
1 --> Positive Tweet

Stemming

In [27]:
port_stem = PorterStemmer()

In [28]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ',content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content


In [30]:
twitter_data['stemmed_content'] = twitter_data['tweet'].apply(stemming)

In [31]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,tweet,stemmed_content
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew,kwesidei whole crew


In [32]:
#Separating the data and the label
X = twitter_data['stemmed_content'].values
Y = twitter_data['target'].values

In [33]:
print(X)

['upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound'
 'whole bodi feel itchi like fire' ... 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [34]:
print(Y)

[0 0 0 ... 1 1 1]


Splitting the data to training data and the test data

In [35]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 3)

In [37]:
print(X.shape,X_train.shape,X_test.shape)

(1599999,) (1279999,) (320000,)


In [38]:
print(X_train)

['sooo tire late night head pancak plu brekki everyon tho'
 'enjoy taco weekend watch basebal'
 'tayari wait watch video blog start wade still feel shi' ...
 'silknpearl thank tell go tri w tweetchat next time mayb make thing easier lol everyon nice'
 'annabeljad plane come pull'
 'rememb beauti moment bff moment crazi funni hahha']


In [39]:
print(X_test)

['make steak shrimp feast neighbor dessert mean bring dessert'
 'home sick mum got mad didnt pick phone call littl bit troubl lost phone'
 'lot hard research ive realis want stuff want im go need get job' ...
 'use dial connect long time dial suck' 'cecilguy use livejourn'
 'thebrandicyru http twitpic com iqt love billi rayi xx']


In [40]:
#converting the textual data to numerical data
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [41]:
print(X_train)

  (0, 406449)	0.2903970300514764
  (0, 124643)	0.25593968519022214
  (0, 51232)	0.48614030196989005
  (0, 318641)	0.33966243103215293
  (0, 307472)	0.3931859411979829
  (0, 161592)	0.260416459730594
  (0, 290715)	0.21147853996941665
  (0, 227848)	0.27496799754633616
  (0, 409622)	0.25460122559249926
  (0, 375898)	0.3014916396157007
  (1, 33790)	0.5606192994583294
  (1, 436314)	0.306260437268491
  (1, 437920)	0.35897146437044336
  (1, 393162)	0.5633260247079434
  (1, 120354)	0.3817248004694607
  (2, 364206)	0.38808043175298074
  (2, 130138)	0.18329989312492306
  (2, 384220)	0.18941455194150827
  (2, 434738)	0.43660461258564776
  (2, 381703)	0.21785153404038107
  (2, 45160)	0.2701889852558651
  (2, 431215)	0.2545714397697154
  (2, 435063)	0.2016140078084342
  (2, 395939)	0.5721509723878
  (2, 436314)	0.18979021976881275
  :	:
  (1279996, 367631)	0.4604123581144623
  (1279996, 112811)	0.32023725059692126
  (1279996, 398213)	0.2265526356732871
  (1279996, 405829)	0.19669725292137827
  (127

In [42]:
print(X_test)

  (0, 382121)	0.31644006355141435
  (0, 366165)	0.34760842326956387
  (0, 286687)	0.30258148089124776
  (0, 258284)	0.20719370893747435
  (0, 248981)	0.1622724768855546
  (0, 129875)	0.3628523791225507
  (0, 98688)	0.6568524644133005
  (0, 52385)	0.23473371904079524
  (1, 416714)	0.3248199702285594
  (1, 366841)	0.23370861037801047
  (1, 315557)	0.28249435433562387
  (1, 314867)	0.4778798318364711
  (1, 279299)	0.2984308182085052
  (1, 247025)	0.2888833689587772
  (1, 241298)	0.24741413801366802
  (1, 237081)	0.23111044201392344
  (1, 168471)	0.19521254806775332
  (1, 151679)	0.17797038394954415
  (1, 100592)	0.2702474935999385
  (1, 59002)	0.23519115846242028
  (1, 42961)	0.2154756296003259
  (2, 435750)	0.39887305415328395
  (2, 386202)	0.27754711079328476
  (2, 338108)	0.39001622245694134
  (2, 334458)	0.3695222937517813
  :	:
  (319995, 131797)	0.3769686651516573
  (319995, 12414)	0.30557236496732354
  (319996, 436314)	0.477019557110337
  (319996, 409080)	0.4964784475885718
  (3199

Training the Machine Learning Model

Logistic Regression

In [43]:
model = LogisticRegression(max_iter = 1000)

In [44]:
model.fit(X_train,Y_train)

Model Evaluation

In [45]:
#accuracy score of the training data
X_train_predicion = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_predicion,Y_train)

In [46]:
print('Accuracy score on the training data : ', training_data_accuracy)

Accuracy score on the training data :  0.810261570516852


In [47]:
#accuracy score of the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction,Y_test)

In [48]:
print('Accuracy score on the test data : ', test_data_accuracy)

Accuracy score on the test data :  0.777903125


Model Accuracy = 77.79%

Saving the trained model

In [49]:
import pickle

In [50]:
file_name = 'trained_model.sav'
pickle.dump(model,open(file_name,'wb'))

In [51]:
#loading the saved model
loaded_model = pickle.load(open('trained_model.sav','rb'))

In [52]:
X_new = X_test[200]
print(Y_test[200])
prediction = loaded_model.predict(X_new)
print(prediction)

0
[0]


Thank You!