In [6]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\raina\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\raina\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Printing the stop words in english
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
# Data processing 
# Loading Data from csv file to pandas dataframe
twitter_data = pd.read_csv('twitter.csv',encoding='ISO-8859-1')

In [9]:
# Checking the dataframe
twitter_data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [10]:
twitter_data.tail()   # optional 

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1599994,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599995,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599998,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [11]:
# Naming the columns and re-reading the dataset
column_names=['target','id','date','flag','user','text']
twitter_data = pd.read_csv('twitter.csv',names = column_names,encoding = 'ISO-8859-1')


In [12]:
# Rechecking the Dataframe that was just modified
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [13]:
# Checking for Missing Values i.e text in the dataframe 
twitter_data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [14]:
# Checking the distribution of target column
twitter_data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [15]:
# Converting label (4 --> 1)
twitter_data.replace({'target':{4:1}},inplace = True)

In [16]:
# Checking the distribution of target column after conversion
twitter_data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [17]:
# 0 --> Negative Tweet
# 1 --> Positive Tweet
# Either use stemming or lamentization 
lem = WordNetLemmatizer()

In [18]:
def lament(content):
    lem_content = re.sub('[^a-zA-Z]','',content)
    lem_content = lem_content.lower()
    lem_content = lem_content.split()
    lem_content = [lem.lemmatize(word) for word in lem_content if not word in stopwords.words('english')]
    lem_content = ''.join(lem_content)

    return lem_content
    

In [19]:
twitter_data['lem_content']=twitter_data['text'].apply(lament)

In [20]:
# Rechecking the Dataframe that was just modified after applying lamentization
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,lem_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoothttptwitpiccomyzlawwwthatsabummeryou...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,isupsetthathecantupdatehisfacebookbytextingita...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichanidivedmanytimesfortheballmanagedtosave...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,mywholebodyfeelsitchyandlikeitsonfire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclassnoitsnotbehavingatallimmadwhyam...


In [21]:
print(twitter_data['lem_content'])

0          switchfoothttptwitpiccomyzlawwwthatsabummeryou...
1          isupsetthathecantupdatehisfacebookbytextingita...
2          kenichanidivedmanytimesfortheballmanagedtosave...
3                      mywholebodyfeelsitchyandlikeitsonfire
4          nationwideclassnoitsnotbehavingatallimmadwhyam...
                                 ...                        
1599995         justwokeuphavingnoschoolisthebestfeelingever
1599996    thewdbcomverycooltohearoldwaltinterviewshttpbl...
1599997        areyoureadyforyourmojomakeoveraskmefordetails
1599998     happythbirthdaytomybooofallltimetupacamarushakur
1599999    happycharitytuesdaythenspccsparkscharityspeaki...
Name: lem_content, Length: 1600000, dtype: object


In [22]:
print(twitter_data['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


In [23]:
# Separating the data and label
x = twitter_data['lem_content'].values
y = twitter_data['target'].values

In [24]:
print(x)

['switchfoothttptwitpiccomyzlawwwthatsabummeryoushouldagotdavidcarrofthirddaytodoitd'
 'isupsetthathecantupdatehisfacebookbytextingitandmightcryasaresultschooltodayalsoblah'
 'kenichanidivedmanytimesfortheballmanagedtosavetherestgooutofbounds' ...
 'areyoureadyforyourmojomakeoveraskmefordetails'
 'happythbirthdaytomybooofallltimetupacamarushakur'
 'happycharitytuesdaythenspccsparkscharityspeakinguphh']


In [25]:
print(y)

[0 0 0 ... 1 1 1]


In [26]:
# Model Building


In [27]:
print(x.shape,xtrain.shape,xtest.shape)

(1600000,) (1280000,) (320000,)


In [28]:
print(xtrain)

['abouttowatchsawivanddrinkalilwine' 'hatermagazineimin'
 'eventhoughitsmyfavouritedrinkithinkitsthevodkaandcokethatwipesmymindallthetimethinkimgonnahavetofindanewdrink'
 ... 'iseagerformondayafternoon'
 'hopeeveryoneandtheirmotherhadagreatdaycantwaittohearwhattheguyshaveinstoretomorrow'
 'ilovewakinguptofolgerstoobadmyvoicewasdeeperthanhis']


In [29]:
print(xtest)

['mmangenmdoingfineihaventhadmuchtimetochatontwitterhubbyisbackforthesummeramptendstodominatemyfreetime'
 'atahsmayshowwruthkimampgeoffreysanhueza'
 'ishataramaybeitwasonlyabayareathangdammit' ...
 'destinineverthelesshoorayformembersandhaveawonderfulandsafetrip'
 'notfeelingtoowell' 'supersandrothankyou']


In [30]:
# Conversion of text data into numeric data
vectorizer = TfidfVectorizer()
xtrain = vectorizer.fit_transform(xtrain)
xtest =  vectorizer.transform(xtest)

In [31]:
print(xtrain)    # converted into numeric

  (0, 4269)	1.0
  (1, 370562)	1.0
  (2, 259916)	1.0
  (3, 544814)	1.0
  (4, 1125950)	1.0
  (5, 183114)	1.0
  (6, 656106)	1.0
  (7, 428071)	1.0
  (8, 372380)	1.0
  (9, 414470)	1.0
  (10, 453322)	1.0
  (11, 946538)	1.0
  (12, 1165544)	1.0
  (13, 334964)	1.0
  (14, 1029783)	1.0
  (15, 1251675)	1.0
  (16, 26694)	1.0
  (17, 620801)	1.0
  (18, 371140)	1.0
  (19, 166411)	1.0
  (20, 939887)	1.0
  (21, 392852)	1.0
  (22, 288433)	1.0
  (23, 202719)	1.0
  (24, 1253758)	1.0
  :	:
  (1279975, 97258)	1.0
  (1279976, 1111031)	1.0
  (1279977, 939195)	1.0
  (1279978, 783668)	1.0
  (1279979, 801133)	1.0
  (1279980, 865634)	1.0
  (1279981, 587527)	1.0
  (1279982, 953203)	1.0
  (1279983, 412932)	1.0
  (1279984, 81692)	1.0
  (1279985, 59068)	1.0
  (1279986, 1016205)	1.0
  (1279987, 777433)	1.0
  (1279988, 1218987)	1.0
  (1279989, 451086)	1.0
  (1279990, 465270)	1.0
  (1279991, 249260)	1.0
  (1279992, 1170111)	1.0
  (1279993, 1170401)	1.0
  (1279994, 75562)	1.0
  (1279995, 1107243)	1.0
  (1279996, 561267)	1

In [32]:
print(xtest)   # converted into numeric

  (22, 137733)	1.0
  (36, 790681)	1.0
  (55, 1232354)	1.0
  (104, 806419)	1.0
  (316, 675475)	1.0
  (317, 183140)	1.0
  (326, 531051)	1.0
  (343, 483385)	1.0
  (356, 1005018)	1.0
  (405, 294087)	1.0
  (411, 755018)	1.0
  (412, 671949)	1.0
  (503, 922958)	1.0
  (575, 484502)	1.0
  (576, 68319)	1.0
  (585, 553890)	1.0
  (597, 438098)	1.0
  (704, 437582)	1.0
  (730, 522670)	1.0
  (735, 1190743)	1.0
  (763, 201569)	1.0
  (851, 230689)	1.0
  (887, 945557)	1.0
  (905, 1081015)	1.0
  (965, 64116)	1.0
  :	:
  (319388, 469806)	1.0
  (319391, 363506)	1.0
  (319401, 69395)	1.0
  (319403, 1010401)	1.0
  (319419, 435926)	1.0
  (319442, 1154265)	1.0
  (319462, 332054)	1.0
  (319474, 363313)	1.0
  (319550, 1020416)	1.0
  (319573, 1087930)	1.0
  (319595, 377229)	1.0
  (319630, 416342)	1.0
  (319632, 324633)	1.0
  (319640, 333875)	1.0
  (319649, 564948)	1.0
  (319673, 474037)	1.0
  (319679, 363526)	1.0
  (319687, 874538)	1.0
  (319700, 937918)	1.0
  (319712, 230625)	1.0
  (319779, 553637)	1.0
  (319838

In [33]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error

In [34]:
# Training the Model --> 
model = Sequential()
model.add(Dense(128,activation='relu',input_dim = xtrain.shape[1]))
model.add(Dense(64,activation='relu'))
model.add(Dense(1,activation='linear'))

optimizer= Adam(learning_rate=0.001)
model.compile(optimizer = optimizer,loss="mean_squared_error",metrics =['accuracy'])

In [None]:
model.fit(xtrain,ytrain,epochs=10)



    4/40000 [..............................] - ETA: 710:51:22 - loss: 0.5032 - accuracy: 0.4844

In [60]:
# Model Evaluation
# Accuracy Score on basis of training data
xtrain_prediction = model.predict(xtrain)
training_accuracy = accuracy_score(ytrain,xtrain_prediction)

In [61]:
print('Accuracy score is:',training_accuracy)

Accuracy score is: 0.9981390625


In [62]:
# Accuracy Score on basis of test data
xtest_prediction = model.predict(xtest)
test_accuracy = accuracy_score(ytest,xtest_prediction)

In [63]:
print('Accuracy score is:',test_accuracy)

Accuracy score is: 0.51249375


In [66]:
# Saving the model
import pickle

In [67]:
filename = 'trained_model.sav'
pickle.dump(model,open(filename,'wb'))