In [None]:
import numpy as np
import pandas as pd
import math
import re

from bs4 import BeautifulSoup
from google.colab import drive
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPool1D, Dense, Dropout
from sklearn.model_selection import train_test_split

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Mount Google Drive so the dataset, checkpoint and model paths under /content/drive resolve.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Read the header-less, Latin-1 encoded CSV and keep only the label and the tweet text.
cols = ['sentiment', 'id', 'date', 'query', 'user', 'text']
df = pd.read_csv(
    "/content/drive/MyDrive/Projects/Twitter sentiment analysis/data/training.1600000.processed.noemoticon.csv",
    engine='python',
    header=None,
    names=cols,
    encoding="Latin1",
).drop(['id', 'date', 'query', 'user'], axis=1)
df.tail()

Unnamed: 0,sentiment,text
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,happy #charitytuesday @theNSPCC @SparksCharity...


In [None]:
def clean_data(tweet):
  """Strip markup, @mentions, URLs and non-letter noise from a single tweet.

  Args:
    tweet: raw tweet text.

  Returns:
    The cleaned string; only ASCII letters, the characters . ! ? ' and
    single spaces remain.
  """
  # Let BeautifulSoup decode entities and drop any tags.
  tweet = BeautifulSoup(tweet,"lxml").get_text()
  # Handles may contain "_"; the previous pattern stopped at the first
  # underscore and left the rest of the handle in the text.
  tweet = re.sub(r"@[a-zA-Z0-9_]+",'',tweet)
  # Drop the whole URL up to the next whitespace; the previous character class
  # stopped at "?", "-", "_", "=" and left URL fragments behind as fake words.
  tweet = re.sub(r"https?://\S+",'',tweet)
  # Everything that is not a letter or . ! ? ' becomes a single space.
  tweet = re.sub(r"[^a-zA-Z.!?']+"," ",tweet)
  # Collapse runs of spaces.
  tweet = re.sub(r" +"," ",tweet)

  return tweet

In [None]:

# Clean every tweet in the corpus (order matches df.text).
cleaned_data = list(map(clean_data, df.text))


In [None]:
import nltk
# Fetch the NLTK stop-word corpus that the stop-word filter reads.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:

from nltk.corpus import stopwords

# English stop words with "not" taken out, so that word survives the filter.
en_stopwords = set(stopwords.words('english'))
en_stopwords.remove('not')

def stop_remove(tokens):
  """Return the whitespace-separated string `tokens` without stop words.

  Membership is checked on the lower-cased word; words that are kept retain
  their original casing and are re-joined with single spaces.
  """
  kept = []
  for word in tokens.split():
    if word.lower() in en_stopwords:
      continue
    kept.append(word)
  return " ".join(kept)

In [None]:
# Stop-word-filtered version of every cleaned tweet.
cleaned_data1 = list(map(stop_remove, cleaned_data))

In [None]:
# Spot-check one stop-word-filtered tweet.
cleaned_data1[5]

'not whole crew'

In [None]:
# Binary labels: the raw code 4 becomes 1, every other value is kept as is.
# np.where builds a new array instead of assigning through
# `df.sentiment.values`, which rewrote `df` behind the reader's back and
# raises when that array is read-only (pandas copy-on-write).
sentiment_raw = df.sentiment.to_numpy()
df_labels = np.where(sentiment_raw == 4, 1, sentiment_raw)

In [None]:
# Spot-check the first stop-word-filtered tweet.
cleaned_data1[0]

"Awww that's bummer. shoulda got David Carr Third Day it."

In [None]:
# Split each stop-word-filtered tweet into word tokens with the tfds tokenizer.
tokenizer1 = tfds.deprecated.text.Tokenizer()
input_data = list(map(tokenizer1.tokenize, cleaned_data1))


In [None]:
# Tokens produced for the first tweet.
input_data[0]

['Awww',
 'that',
 's',
 'bummer',
 'shoulda',
 'got',
 'David',
 'Carr',
 'Third',
 'Day',
 'it']

In [None]:
# To list:
# `dataset` is not defined anywhere in this notebook (NameError); the loaded
# frame is `df`, and `df_labels` holds its sentiment labels with 4 mapped to 1.
sentences = df['text'].tolist()
labels = df_labels.tolist()

In [None]:

# Hold out 5% of the cleaned tweets for testing (fixed seed).
# NOTE(review): this splits `cleaned_data`, not the stop-word-filtered `cleaned_data1` — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_data,
    df_labels,
    test_size=0.05,
    random_state=42,
)

In [None]:
# Tokenisation / padding settings shared by every model below.
MAX_LEN = max_length = 200            # padded sequence length
embedding_dim = 16
# NOTE(review): this is the number of tweets, not the number of distinct
# words; it sizes the embedding tables built from it — confirm intended.
vocab_size = len(cleaned_data)
trunc_type = padding_type = 'post'    # truncate and pad at the end of a sequence
oov_tok = "<OOV>"

In [None]:
# Fit the Keras tokenizer; words it has not seen map to `oov_tok`.
# NOTE(review): it is fitted on all of `cleaned_data`, which includes the rows held out in X_test.
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(cleaned_data)
word_index = tokenizer.word_index



In [None]:
# Convert text to integer sequences, then pad/truncate both splits identically.
pad_kwargs = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_sequences = tokenizer.texts_to_sequences(X_train)
testing_sequences = tokenizer.texts_to_sequences(X_test)

training_padded = pad_sequences(training_sequences, **pad_kwargs)
testing_padded = pad_sequences(testing_sequences, **pad_kwargs)

In [None]:
# Persist the fitted tokenizer so the web app can load it later.
import pickle

tokenizer_path = '/content/drive/MyDrive/Projects/Twitter sentiment analysis/tokenizer.pickle'
with open(tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:


# Model / training hyper-parameters.
VOCAB_SIZE = vocab_size
EMB_DIM, FFN_units = 200, 200
nb_filters = 50
NB_CLASSES = 2
dropout_rate = 0.2
BATCH_SIZE, NB_EPOCHS = 32, 2

In [None]:
# 1-D CNN classifier: embedding -> three stacked convolutions (kernel widths
# 2, 3, 4) -> global max pooling -> dense + dropout -> single sigmoid unit.
model1 = Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    *[
        Conv1D(filters=nb_filters, kernel_size=width, padding="valid", activation="relu")
        for width in (2, 3, 4)
    ],
    GlobalMaxPool1D(),
    Dense(units=FFN_units, activation="relu"),
    Dropout(rate=dropout_rate),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary classification set-up: cross-entropy loss, Adam optimizer, accuracy metric.
model1.compile(optimizer='adam', loss="binary_crossentropy", metrics=['accuracy'])

In [None]:
# Track model1 in a checkpoint directory on Drive (only the newest checkpoint
# is kept) and resume from it when one already exists.
checkpoint_path = "/content/drive/MyDrive/Projects/Twitter sentiment analysis/checkpoint/"
ckpt = tf.train.Checkpoint(model1)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

latest_ckpt = ckpt_manager.latest_checkpoint
if latest_ckpt:
    ckpt.restore(latest_ckpt)
    print("Latest checkpoint restored!!")

In [None]:
# Train on the training split only. `training_padded` is built from X_train,
# so its labels are `y_train`; `df_labels` covers every row of the corpus and
# neither matches in length nor lines up with the shuffled split.
# The last 20% of the training split is used for validation.
model1.fit(training_padded, y_train, validation_split=.2, batch_size=2**8, epochs=NB_EPOCHS, verbose=1)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f69e0078450>

In [None]:
# Write a checkpoint of the trained model; the cell shows the path that was written.
ckpt_manager.save()

'/content/drive/MyDrive/Projects/Twitter sentiment analysis/checkpoint/ckpt-1'

In [None]:
# Sigmoid scores for the held-out split.
predict_1 = model1.predict(
    testing_padded,
    batch_size=2 ** 8,
    verbose=1,
)



In [None]:
def ori(x):
  """Threshold a score at 0.5: values below it give 0, everything else gives 1."""
  if x < 0.5:
    return 0
  return 1

In [None]:
# Hard 0/1 labels for every test prediction.
predict_1_y = [ori(score) for score in predict_1]

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the thresholded predictions on the test split.
test_report = classification_report(y_test, predict_1_y)
print(test_report)

              precision    recall  f1-score   support

           0       0.79      0.86      0.83    239361
           1       0.85      0.78      0.81    240639

    accuracy                           0.82    480000
   macro avg       0.82      0.82      0.82    480000
weighted avg       0.82      0.82      0.82    480000



In [None]:
# Hand-written sentences used as a quick sanity check of model1.
reviews = [
    'they made me cry',
    'I hate spaghetti',
    "Yes the phone is not bad",
    'Everything was good',
    'Everything was hot exactly as I wanted',
    'Everything was green',
    'the host seated us immediately',
    'they gave us free chocolate cake',
    'not sure about the wilted flowers on the table',
    'only works when I stand on tippy toes',
    'they gave gift',
]

# Clean -> drop stop words -> index -> pad -> score -> threshold.
# NOTE(review): stop words are removed here, while the train/test split was
# made from `cleaned_data` (no stop-word removal) — confirm which is intended.
padding_type = 'post'
sample_data = list(map(clean_data, reviews))
sample_stop = list(map(stop_remove, sample_data))
sample_sequences = tokenizer.texts_to_sequences(sample_stop)
reviews_padded = pad_sequences(sample_sequences, padding=padding_type, maxlen=max_length)
classes = model1.predict(reviews_padded)
classes_y = [ori(score) for score in classes]
for review, label in zip(reviews, classes_y):
  print(review, label)

they made me cry 0
I hate spaghetti 0
Yes the phone is not bad 0
Everything was good 0
Everything was hot exactly as I wanted 0
Everything was green 1
the host seated us immediately 1
they gave us free chocolate cake 1
not sure about the wilted flowers on the table 1
only works when I stand on tippy toes 0
they gave gift 1


In [None]:
# Inspect the sample sentences after cleaning and stop-word removal.
sample_stop

['made cry',
 'hate spaghetti',
 'phone good',
 'Everything good',
 'Everything hot exactly wanted',
 'Everything green',
 'host seated us immediately',
 'gave us free chocolate cake',
 'sure wilted flowers table',
 'works stand tippy toes',
 'gift nice']

In [None]:
 # Print the layer-by-layer architecture and parameter counts of model1.
 model1.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 16)           17920000  
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 199, 50)           1650      
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 197, 50)           7550      
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 194, 50)           10050     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 200)               10200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)              

**SAVING MODEL**

In [None]:
# Save the trained model to Drive as a single .h5 file.
model1.save("/content/drive/MyDrive/Projects/Twitter sentiment analysis/sentient_model1.h5")

**LOADING MODEL**

In [None]:
# Load through tensorflow.keras — the same API that built and saved model1 —
# instead of the standalone `keras` package, so both sides of the round trip
# use one implementation.
from tensorflow.keras.models import load_model

# load model
model2 = load_model('/content/drive/MyDrive/Projects/Twitter sentiment analysis/sentient_model1.h5')

In [None]:
import os
import tweepy as tw
import pandas as pd
#import preprocessor as p


# Twitter API credentials are read from the environment so that no secret is
# stored in the notebook. Set these four variables before running the cell.
# The keys that used to be hard-coded here have been exposed and should be
# revoked and regenerated.
consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]

In [None]:
# Build a tweepy client from the consumer key/secret and the access token/secret.
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# NOTE(review): wait_on_rate_limit=True presumably makes tweepy wait when rate-limited rather than fail — confirm against the installed tweepy version.
api = tw.API(auth, wait_on_rate_limit=True)


In [None]:
# Search term and earliest date used by the tweepy queries.
search_words = "ipl2021"
date_since = "2021-01-01"

In [None]:
# Query up to five English tweets matching `search_words` and print their text.
tweets = tw.Cursor(
    api.search,
    q=search_words,
    lang="en",
    since=date_since,
).items(5)

for status in tweets:
    print(status.text)

RT @SunRisers: Give @jbairstow21 a 👍🏻 for him to continue his fine form in Chennai 🧡

#OrangeOrNothing #OrangeArmy #IPL2021 https://t.co/Rs…
RT @RCBTweets: No lazy weekends here. All Saturday calls for is a good stretch. 💪🏼

#PlayBold #WeAreChallengers #IPL2021 https://t.co/5zSwj…
RT @RCBTweets: Win big with Payer Of The Day on PhonePe. Predict the winning score of tomorrow’s game. Make a transaction of the same amoun…
RT @mipaltan: .@ImRo45: "We are not able to bat the way we want to in these 20 overs."

#OneFamily #MumbaiIndians #MI #IPL2021 #PBKSvMI
RT @CricCrazyJohns: #MI moves to Delhi for the next 4 matches and #PBKS moves to Ahmedabad for next 4 matches in #IPL2021.


In [None]:
# Run the same search again and keep the tweet texts in a list.
tweets = tw.Cursor(
    api.search,
    q=search_words,
    lang="en",
    since=date_since,
).items(5)

twit = [status.text for status in tweets]

In [None]:
import requests
import bs4
from bs4 import BeautifulSoup
from requests_oauthlib import OAuth1

In [None]:
import os

# OAuth 1 credentials are read from the environment so that no secret is
# stored in the notebook. The keys that used to be hard-coded here have been
# exposed and should be revoked and regenerated.
auth_params = {
    'app_key': os.environ["TWITTER_CONSUMER_KEY"],
    'app_secret': os.environ["TWITTER_CONSUMER_SECRET"],
    'oauth_token': os.environ["TWITTER_ACCESS_TOKEN"],
    'oauth_token_secret': os.environ["TWITTER_ACCESS_TOKEN_SECRET"]
}

# Creating an OAuth Client connection
# (this rebinds `auth`, which earlier held the tweepy OAuthHandler)
auth = OAuth1 (
    auth_params['app_key'],
    auth_params['app_secret'],
    auth_params['oauth_token'],
    auth_params['oauth_token_secret']
)

In [None]:
# Search term, earliest date and number of tweets requested by the raw REST call.
search_words = "ipl2021"
date_since = "2021-01-01"
limit = 50

In [None]:

# Search endpoint of the Twitter REST API v1.1.
url_rest = "https://api.twitter.com/1.1/search/tweets.json"

# Exclude retweets and replies from the results; they are usually unnecessary for the analysis.
q = search_words+' -filter:retweets -filter:replies'

# count: number of tweets to retrieve in this call.
# NOTE(review): `since` may not be a recognised request parameter for this endpoint (a `since:` query operator exists) — confirm against the API reference.
params = {'q': q, 'count': limit, 'lang': 'en',  'result_type': 'recent', "since":date_since}
# A timeout keeps the cell from hanging forever on a stalled connection.
results = requests.get(url_rest, params=params, auth=auth, timeout=30)
# Fail here with the HTTP status instead of later with a KeyError on 'statuses'.
results.raise_for_status()

In [None]:

# Decode the JSON payload and pull the plain text out of every returned status.
tweets = results.json()

messages = []
for status in tweets['statuses']:
    messages.append(BeautifulSoup(status['text'], 'html5lib').get_text())


In [None]:
# Apply the same cleaning used for the training tweets.
cleaned_msg = list(map(clean_data, messages))

In [None]:
# Number of tweets retrieved and cleaned.
len(cleaned_msg)

50

In [None]:
# Score the downloaded tweets with the reloaded model.
# Reuse the tokenizer that was fitted on the training corpus. Creating a fresh
# `Tokenizer()` here replaced it with an unfitted one whose word index is
# empty, so every tweet became an all-padding sequence and got the same
# meaningless prediction.
sample_sequences = tokenizer.texts_to_sequences(cleaned_msg)
# Pad and truncate exactly as the training data was.
reviews_padded = pad_sequences(sample_sequences, padding=padding_type, truncating=trunc_type, maxlen=max_length)
classes = model2.predict(reviews_padded)


In [None]:
def ori(x):
  """Threshold a score at 0.5: values below it give 0, everything else gives 1."""
  if x < 0.5:
    return 0
  return 1

# Hard 0/1 label for every scored tweet.
classes_y = [ori(score) for score in classes]

In [None]:
# Spot-check one tweet: print its cleaned text next to its predicted label.
i=5
print(cleaned_msg[i],classes_y[i])

IPL Delhi Capitals Take On Wily Sunrisers Hyderabad In A Slugfest On A Slow Chepauk Chepauk DelhiCapitals  1


In [None]:
# Hand-written sample sentences for quick checks.
reviews = [
    'they made me cry',
    'I hate spaghetti',
    'Everything was good',
    'Everything was hot exactly as I wanted',
    'Everything was green',
    'the host seated us immediately',
    'they gave us free chocolate cake',
    'not sure about the wilted flowers on the table',
    'only works when I stand on tippy toes',
    'they gave gift',
]


In [None]:
import nltk
# Download the WordNet corpus.
# NOTE(review): nothing in the visible cells uses WordNet — confirm it is still needed.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Sample sentences with stop words removed (no other cleaning applied).
we = list(map(stop_remove, reviews))

In [None]:
# Inspect the stop-word-filtered sample sentences.
we

['made cry',
 'hate spaghetti',
 'Everything good',
 'Everything hot exactly wanted',
 'Everything green',
 'host seated us immediately',
 'gave us free chocolate cake',
 'sure wilted flowers table',
 'works stand tippy toes',
 'gave gift']

In [None]:
# `s` is not defined anywhere in this notebook (NameError on a fresh kernel).
# The recorded output is the first filtered sentence split into words, so it
# is derived from `we` directly.
print(we[0].split())

['made', 'cry']


In [None]:
# Take the recurrent layers from tensorflow.keras, like Sequential, Embedding
# and Dense, instead of the standalone `keras` package, so the model is built
# from a single Keras implementation.
from tensorflow.keras.layers import LSTM, SpatialDropout1D, Bidirectional

# Small bidirectional-LSTM classifier.
# Note: rebinding `model2` discards the model that was loaded from disk earlier.
model2 = Sequential()

model2.add(tf.keras.layers.Embedding(vocab_size,16, input_length=max_length))
model2.add(Bidirectional(LSTM(32, dropout=0.2, recurrent_dropout=0.2)))
model2.add(Dense(1, activation='sigmoid'))




In [None]:
# Binary classification set-up: cross-entropy loss, Adam optimizer, accuracy metric.
model2.compile(optimizer='adam', loss="binary_crossentropy", metrics=['accuracy'])

In [None]:
# One epoch on the training split, validating on the held-out split.
model2.fit(
    training_padded,
    y_train,
    validation_data=(testing_padded, y_test),
    batch_size=2 ** 8,
    epochs=1,
    verbose=1,
)

  30/5938 [..............................] - ETA: 2:26:22 - loss: 0.6929 - accuracy: 0.5010

KeyboardInterrupt: ignored

In [None]:

# Hand-written sentences used as a quick sanity check of model2.
reviews = [
    'they made me cry',
    'I hate spaghetti',
    "Yes the phone is not bad",
    'Everything was good',
    'Everything was hot exactly as I wanted',
    'Everything was green',
    'the host seated us immediately',
    'they gave us free chocolate cake',
    'not sure about the wilted flowers on the table',
    'only works when I stand on tippy toes',
    'they gave gift',
]

def ori(x):
  """Threshold a score at 0.5: values below it give 0, everything else gives 1."""
  if x < 0.5:
    return 0
  return 1

# Clean -> drop stop words -> index -> pad -> score -> threshold.
padding_type = 'post'
sample_data = list(map(clean_data, reviews))
sample_stop = list(map(stop_remove, sample_data))
sample_sequences = tokenizer.texts_to_sequences(sample_stop)
reviews_padded = pad_sequences(sample_sequences, padding=padding_type, maxlen=max_length)
classes = model2.predict(reviews_padded)
classes_y = [ori(score) for score in classes]
for review, label in zip(reviews, classes_y):
  print(review, label)

they made me cry 1
I hate spaghetti 1
Yes the phone is not bad 1
Everything was good 1
Everything was hot exactly as I wanted 1
Everything was green 1
the host seated us immediately 1
they gave us free chocolate cake 1
not sure about the wilted flowers on the table 1
only works when I stand on tippy toes 1
they gave gift 1


In [None]:
from keras.layers import Bidirectional, LSTM

In [None]:
# Larger bidirectional-LSTM classifier: 50-d embedding -> BiLSTM(128) ->
# dropout -> single sigmoid unit.
# The recurrent layers are taken from `layers` (tensorflow.keras.layers) so
# they come from the same Keras implementation as Sequential/Embedding/Dense
# rather than from the standalone `keras` package.
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=max_length))
model.add(layers.Bidirectional(layers.LSTM(128)))
model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer='adam',
 loss='binary_crossentropy',
 metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 50)           80000000  
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               183296    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 80,183,553
Trainable params: 80,183,553
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train on the training split only. `training_padded` is built from X_train,
# so its labels are `y_train`; `df_labels` covers every row of the corpus and
# neither matches in length nor lines up with the shuffled split.
# The last 20% of the training split is used for validation.
model.fit(training_padded, y_train, validation_split=.2, batch_size=2**8, epochs=NB_EPOCHS, verbose=1)

Epoch 1/2
Epoch 2/2
 410/4750 [=>............................] - ETA: 53:10 - loss: 0.6440 - accuracy: 0.6550

KeyboardInterrupt: ignored