## Overview 

This script performs EDA and then preprocesses multiple datasets to train a bidirectional LSTM model, which is in turn used to predict the sentiment of tweets fetched in real time with `tweepy` and classify them as positive, negative, or neutral.

The model is then integrated with streamlit and deployed as a web-app.

**Check out the web-app:** [Sententia](https://share.streamlit.io/kritanjalijain/twitter_sentiment_analysis/main/app.py)

## Installing and importing dependencies

To fetch tweets from twitter, we need to install the tweepy library. We will be using this package to pull tweets on which our model will make predictions.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
import os
import tweepy as tw #for accessing Twitter API


#For Preprocessing
import re    # RegEx for removing non-letter characters
import nltk  #natural language processing
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *

# For Building the model
from sklearn.model_selection import train_test_split
import tensorflow as tf
import seaborn as sns
import pickle
#For data visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline

pd.options.plotting.backend = "plotly"

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Cleaning and prepping dataset

In [None]:
# Mount Google Drive at /content/drive so the dataset CSVs read in the
# following cells are reachable. google.colab is only importable inside a
# Colab runtime.
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


### Getting the Data and Combining It Together

In [None]:
# Every dataset is reduced to these two columns before concatenation, so the
# combined frame can never gain a stray column whose NaNs would make the
# row-wise dropna below throw away otherwise valid tweets.
columns = ['clean_text', 'category']

# first dataset: not remapped, so its labels are used exactly as stored
data1 = pd.read_csv(r"/content/drive/MyDrive/files/Twitter_Data.csv")
data1 = data1[columns]

# second dataset: integer labels -1 / 0 / 1 converted to floats
data2 = pd.read_csv(r'/content/drive/MyDrive/files/apple-twitter-sentiment-texts.csv')
data2 = data2.rename(columns={'text': 'clean_text', 'sentiment':'category'})
data2['category'] = data2['category'].map({-1: -1.0, 0: 0.0, 1:1.0})
data2 = data2[columns]

# third dataset: emotion names collapsed to negative (-1.0) / positive (1.0).
# Any label missing from the map becomes NaN and is removed by dropna below.
data3 = pd.read_csv('/content/drive/MyDrive/files/finalSentimentdata2.csv')
data3 = data3.rename(columns={'text': 'clean_text', 'sentiment':'category'})
data3['category'] = data3['category'].map({'sad': -1.0, 'anger': -1.0, 'fear': -1.0, 'joy':1.0})
data3 = data3[columns]

# fourth dataset: textual sentiment names converted to -1.0 / 0.0 / 1.0
data4 = pd.read_csv('/content/drive/MyDrive/files/Tweets.csv')
data4 = data4.rename(columns={'text': 'clean_text', 'airline_sentiment':'category'})
data4['category'] = data4['category'].map({'negative': -1.0, 'neutral': 0.0, 'positive':1.0})
data4 = data4[columns]

# combine
df = pd.concat([data1, data2, data3, data4], ignore_index=True)
# Drop rows with a missing tweet or label, then rebuild a contiguous 0..n-1
# index so that label-based lookups such as df['clean_text'][0] keep working.
df = df.dropna(subset=columns).reset_index(drop=True)
# Replace the numeric codes with readable class names
df['category'] = df['category'].map({-1.0:'Negative', 0.0:'Neutral', 1.0:'Positive'})
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,Negative
1,talk all the nonsense and continue all the dra...,Neutral
2,what did just say vote for modi welcome bjp t...,Positive
3,asking his supporters prefix chowkidar their n...,Positive
4,answer who among these the most powerful world...,Positive


## Data Preprocessing

In [None]:
# Lazily-filled cache for the default stopword set and stemmer, so they are
# built once instead of once per token.
_TWEET_TO_WORDS_DEFAULTS = {}


def tweet_to_words(tweet, stop_words=None, stemmer=None):
    '''Convert tweet text into a list of stemmed, stopword-free tokens.

    Args:
        tweet: Raw tweet text.
        stop_words: Optional collection of lowercase words to discard.
            Defaults to NLTK's English stopword list, loaded on first use
            and reused on every later call.
        stemmer: Optional object exposing ``stem(word)``. Defaults to one
            shared ``PorterStemmer`` instance.

    Returns:
        The processed tokens, in their original order.
    '''
    if stop_words is None:
        if 'stop_words' not in _TWEET_TO_WORDS_DEFAULTS:
            # a frozenset gives O(1) membership tests instead of a list scan
            _TWEET_TO_WORDS_DEFAULTS['stop_words'] = frozenset(stopwords.words("english"))
        stop_words = _TWEET_TO_WORDS_DEFAULTS['stop_words']
    if stemmer is None:
        if 'stemmer' not in _TWEET_TO_WORDS_DEFAULTS:
            _TWEET_TO_WORDS_DEFAULTS['stemmer'] = PorterStemmer()
        stemmer = _TWEET_TO_WORDS_DEFAULTS['stemmer']

    # convert to lowercase
    text = tweet.lower()
    # replace everything that is not a letter or a digit with a space
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    # tokenize on whitespace, drop stopwords, then stem what is left
    return [stemmer.stem(word) for word in text.split() if word not in stop_words]

# Show one tweet before and after preprocessing. Positional .iloc is used
# because rows have been dropped from df, so index label 0 is not guaranteed
# to exist.
sample_tweet = df['clean_text'].iloc[0]
print("\nOriginal tweet ->", sample_tweet)
print("\nProcessed tweet ->", tweet_to_words(sample_tweet))


Original tweet -> when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples

Processed tweet -> ['modi', 'promis', 'minimum', 'govern', 'maximum', 'govern', 'expect', 'begin', 'difficult', 'job', 'reform', 'state', 'take', 'year', 'get', 'justic', 'state', 'busi', 'exit', 'psu', 'templ']


In [None]:
# Run the preprocessing over every tweet; X becomes a list of token lists
X = [tweet_to_words(tweet) for tweet in df['clean_text']]

### Train and test data

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
# Encode target labels as integers.
# NOTE(review): Y is not referenced again in the cells visible here — confirm
# whether it is still needed.
label = LabelEncoder()
Y = label.fit_transform(df['category'])

# One-hot encode the labels (one column per sentiment class)
y = pd.get_dummies(df['category'])
# Two-stage split: 20% held out for testing, then 25% of the remainder for
# validation, i.e. roughly 60/20/20 train/validation/test.
training_x, testing_x, training_y, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
training_x, X_validation, training_y, y_validation = train_test_split(training_x, training_y, test_size=0.25, random_state=1)
# Maximum number of vocabulary terms kept by the vectorizer
v_Size = 5000

# X already holds lists of tokens, so both the preprocessor and the tokenizer
# are identity functions that pass those lists through unchanged.
c_v = CountVectorizer(max_features=v_Size,preprocessor=lambda x: x, tokenizer=lambda x: x) 

# Fit the training data
# NOTE(review): .toarray() materialises a dense (n_samples, v_Size) matrix,
# which can be very large — confirm the runtime has enough memory.
training_x = c_v.fit_transform(training_x).toarray()

# Transform testing data
# NOTE(review): X_validation is not transformed here and stays a list of
# token lists. Later cells also reassign X, training_x and testing_x, so
# verify these bag-of-words matrices are actually used anywhere.
testing_x = c_v.transform(testing_x).toarray()

### Tokenizing & Padding

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Vocabulary cap: passed as num_words to the Keras Tokenizer below
max_words = 5000
# Fixed sequence length: passed as maxlen to pad_sequences below
max_len=50

def tokenize_pad_sequences(text):
    '''
    Fit a Keras Tokenizer on ``text`` and encode it as fixed-length integer rows.

    Uses the module-level ``max_words`` (vocabulary cap) and ``max_len``
    (row length). Returns a ``(padded_sequences, fitted_tokenizer)`` tuple.
    '''
    fitted = Tokenizer(num_words=max_words, lower=True, split=' ')
    fitted.fit_on_texts(text)
    # words -> integer ids, then pad at the end of each row up to max_len
    padded = pad_sequences(
        fitted.texts_to_sequences(text),
        padding='post',
        maxlen=max_len,
    )
    return padded, fitted

# Fit the tokenizer on the raw tweet column and build the padded integer
# matrix. This replaces the earlier value of X with the padded sequences.
# NOTE(review): the tokenizer is fitted on the full corpus before the
# train/validation/test split, so its vocabulary also sees held-out tweets —
# confirm this is intended.
X, toke = tokenize_pad_sequences(df['clean_text'])



# saving
# Persist the fitted tokenizer so the same word -> id mapping can be reused
# at prediction time.
with open('/content/drive/MyDrive/files/tokenizer.pickle', 'wb') as handle:
    pickle.dump(toke, handle, protocol=pickle.HIGHEST_PROTOCOL)
# loading
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles that you created yourself.
with open('/content/drive/MyDrive/files/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

### Saving tokenized data

### Train & Test Split

In [None]:
# One-hot encode the sentiment labels
y = pd.get_dummies(df['category'])

# Two-stage split: hold out 20% for testing, then a quarter of what remains
# for validation.
training_x, testing_x, training_y, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
training_x, X_validation, training_y, y_validation = train_test_split(
    training_x, training_y, test_size=0.25, random_state=1
)

# Report the shape of each split
for split_name, split_x, split_y in (
    ('Train Set ->', training_x, training_y),
    ('Validation Set ->', X_validation, y_validation),
    ('Test Set ->', testing_x, y_test),
):
    print(split_name, split_x.shape, split_y.shape)

Train Set -> (109397, 50) (109397, 3)
Validation Set -> (36466, 50) (36466, 3)
Test Set -> (36466, 50) (36466, 3)


## Bidirectional LSTM Using NN

In [None]:
#!pip install tensorflow
#!pip install keras
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.optimizers.schedules import InverseTimeDecay
from tensorflow.keras import datasets

from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.callbacks import History

from tensorflow.keras import losses

# Embedding input dimension (number of distinct token ids) and vector width
vocab_size = 5000
e_size = 32
epochs=20
learning_rate = 0.1
d_rate = learning_rate / epochs
momentum = 0.8

# Current Keras optimizers reject the legacy `decay` argument. The same
# per-step time-based decay, lr = learning_rate / (1 + d_rate * step), is
# expressed with an explicit InverseTimeDecay schedule instead.
lr_schedule = InverseTimeDecay(
    initial_learning_rate=learning_rate,
    decay_steps=1,
    decay_rate=d_rate,
)
Stochastic_Gradient_Descent = SGD(learning_rate=lr_schedule, momentum=momentum, nesterov=False)

# Build model
model = Sequential()
# An explicit Input layer fixes the sequence length (replacing the deprecated
# Embedding(input_length=...)) and lets summary() run before training.
model.add(Input(shape=(max_len,)))
model.add(Embedding(vocab_size, e_size))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.4))
# One softmax output per sentiment class
model.add(Dense(3, activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()

# Compile model
model.compile(loss='categorical_crossentropy', optimizer= Stochastic_Gradient_Descent,metrics=['accuracy', Precision(), Recall()])

# Mini-batch size used by the model.fit call in the training cell
batch_size = 64


Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 5.0 MB/s 
Installing collected packages: tf-estimator-nightly
Successfully installed tf-estimator-nightly-2.8.0.dev2021122109
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 32)            160000    
                                                                 
 conv1d (Conv1D)             (None, 50, 32)            3104      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 25, 32)           0         
 )                                                               
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)   

In [None]:
# Train on the training split and track metrics on the validation split
# after every epoch.
history = model.fit(
    training_x,
    training_y,
    validation_data=(X_validation, y_validation),
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Model Accuracy & Loss

In [None]:
# Score the trained model on the held-out test split. The four values are
# unpacked as loss, accuracy, precision and recall.
test_scores = model.evaluate(testing_x, y_test, verbose=0)
loss, accuracy, precision, recall = test_scores

# Report each metric to four decimal places
print('')
for report_line, score in (
    ('Accuracy  : {:.4f}', accuracy),
    ('Precision : {:.4f}', precision),
    ('Recall    : {:.4f}', recall),
):
    print(report_line.format(score))



Accuracy  : 0.8982
Precision : 0.9030
Recall    : 0.8925


### Model Confusion Matrix

### Model save and load for the prediction

In [None]:
# Save the model architecture & the weights
# The relative path means this file is written to the current working
# directory.


model.save('new_Sentiment_model.h5')



# Additionally pickle the whole model object into the mounted Drive folder.
# NOTE(review): model.save above already stores the model; confirm that the
# pickled copy is still required.
import pickle
with open('/content/drive/MyDrive/files/sentiment_model','wb') as f:
    pickle.dump(model,f)



INFO:tensorflow:Assets written to: ram://d4d5cf25-9001-4abc-95a6-9c0d336b10a5/assets


INFO:tensorflow:Assets written to: ram://d4d5cf25-9001-4abc-95a6-9c0d336b10a5/assets


In [None]:
import pickle
# Read the model back from the Drive folder it was pickled to. A bare
# 'sentiment_model' resolves against the working directory, where no such
# file exists, and raises FileNotFoundError.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles that you created yourself.
with open('/content/drive/MyDrive/files/sentiment_model','rb') as f:
    model = pickle.load(f)
def predict_class(text):
    '''Predict and print the sentiment class of the passed text.

    Args:
        text: A single tweet as a ``str`` or a list of tweet strings. Only
            the prediction for the first entry is printed.
    '''
    # Alphabetical order, matching the one-hot columns built with pd.get_dummies
    sentiment_classes = ['Negative', 'Neutral', 'Positive']
    max_len=50

    # texts_to_sequences iterates over its argument, so a bare string would
    # be treated as one "tweet" per character; wrap it to keep it whole.
    if isinstance(text, str):
        text = [text]

    # Transforms text to a sequence of integers using the fitted `toke` object
    xt = toke.texts_to_sequences(text)
    # Pad sequences to the same length used during training
    xt = pad_sequences(xt, padding='post', maxlen=max_len)
    # Pick the class with the highest predicted probability for each row
    yt = model.predict(xt).argmax(axis=1)
    # Print the predicted sentiment of the first entry
    print('The predicted sentiment is', sentiment_classes[yt[0]])

FileNotFoundError: ignored