In [None]:
# Mount Google Drive (Colab only) so the CSV files read below from
# /content/drive/MyDrive are available.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# importing required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import models, Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, GlobalAveragePooling1D, Input, SpatialDropout1D
import string
import re
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.preprocessing import OneHotEncoder
from keras.regularizers import l1, l2
from sklearn.metrics import accuracy_score, f1_score, RocCurveDisplay, roc_auc_score

# Reading Data

In [None]:
# importing data from the mounted Drive folder
# data   -> tweets
# labels -> sentiment (stored as numbers, per the file name)

data = pd.read_csv('/content/drive/MyDrive/Problem Set 5/ps5_tweets_text.csv')
labels = pd.read_csv('/content/drive/MyDrive/Problem Set 5/ps5_tweets_labels_as_numbers.csv')

In [None]:
# Join each tweet with its label through the shared 'Id' column
# (inner join, which is the pandas default), then preview the result.

fulldata = pd.merge(data, labels, on='Id')
fulldata.head()

In [None]:
# Human-readable name for each numeric sentiment label (0-4)
label_meanings = {0:"Extremely Negative", 1:"Negative", 2:"Neutral", 3:"Positive", 4:"Extremely Positive"}

# Exploratory Data Analysis

In [None]:
# Bar chart: number of tweets for each sentiment class
# Pie chart: percentage of tweets for each sentiment class

# class names in label order (0-4); `x` is reused by the mean-length cell further down
x = ["Extremely Negative", "Negative", "Neutral", "Positive", "Extremely Positive"]

# Count the labels once. reindex() keeps the 0-4 order and yields 0 for a class
# that has no tweets, where a direct lookup would raise a KeyError.
label_counts = fulldata.Label.value_counts().reindex(range(len(x)), fill_value=0)
y = label_counts.tolist()

plt.figure(figsize=(15,5))

# Bar Plot
plt.subplot(1,2,1)
plt.title('Number of tweets for each category')
plt.bar(x,y)
plt.xticks(rotation=45)
plt.xlabel('Sentiment')
plt.ylabel('Number of tweets')

# Pie Chart
plt.subplot(1,2,2)
plt.title('Percentage of tweets for each category')

plt.pie(y, colors=None, labels=x, autopct='%1.1f%%',
        shadow=True, startangle=30, wedgeprops={'alpha':0.6})
plt.axis('equal')
plt.show()

In [None]:
# calculating mean tweet length for each sentiment

def avgtweetlength(sentiment, corpus):
  """Print and return the mean tweet length (in characters) of one sentiment class.

  sentiment -> name of the sentiment class (used as the label of the printed line)
  corpus    -> collection of all tweets (strings) for that sentiment

  Returns the mean length rounded to 2 decimals.
  """
  # length of every tweet, then the rounded mean
  lengths = pd.Series(corpus).map(len)
  mean_len = round(lengths.mean(), 2)

  print(sentiment+' =', mean_len)
  return mean_len

In [None]:
# Plotting mean tweet length (in characters) for each sentiment class

print('Average tweet lengths for each sentiment')
y = []
for i in range(5):
  # Use a dedicated name here: rebinding `data` would overwrite the raw tweets
  # frame loaded earlier and break a re-run of the merge cell.
  tweets_for_label = fulldata.loc[fulldata['Label']==i, 'Tweet'].reset_index(drop=True)
  y.append(avgtweetlength(x[i], tweets_for_label))

plt.figure()
plt.title('Average tweet length for each sentiment')
plt.bar(x,y, color=['red','orange','blue','yellow','green'])
plt.xticks(rotation=45)
plt.xlabel('Sentiment')
# the bars are mean lengths, not tweet counts
plt.ylabel('Average tweet length (characters)')
plt.show()

In [None]:
# WordCloud of the words used across all tweets

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# str.join builds the text in a single pass; growing a string with `+` inside
# a loop copies it on every iteration.
alltweets = ' '.join(fulldata.Tweet).strip()

# extend the word cloud's own stop words with URL fragments and filler tokens
list_of_stopwords = set(STOPWORDS)
list_of_stopwords.update(['https', 'co','com', 'amp', 'will'])

wordcloud = WordCloud(stopwords=list_of_stopwords, background_color='white').generate(alltweets)

plt.figure(figsize=(15,15))
plt.imshow(wordcloud)
plt.axis(False)
plt.show()



---
Cleaning the tweets
---





In [None]:
# cleaning the text

def clean(tweet):
  """Strip hyperlinks, punctuation and digits from a raw tweet.

  tweet -> raw tweet text (str)

  Returns the tweet with every http/https URL removed, every punctuation
  character replaced by a space, and every digit removed. Letter case and
  whitespace are left as they are.
  """
  # remove hyperlinks wherever they occur; the earlier pattern was anchored
  # with '^', so it only matched a URL at the very start of a line (and then
  # swallowed the rest of that line as well)
  tweet = re.sub(r'https?://\S+', '', tweet)

  # replace punctuation with a space so words joined by it stay separate
  tweet = ''.join(' ' if char in string.punctuation else char for char in tweet)

  # remove digits
  tweet = re.sub('[0-9]+', '', tweet)

  # return clean text
  return tweet

# run the cleaner over every raw tweet and preview the first rows
fulldata['Tweet_cleaned'] = fulldata['Tweet'].apply(clean)
fulldata.head(10)

In [None]:
# removing words that are one character long

def onechar(line):
  """Drop every space-separated token that is shorter than two characters.

  line -> text to filter (str)

  Returns the remaining tokens joined by single spaces, with leading and
  trailing whitespace stripped.
  """
  kept = [token for token in line.split(' ') if len(token) > 1]
  return ' '.join(kept).strip()
# drop one-character tokens from every cleaned tweet
fulldata['Tweet_cleaned'] = fulldata['Tweet_cleaned'].apply(onechar)

In [None]:
# loading the English stop word list from nltk
# (the 'stopwords' corpus is downloaded in the imports cell)

stopwords = nltk.corpus.stopwords.words('english')

In [None]:
# removing stopwords from the tweets

def rem_stopwords(tweet, stopword_list=None):
  """Lower-case a tweet and drop its stop words.

  tweet         -> space-separated text (str)
  stopword_list -> optional iterable of lower-case stop words; when omitted the
                   notebook-level `stopwords` list is used

  Returns the lower-cased tweet without stop words, with leading and trailing
  whitespace stripped.
  """
  # a set gives constant-time membership tests instead of a list scan per word
  stops = set(stopwords if stopword_list is None else stopword_list)

  # lower-case BEFORE filtering: the stop words are compared as lower-case
  # strings, so tokens such as "The" or "AND" used to slip through the check
  kept = [word for word in tweet.lower().split(' ') if word not in stops]
  return ' '.join(kept).strip()
    
# stop-word-free version of every cleaned tweet
fulldata['Tweet_wostop'] = fulldata['Tweet_cleaned'].apply(rem_stopwords)

In [None]:
# Performing stemming on cleaned tweets

# Porter stemmer instance used by the stemming() helper defined in this cell
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# stemming

def stemming(line):
  """Stem every whitespace-separated word of `line` with the notebook-level `stemmer`.

  line -> text to stem (str)

  Returns the stemmed words joined by single spaces.
  """
  return ' '.join(stemmer.stem(word) for word in line.split())
# Apply the Porter stemmer to every tweet. This line used to call onechar()
# again, so the stemming() helper of this cell was never actually run.
fulldata['Tweet_wostop'] = fulldata['Tweet_wostop'].apply(stemming)

In [None]:
# removing escape characters

def escape(line):
  """Replace ASCII control characters (codes 1-31) with spaces.

  line -> text to sanitise (str)

  Returns the text with every control character (tab, newline, carriage
  return, ...) turned into a space, runs of whitespace collapsed to a single
  space, and the ends stripped.
  """
  # Map each control character to a space instead of deleting it: deleting a
  # newline glues the neighbouring words together ("stay\nhome" -> "stayhome").
  translator = {code: ' ' for code in range(1, 32)}
  return ' '.join(line.translate(translator).split())

# strip control characters from every processed tweet
fulldata['Tweet_wostop'] = fulldata['Tweet_wostop'].apply(escape)



---
Final Cleaned Data
---





In [None]:
# preview the raw tweet next to its fully processed version and its label
fulldata[['Id','Tweet', 'Tweet_wostop','Label']].head()

In [None]:
# Randomly shuffling the dataframe

# A fixed seed keeps the row order, and with it the positional train/test
# split taken later in the notebook, identical across runs.
fulldata = shuffle(fulldata, random_state=0).reset_index(drop=True)
fulldata.head()

In [None]:
# checking the length (in characters) of the first tweet before and after
# the cleaning steps: raw text vs. the processed 'Tweet_wostop' column

len(fulldata.Tweet[0]), len(fulldata.Tweet_wostop[0])

# Preparing the data for the models

In [None]:
# splitting data into cleaned tweets and corresponding labels
# NOTE: `x` and `y` held plot data in the EDA cells; from here on they are
# the model inputs and targets.

x = fulldata.Tweet_wostop # clean tweet data
y = fulldata.Label # labels

In [None]:
# positional 80/20 split of the full data: first 80% of the rows for
# training, remaining 20% for testing

train_len = int(len(x) * 0.8)

# tweets
trainX, testX = x.iloc[:train_len], x.iloc[train_len:]

# labels, cut at the same position
trainY, testY = y.iloc[:train_len], y.iloc[train_len:]

In [None]:
# splitting training data into train and validation sets
# (fixed random_state so the split, and therefore the reported metrics, can be
# reproduced; the TF-IDF split later in the notebook also uses random_state = 0)

x_train, x_val, y_train, y_val = train_test_split(trainX, trainY, test_size=0.2, random_state=0)

In [None]:
# sanity check: sizes of the train / validation / test inputs and labels
x_train.shape, y_train.shape, x_val.shape, y_val.shape, testX.shape, testY.shape

In [None]:
# Tokenizing tweets
# The tokenizer is fitted on the training tweets only; validation and test
# tweets are later encoded with this same fitted vocabulary.

max_words = 10000 # max words to consider in the vocabulary
max_len = 55 # max number of words to consider in a tweet (passed as maxlen to pad_sequences)
tokenizer = Tokenizer(num_words = max_words) # creating Tokenizer object
tokenizer.fit_on_texts(x_train) # fitting tokenizer on the train set

In [None]:
# preparing sequences by substituting tokens with their corresponding integer values
train_sequences = tokenizer.texts_to_sequences(x_train)

# padding the sequences to make all tweets of same length (max_len);
# no `padding` argument is passed, so the library default side is used
train_padded = pad_sequences(train_sequences, maxlen=max_len)

In [None]:
# expected: (number of training tweets, max_len)
train_padded.shape

In [None]:
# tokenizing and creating padded sequences for validation set
# (same tokenizer and max_len as for the training set)
val_sequences = tokenizer.texts_to_sequences(x_val)
val_padded = pad_sequences(val_sequences, maxlen=max_len)

# tokenizing and creating padded sequences for test set
test_sequences = tokenizer.texts_to_sequences(testX)
test_padded = pad_sequences(test_sequences, maxlen=max_len)

# padding is added at the front of each sequence by default (when not specified explicitly)

In [None]:
# sanity check: both arrays should have max_len columns
val_padded.shape, test_padded.shape

In [None]:
# padded sequence example: integer word ids of the first training tweet
train_padded[0]

# Training the LSTM model

In [None]:
# preparing the model: word embedding -> single LSTM layer -> 5-way softmax

model = Sequential()

# 60-dimensional embedding for each of the max_words vocabulary entries
model.add(Embedding(max_words, 60, input_length = max_len))

# 128 LSTM units with dropout on the recurrent connections
model.add(LSTM(128, recurrent_dropout=0.3))

# one output probability per sentiment class
model.add(Dense(5, activation='softmax'))

model.summary()

In [None]:
# compiling the model

from keras.optimizers import Adam, RMSprop, SGD

# The optimizer objects that used to be built here (`Adam(0.0001)` and
# `SGD(lr = 0.0001)`) were never handed to compile(), and the `lr` keyword is
# rejected by current Keras releases. They are removed; the model is still
# compiled with the default Adam optimizer selected by name, exactly as before.
# The labels are integer class ids (0-4), hence the sparse categorical loss.
model.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'adam', metrics=['accuracy'])

In [None]:
# training the model

# Early stopping on the validation loss: patience=4 and min_delta=0.0001 are
# passed, i.e. training is meant to stop once val_loss has failed to improve
# for 4 epochs in a row (not only when it strictly increases);
# restore_best_weights=True is requested as well.
callback = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=4, verbose=0, mode='auto', restore_best_weights=True)

# at most 10 epochs, batches of 32, validated on the held-out validation split
history = model.fit(train_padded, y_train, epochs=10, batch_size=32, validation_data=(val_padded, y_val), callbacks=[callback])

In [None]:
# plotting the accuracy and loss curves for training and validation

legend = ['validation', 'train']

fig, axes = plt.subplots(1, 2, figsize=(10,5))

# one panel per metric: (panel title / y label, validation key, training key)
panels = [('Accuracy', 'val_accuracy', 'accuracy'),
          ('Loss', 'val_loss', 'loss')]

for ax, (title, val_key, train_key) in zip(axes, panels):
  ax.set_title(title)
  # validation curve first so the curves line up with the legend order
  ax.plot(history.history[val_key])
  ax.plot(history.history[train_key])
  ax.set_xlabel('Epochs')
  ax.set_ylabel(title)
  ax.legend(legend)

In [None]:
# evaluating model performance on test set
# evaluate() is unpacked into the loss and the single metric ('accuracy')
# the model was compiled with

test_loss, test_accuracy = model.evaluate(test_padded, testY)

print('Test Loss = ', test_loss)
print(f'Test Accuracy = {round(test_accuracy*100, 2)}%')

In [None]:
# predicting class probabilities for the test set
# Sequential.predict_proba() has been removed from tf.keras; the model ends in
# a softmax layer, so predict() already returns one probability per class.

y_predf1 = model.predict(test_padded)
y_predf1

In [None]:
'''Plotting ROC AUC curves for Test Set'''

'''Referred from: https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html'''


from sklearn.metrics import roc_curve,roc_auc_score,auc 
from sklearn import preprocessing
from itertools import cycle

def plotrocauc(model):
    """Print weighted multi-class ROC AUC scores and plot one ROC curve per class.

    model -> fitted classifier whose predict() returns one probability per
             class for every row of the notebook-level `test_padded`

    Reads the notebook-level `test_padded` (inputs) and `testY` (integer
    labels). Returns None: the scores are printed and the figure is shown.
    """
    # predict() on the softmax model yields the class probabilities; the extra
    # Sequential.predict_proba() call was unused and no longer exists in tf.keras
    y_pred = model.predict(test_padded)

    # only the prevalence-weighted scores are reported, so only those are computed
    weighted_roc_auc_ovo = roc_auc_score(testY, y_pred, multi_class="ovo",average="weighted")
    weighted_roc_auc_ovr = roc_auc_score(testY, y_pred, multi_class="ovr",average="weighted")

    print("One-vs-One-> {:.6f} (weighted by prevalence)".format(weighted_roc_auc_ovo))
    print("One-vs-Rest-> {:.6f} (weighted by prevalence)".format(weighted_roc_auc_ovr))

    # one indicator column per class so every class can be scored one-vs-rest
    y_test = preprocessing.LabelBinarizer().fit_transform(testY)

    # number of classes taken from the model output instead of a hard-coded 5
    n_classes = y_pred.shape[1]
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    plt.figure(figsize=(7,7))

    colors = cycle(['blue', 'red', 'green', 'orange', 'yellow'])

    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=1.5, label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))

    # diagonal = performance of a random classifier
    plt.plot([0, 1], [0, 1], 'k-', lw=1.5)
    plt.xlim([-0.05, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# plotting ROC curves and printing the weighted ROC AUC scores of the LSTM on the test set

plotrocauc(model)

In [None]:
# Predicted class = index of the highest softmax probability.
# Sequential.predict_classes() has been removed from tf.keras; taking the
# argmax over predict() is the equivalent for a softmax output layer.
ypred = np.argmax(model.predict(test_padded), axis=-1)
ypred

In [None]:
# plotting confusion matrix of the LSTM predictions on the test set

from sklearn.metrics import confusion_matrix
import seaborn as sns

# class names in label order (0-4), used for the predicted-class axis
label_meanings = ["Extremely Negative", "Negative", "Neutral", "Positive","Extremely Positive"]
conf_mat = confusion_matrix(testY, ypred)

# draw on an explicit Axes; the actual-class axis keeps its numeric tick labels
fig, ax = plt.subplots(figsize=(10,10))
ax.set_title('Confusion matrix for LSTM')
plot = sns.heatmap(conf_mat, annot=True, fmt='d', ax=ax)
plot.set_ylabel('Actual', fontsize=15)
plot.set_xlabel('Predicted', fontsize=15)
plot.set_xticklabels(label_meanings)
plt.show()

# TF-IDF vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# corpus is a list of all processed tweets (the 'Tweet_wostop' column),
# in the current row order of fulldata

corpus = fulldata.Tweet_wostop.to_list()
len(corpus)

In [None]:
# tfidf vectorizing the corpus
# Settings passed below: sublinear term frequency, terms must occur in at least
# 5 documents (min_df=5), l2-normalised rows, unigrams and bigrams, English stop words.
# NOTE: the vectorizer is fitted here on the whole corpus, i.e. before any train/test split.

vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2), stop_words='english')
x = vectorizer.fit_transform(corpus)

In [None]:
# splitting into train and test data

# Split the raw texts first, then fit TF-IDF on the training texts only.
# Fitting the vectorizer on the full corpus lets test-set vocabulary and
# document frequencies leak into the training features.
train_texts, test_texts, y_train, y_test = train_test_split(corpus, fulldata.Label, random_state = 0)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Multinomial Naive Bayes Model

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, trained on the TF-IDF features
nb = MultinomialNB()

# fitting the model on train data
model2 = nb.fit(X_train, y_train)

In [None]:
# generating predictions on test data
# NOTE: this rebinds `ypred`, which previously held the LSTM predictions
ypred = model2.predict(X_test)

In [None]:
# predicted labels of the Naive Bayes model for the test rows
ypred

In [None]:
# calculating accuracy and one-vs-rest ROC AUC for the model

acc = accuracy_score(y_test, ypred)
# Stored as roc_auc rather than auc: binding `auc` here would shadow
# sklearn.metrics.auc, which plotrocauc() relies on when it is re-run.
roc_auc = roc_auc_score(y_test, model2.predict_proba(X_test), multi_class='ovr')

# format at print time; rounding first and then multiplying by 100 can print
# float noise such as 56.99999999999999
print(f'Accuracy = {acc*100:.2f} %')
print(f'ROC_AUC_Score = {roc_auc:.2f}')

In [None]:
# plotting confusion matrix of the Naive Bayes predictions on the test set

from sklearn.metrics import confusion_matrix
import seaborn as sns

# class names in label order (0-4), used for the predicted-class axis
label_meanings = ["Extremely Negative", "Negative", "Neutral", "Positive","Extremely Positive"]
conf_mat = confusion_matrix(y_test, ypred)

# draw on an explicit Axes; the actual-class axis keeps its numeric tick labels
fig, ax = plt.subplots(figsize=(10,10))
ax.set_title('Confusion matrix for Multinomial Naive Bayes')
plot = sns.heatmap(conf_mat, annot=True, fmt='d', ax=ax)
plot.set_ylabel('Actual', fontsize=15)
plot.set_xlabel('Predicted', fontsize=15)
plot.set_xticklabels(label_meanings)
plt.show()

# Logistic Regression


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF features, liblinear solver, up to 500 iterations.
# NOTE(review): the `multi_class` argument appears to be deprecated in recent
# scikit-learn releases - confirm against the installed version.
log = LogisticRegression(max_iter=500, solver='liblinear', multi_class='auto')

# fitting the model to training data
logreg = log.fit(X_train, y_train)

In [None]:
# generating predictions on test data (the metrics are computed in the next cell)
logpred = logreg.predict(X_test)

In [None]:
# calculating accuracy and one-vs-rest ROC AUC for the model

acc = accuracy_score(y_test, logpred)
# Stored as roc_auc rather than auc: binding `auc` here would shadow
# sklearn.metrics.auc, which plotrocauc() relies on when it is re-run.
roc_auc = roc_auc_score(y_test, logreg.predict_proba(X_test), multi_class='ovr')

# format at print time; rounding first and then multiplying by 100 can print
# float noise such as 56.99999999999999
print(f'Accuracy = {acc*100:.2f} %')
print(f'ROC_AUC_Score = {roc_auc:.2f}')


In [None]:
# plotting confusion matrix of the logistic regression predictions on the test set

from sklearn.metrics import confusion_matrix
import seaborn as sns

# class names in label order (0-4), used for the predicted-class axis
label_meanings = ["Extremely Negative", "Negative", "Neutral", "Positive","Extremely Positive"]
conf_mat = confusion_matrix(y_test, logpred)

# draw on an explicit Axes; the actual-class axis keeps its numeric tick labels
fig, ax = plt.subplots(figsize=(10,10))
ax.set_title('Confusion matrix for Logistic Regression')
plot = sns.heatmap(conf_mat, annot=True, fmt='d', ax=ax)
plot.set_ylabel('Actual', fontsize=15)
plot.set_xlabel('Predicted', fontsize=15)
plot.set_xticklabels(label_meanings)
plt.show()