In [None]:
import pandas as pd
import numpy as np

import os
import re

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

import keras
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks

from keras.preprocessing import text, sequence

from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D

from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import Callback

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# Show which Keras version this kernel has loaded.
print(keras.__version__)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [None]:
# Load dataset
def load_data(path="/content/drive/MyDrive/Colab Notebooks/Data/tweet_covid_processing_file2.csv"):
    """Read the tweet CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Defaults to the Google Drive location this notebook
        was written against, so a bare ``load_data()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        The parsed file, read with pandas' ``python`` parser engine.
    """
    return pd.read_csv(path, engine='python')

In [None]:
# Pull the dataset into memory, then preview its first two rows.
tweet_df = load_data()
tweet_df.iloc[:2]

In [None]:
# Discard the listed columns. Rebinding the result (instead of inplace=True)
# together with errors='ignore' makes this cell safe to re-run: a second
# execution no longer raises KeyError for columns that are already gone.
unused_columns = ['Unnamed: 0', 'OriginalTweet', 'Sentiment', 'tweet_token', 'tweet_token_filtered']
tweet_df = tweet_df.drop(columns=unused_columns, errors='ignore')

In [None]:
# Preview the first two rows of the frame.
tweet_df.iloc[:2]

In [None]:
# Normalise the lemmatized tweet text to lower case.
lowered_tweets = tweet_df['tweet_lemmatized'].str.lower()
tweet_df['tweet_lemmatized'] = lowered_tweets

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

In [None]:
# Model input (lemmatized tweet text) and target (label column).
X, y = tweet_df['tweet_lemmatized'], tweet_df['label']

In [None]:
# Shapes of the inputs and the labels, shown as a pair.
tuple(part.shape for part in (X, y))

In [None]:
# Vocabulary cap handed to the tokenizer and to the Embedding layers.
#   50328   -> value used for the GloVe experiment
#   2000001 -> value used for the fastText experiment
max_features = 2000001

# Every tweet is padded / truncated to this many token ids.
max_len = 100

# Embedding width used by the embedding-matrix cell and the functional model.
embed_size = 300

# Symbolic input consumed by the functional-API model further down.
sequence_input = Input(shape=(max_len,))

In [None]:
# Learn the vocabulary from the tweets, then turn every tweet into a
# fixed-length (max_len) row of integer token ids.
tokenizer = text.Tokenizer(lower=True, num_words=max_features)
tokenizer.fit_on_texts(X)

token_ids = tokenizer.texts_to_sequences(X)
X = sequence.pad_sequences(token_ids, maxlen=max_len)


In [None]:
# Balance the classes by synthesising minority-class samples with SMOTE.
# random_state pins the synthetic samples so Restart & Run All reproduces them.
# NOTE(review): resampling happens before the train/test split further down,
# so synthetic points built from the same neighbours can end up on both sides
# of the split; consider splitting first and resampling only the training part.
oversample = SMOTE(random_state=42)
X, y = oversample.fit_resample(X, y)

In [None]:
from matplotlib import pyplot

# Per-class sample count and its share of the whole label vector.
counter = Counter(y)
total = len(y)
for label, count in counter.items():
    share = count / total * 100
    print('Class=%d, n=%d (%.3f%%)' % (label, count, share))

# Bar chart of the class distribution.
pyplot.bar(counter.keys(), counter.values())
pyplot.show()

In [None]:
# Shapes of the inputs and the labels, shown as a pair.
tuple(part.shape for part in (X, y))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. stratify=y keeps the class
# proportions identical in both parts instead of leaving them to chance;
# random_state keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

In [None]:
# Location of the GloVe vectors file (glove.6B.100d.txt) under /content/drive.
glove_file = '/content/drive/My Drive/Twiter Sentiment Analysis/Data/glove.6B.100d.txt'

In [None]:
# Map every word in the GloVe file to its float32 coefficient vector.
# Each line has the form "<word> <c1> <c2> ...", separated by single spaces.
glove_6B_100d_index = {}
with open(glove_file, encoding='utf8') as glove_handle:
    for raw_line in glove_handle:
        token, *coefficients = raw_line.rstrip().split(' ')
        glove_6B_100d_index[token] = np.asarray(coefficients, dtype='float32')

In [None]:
# Prepare the embedding matrix: row i holds the GloVe vector of the word the
# tokenizer encodes as i.
word_index = tokenizer.word_index

# Take the vector width from the loaded vectors rather than from `embed_size`:
# when the two differ, assigning a vector into a row of the wrong width fails
# with a broadcasting error. `embed_size` is only the fallback for an empty index.
sample_vector = next(iter(glove_6B_100d_index.values()), None)
embedding_dim = embed_size if sample_vector is None else len(sample_vector)

num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= num_words:
        # Token id falls outside the vocabulary cap, so it has no row.
        continue
    embedding_vector = glove_6B_100d_index.get(word)
    # Words missing from the index (or with a malformed vector) keep their
    # all-zero row.
    if embedding_vector is not None and len(embedding_vector) == embedding_dim:
        embedding_matrix[i] = embedding_vector

In [None]:
# (rows, vector width) of the embedding matrix.
np.shape(embedding_matrix)

In [None]:
# LSTM Implementation
from tensorflow.keras.layers import Embedding,SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from keras.layers.convolutional import Conv1D  
from keras.optimizers import SGD
from keras.optimizers import Adam
from tensorflow.keras import optimizers

from keras.models import Sequential
from keras.layers import Embedding,Bidirectional, Dense,Dropout
from keras.layers import Dense, Activation, Flatten
from keras import layers
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.layers import Dropout

In [None]:
# Bidirectional LSTM classifier on top of the pretrained embedding matrix.
# The Embedding layer is sized from embedding_matrix itself; the previous
# hard-coded (max_features, 100) only accepted the weights when it happened
# to equal the matrix shape.
vocab_rows, vector_dim = embedding_matrix.shape

model2 = Sequential()
model2.add(Embedding(vocab_rows, vector_dim, weights=[embedding_matrix], trainable=True))
model2.add(SpatialDropout1D(0.4))
model2.add(Bidirectional(LSTM(128)))
model2.add(Dropout(0.2))
# Five output units with softmax: one probability per class.
model2.add(Dense(5, activation='softmax'))

# `learning_rate` replaces the deprecated `lr` alias. decay=0.0 requested no
# decay, which is the default, so the argument is simply omitted.
adam = optimizers.Adam(learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model2.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

In [None]:
# Train for 5 epochs in mini-batches of 128, scoring the held-out split after
# every epoch.
history = model2.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=128,
)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding,Bidirectional, Dense,Dropout
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.layers import Dropout

# 5-fold stratified cross-validation of the BiLSTM classifier.
# random_state makes the fold assignment reproducible across runs.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cvscores = []  # held-out accuracy (in percent) of every fold

# Plain arrays, so the positional fold indices select rows regardless of
# whether X / y are NumPy arrays or pandas objects.
X_all = np.asarray(X)
y_all = np.asarray(y)
vocab_rows, vector_dim = embedding_matrix.shape

for fold_number, (train, test) in enumerate(kfold.split(X_all, y_all), start=1):
    ## Creating model (a fresh one per fold so no weights carry over)
    model2 = Sequential()
    model2.add(Embedding(vocab_rows, vector_dim, weights=[embedding_matrix], trainable=True))
    model2.add(SpatialDropout1D(0.4))
    model2.add(Bidirectional(LSTM(128)))
    model2.add(Dropout(0.2))
    model2.add(Dense(5, activation='softmax'))
    # `learning_rate` replaces the deprecated `lr`; decay=0.0 was the default.
    adam = optimizers.Adam(learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model2.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    # Fit on the training folds and validate on this fold's held-out rows.
    # Validating on X_test, as before, scored rows that are also part of the
    # training folds, because X_test was itself split off from X.
    history = model2.fit(
        X_all[train],
        y_all[train],
        validation_data=(X_all[test], y_all[test]),
        epochs=10,
        batch_size=128,
    )

    # Record the fold's held-out accuracy; cvscores was previously never filled.
    fold_loss, fold_accuracy = model2.evaluate(X_all[test], y_all[test], verbose=0)
    cvscores.append(fold_accuracy * 100)
    print('Fold %d accuracy: %.2f%%' % (fold_number, cvscores[-1]))

print('CV accuracy: %.2f%% (+/- %.2f%%)' % (np.mean(cvscores), np.std(cvscores)))

In [None]:
# Per-epoch metrics of the most recent fit, one row per epoch.
hist_df = pd.DataFrame(data=history.history)
hist_df.head(5)

In [None]:
import matplotlib.pyplot as plt


def _draw_history_curves(curves, title, legend_loc):
    """Plot the given (hist_df column, legend label) pairs on one figure and show it."""
    for column, label in curves:
        plt.plot(hist_df[column], label=label)
    plt.title(title)
    plt.ylabel('Value')
    plt.xlabel('Epoch')
    plt.legend(loc=legend_loc)
    plt.show()


# Start from a clean current figure before drawing the accuracy curves.
plt.clf()
_draw_history_curves(
    [('val_accuracy', 'Validation Accuracy'), ('accuracy', 'Training Accuracy')],
    'Training and Validation Accuracy with Glove Pretrained Model',
    "lower right",
)

# Loss curves go on their own figure.
_draw_history_curves(
    [('loss', 'Training Loss'), ('val_loss', 'Validation Loss')],
    'Training and Validation Loss with Glove Pretrained Model',
    "upper right",
)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Sequential.predict_classes has been removed from Keras; the predicted class
# is the arg-max over the per-class softmax outputs.
y_pred = np.argmax(model2.predict(X_test), axis=-1)

# Only the last expression of a cell is displayed, so the confusion matrix
# has to be printed explicitly (it was previously computed and discarded).
print(confusion_matrix(y_test, y_pred))
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, classification_report

# Text report of the classification metrics for the held-out split.
report = classification_report(y_test, y_pred)
print(report)

FastText Implementation

In [None]:
from urllib.request import urlopen
import gzip


In [None]:
# Stream the gzipped fastText vectors straight from the download URL;
# iterating `file` yields the decompressed lines as bytes.
fasttext_url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz'
file = gzip.open(urlopen(fasttext_url))

In [None]:
# Parse the fastText .vec stream into {word: float32 vector}.
vocab_and_vectors = {}
try:
    for line_number, line in enumerate(file):
        values = line.split()
        if not values:
            # Blank line: nothing to parse.
            continue
        if line_number == 0 and len(values) == 2:
            # A .vec file starts with a "<word count> <dimension>" header;
            # it is not a word vector and must not enter the vocabulary.
            continue
        word = values[0].decode('utf-8')
        vector = np.asarray(values[1:], dtype='float32')
        vocab_and_vectors[word] = vector
finally:
    # The stream is consumed at this point; release the HTTP/gzip handles.
    file.close()

In [None]:
# Build the embedding matrix so that row i holds the fastText vector of the
# word the tokenizer encodes as i. Enumerating the fastText vocabulary, as
# before, put vectors in rows unrelated to the token ids stored in X, so the
# Embedding layer looked up the wrong word for every token.
fasttext_dim = 300

# max_features rows keeps the matrix compatible with layers declared as
# Embedding(max_features, 300, ...).
embedding_matrix = np.zeros((max_features, fasttext_dim))
for word, i in tokenizer.word_index.items():
    if i >= max_features:
        # Token id falls outside the vocabulary cap, so it has no row.
        continue
    embedding_vector = vocab_and_vectors.get(word)
    # Unknown words and malformed entries keep their all-zero row.
    if embedding_vector is not None and len(embedding_vector) == fasttext_dim:
        embedding_matrix[i] = embedding_vector

In [None]:
# Bidirectional LSTM classifier on top of the fastText embedding matrix.
# The Embedding layer is sized from embedding_matrix itself, so the pretrained
# weights always fit regardless of the current max_features setting.
vocab_rows, vector_dim = embedding_matrix.shape

model5 = Sequential()
model5.add(Embedding(vocab_rows, vector_dim, weights=[embedding_matrix], trainable=True))
model5.add(SpatialDropout1D(0.4))
model5.add(Bidirectional(LSTM(128)))
model5.add(Dropout(0.2))
# Five output units with softmax: one probability per class.
model5.add(Dense(5, activation='softmax'))

# `learning_rate` replaces the deprecated `lr` alias. decay=0.0 requested no
# decay, which is the default, so the argument is simply omitted.
adam = optimizers.Adam(learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model5.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

In [None]:
# Removed: `[1]*10**10` tried to build a ten-billion-element list, which
# exhausts memory (MemoryError or a killed kernel) and wipes every variable
# defined above, so Restart & Run All could never get past this cell.

In [None]:
# Functional-API model: pretrained embeddings -> BiGRU -> Conv1D ->
# concatenated global average/max pooling -> 5-way classifier.

# Size the Embedding layer from the matrix itself so the pretrained weights
# always fit, instead of relying on max_features / embed_size matching it.
vocab_rows, vector_dim = embedding_matrix.shape

layer = Embedding(vocab_rows,
                  vector_dim,
                  weights=[embedding_matrix],
                  trainable=True
                  )(sequence_input)

layer = SpatialDropout1D(0.2)(layer)

# return_sequences=True keeps one output per time step for the convolution.
layer = Bidirectional(GRU(128,
                          return_sequences=True,
                          dropout=0.2,
                          recurrent_dropout=0.2)
                      )(layer)

layer = Conv1D(64, kernel_size=3,
               padding="valid",
               kernel_initializer="glorot_uniform"
               )(layer)

# Collapse the time axis two ways and use both summaries as features.
avg_pool = GlobalAveragePooling1D()(layer)
max_pool = GlobalMaxPooling1D()(layer)
layer = concatenate([avg_pool, max_pool])

# sparse_categorical_crossentropy expects one probability distribution over
# the 5 classes per sample. softmax provides that; sigmoid scored every unit
# independently, so the outputs did not sum to 1.
preds = Dense(5, activation="softmax")(layer)

model = Model(sequence_input, preds)

# `learning_rate` replaces the deprecated `lr` alias.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(learning_rate=1e-3),
              metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of `model`.
model.summary()

In [None]:
# Train for 5 epochs in mini-batches of 128, scoring the held-out split after
# every epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=128,
)

In [None]:
# Per-epoch metrics of the most recent fit, one row per epoch.
hist_df = pd.DataFrame(data=history.history)
hist_df.head(5)

In [None]:
import matplotlib.pyplot as plt

# Start from a clean current figure, then draw both accuracy curves.
plt.clf()
accuracy_curves = (
    ('val_accuracy', 'Validation Accuracy'),
    ('accuracy', 'Training Accuracy'),
)
for column, label in accuracy_curves:
    plt.plot(hist_df[column], label=label)

# Dashed red vertical marker at epoch index 4.
plt.axvline(x=4, linewidth=1, color='r', linestyle="--")

plt.title('Multilabel Classification Training and Validation')
plt.ylabel('Value')
plt.xlabel('Epoch')
plt.legend(loc="lower right")
plt.show()