<a href="https://colab.research.google.com/github/adityamishra5050/Coronavirus-tweets-NLP---Text-Classification-/blob/main/NLU_ass_1_task4(a).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

import re
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns


# **Task 1 and 2**

In [None]:
# Read the train and test CSVs (decoded as latin-1) into two DataFrames.
Corona_NLP_train, Corona_NLP_test = (
    pd.read_csv(csv_path, encoding='latin-1')
    for csv_path in (
        '/content/drive/MyDrive/Corona_NLP_train.csv',
        '/content/drive/MyDrive/Corona_NLP_test.csv',
    )
)

In [None]:
# Pull out independent copies of the tweet text (model input) and the
# sentiment column (target) for each split, leaving the raw frames untouched.
Corona_train_data, Corona_train_labels = (
    Corona_NLP_train[column].copy() for column in ('OriginalTweet', 'Sentiment')
)
Corona_test_data, Corona_test_labels = (
    Corona_NLP_test[column].copy() for column in ('OriginalTweet', 'Sentiment')
)

In [None]:
# Collapse the five sentiment categories into three integer classes:
# 0 = negative, 1 = neutral, 2 = positive.
labels_replcement = {
    'Extremely Negative': 0,
    'Negative': 0,
    'Neutral': 1,
    'Positive': 2,
    'Extremely Positive': 2
}


def _encode_sentiment(labels):
    """Map sentiment names to integer class ids and return an int64 Series.

    Values that are not keys of ``labels_replcement`` are passed through
    unchanged, so re-running the cell on already-encoded labels is a no-op.
    The explicit ``astype('int64')`` replaces the implicit object -> int
    down-cast that ``Series.replace`` used to perform (deprecated in newer
    pandas) and raises if a label could not be converted to an integer.
    """
    encoded = labels.map(lambda label: labels_replcement.get(label, label))
    return encoded.astype('int64')


Corona_train_labels = _encode_sentiment(Corona_train_labels)
Corona_test_labels = _encode_sentiment(Corona_test_labels)

In [None]:
# Preview the raw training tweets before any cleaning is applied.
Corona_train_data

0        @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...
1        advice Talk to your neighbours family to excha...
2        Coronavirus Australia: Woolworths to give elde...
3        My food stock is not the only one which is emp...
4        Me, ready to go at supermarket during the #COV...
                               ...                        
41152    Airline pilots offering to stock supermarket s...
41153    Response to complaint not provided citing COVI...
41154    You know itÂs getting tough when @KameronWild...
41155    Is it wrong that the smell of hand sanitizer i...
41156    @TartiiCat Well new/used Rift S are going for ...
Name: OriginalTweet, Length: 41157, dtype: object

In [None]:
# Download NLTK's stop-word corpus; `stopwords.words('english')` reads from it.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Stored as a set: process_tweet tests membership once per token of every
# tweet, and a set answers that in constant time instead of scanning a list.
stop_words = set(stopwords.words('english'))

def process_tweet(tweet, stopword_list=None):
    """Strip noise from a raw tweet and drop stop words.

    Removes, in this order: URLs, HTML tags, hashtags, @-mentions and digit
    runs. The remaining text is split on whitespace and every token whose
    lower-cased form is a stop word is discarded.

    Args:
        tweet: Raw tweet text. Any non-string value (for example ``None`` or a
            NaN coming from an empty cell) is treated as an empty tweet.
        stopword_list: Optional collection of stop words, assumed to be
            lower-case. When omitted, the module-level ``stop_words`` is used.

    Returns:
        The cleaned tweet as a single-space-joined string (possibly empty).
    """
    if not isinstance(tweet, str):
        return ""

    vocabulary = stop_words if stopword_list is None else stopword_list

    # remove urls
    tweet = re.sub(r'http\S+', ' ', tweet)

    # remove html tags
    tweet = re.sub(r'<.*?>', ' ', tweet)

    # remove hashtags and mentions BEFORE digits: stripping digits first would
    # split handles such as "@bob123x" or "#2019nCoV" and leak their tails
    tweet = re.sub(r'#\w+', ' ', tweet)
    tweet = re.sub(r'@\w+', ' ', tweet)

    # remove digits
    tweet = re.sub(r'\d+', ' ', tweet)

    # removing stop words; compare case-insensitively so that capitalised
    # forms ("My", "Is", "You") are dropped as well
    return " ".join(
        word for word in tweet.split() if word.lower() not in vocabulary
    )

# Function taken from @Shahraiz's wonderful notebook

In [None]:
# Run process_tweet over every tweet of both splits, element by element.
Corona_train_data = Corona_train_data.map(process_tweet)
Corona_test_data = Corona_test_data.map(process_tweet)

In [None]:
# Inspect the training tweets after cleaning.
Corona_train_data

0                                                         
1        advice Talk neighbours family exchange phone n...
2        Coronavirus Australia: Woolworths give elderly...
3        My food stock one empty... PLEASE, panic, THER...
4        Me, ready go supermarket outbreak. Not I'm par...
                               ...                        
41152    Airline pilots offering stock supermarket shel...
41153    Response complaint provided citing COVID- rela...
41154    You know itÂs getting tough rationing toilet ...
41155      Is wrong smell hand sanitizer starting turn on?
41156    Well new/used Rift S going $ . Amazon rn altho...
Name: OriginalTweet, Length: 41157, dtype: object

In [None]:
# Longest training tweet measured in TOKENS, split with the same Keras helper
# (and default filters) that Tokenizer uses. `len(tweet)` would count
# characters instead, which over-sizes `maxlen` in pad_sequences and the model
# input, so every sequence would be mostly padding.
largest_seq_size = int(
    Corona_train_data.apply(
        lambda tweet: len(tf.keras.preprocessing.text.text_to_word_sequence(tweet))
    ).max()
)

In [None]:
# Learn the word -> integer index mapping from the training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(Corona_train_data)

# One more than the number of distinct words; passed to the embedding layer
# as its input_dim.
spell_size = 1 + len(tokenizer.word_index)

# Encode each split as integer sequences, then pad at the end ('post') so
# every row has exactly largest_seq_size entries.
Corona_train_data = pad_sequences(
    tokenizer.texts_to_sequences(Corona_train_data),
    maxlen=largest_seq_size,
    padding='post',
)
Corona_test_data = pad_sequences(
    tokenizer.texts_to_sequences(Corona_test_data),
    maxlen=largest_seq_size,
    padding='post',
)

In [None]:
# Report the vocabulary size and the padded sequence length fed to the model.
print("Vocab length:", spell_size)
print("Max sequence length:", largest_seq_size)

Vocab length: 36117
Max sequence length: 286


In [None]:
# Sanity check on the padded training matrix: (number of tweets, largest_seq_size).
Corona_train_data.shape

(41157, 286)

# **RNN Model**

In [None]:
embed_dim = 16

# Functional-API classifier:
# embedding -> bidirectional GRU (256 units per direction) -> max over time
# -> dropout -> dense -> dropout -> 3-way softmax.
inputs = tf.keras.Input(shape=(largest_seq_size,), name='input_layer')

# `input_length` is intentionally not passed: the sequence length is already
# fixed by `inputs`, and the argument is deprecated in newer Keras releases.
embedding = tf.keras.layers.Embedding(
    input_dim=spell_size,
    output_dim=embed_dim,
    name='word_embedding'
)(inputs)

# return_sequences=True keeps one output per time step for the pooling layer.
gru_layer = tf.keras.layers.Bidirectional(
    tf.keras.layers.GRU(units=256, return_sequences=True, name='gru_layer'),
    name='bidirectional_layer'
)(embedding)

max_pooling = tf.keras.layers.GlobalMaxPool1D(name='max_pooling')(gru_layer)

dropout_1 = tf.keras.layers.Dropout(0.4, name='dropout_1')(max_pooling)

dense = tf.keras.layers.Dense(64, activation='relu', name='dense')(dropout_1)

dropout_2 = tf.keras.layers.Dropout(0.4, name='dropout_2')(dense)

# One probability per sentiment class (0, 1, 2).
outputs = tf.keras.layers.Dense(3, activation='softmax', name='output_layer')(dropout_2)


model_256 = tf.keras.Model(inputs=inputs, outputs=outputs)

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line).
model_256.summary()



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 286)]             0         
                                                                 
 word_embedding (Embedding)  (None, 286, 16)           577872    
                                                                 
 bidirectional_layer (Bidire  (None, 286, 512)         420864    
 ctional)                                                        
                                                                 
 max_pooling (GlobalMaxPooli  (None, 512)              0         
 ng1D)                                                           
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense (Dense)               (None, 64)                32832 

In [None]:
# Training hyper-parameters for this run.
batch_size = 32
epochs = 3

# Labels are integer class ids, hence the sparse categorical cross-entropy.
model_256.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# 12% of the training rows are held out for validation; verbose=2 logs one
# line per epoch.
history = model_256.fit(
    x=Corona_train_data,
    y=Corona_train_labels,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.12,
    verbose=2,
)

Epoch 1/3
1132/1132 - 72s - loss: 0.6270 - accuracy: 0.7378 - val_loss: 0.3895 - val_accuracy: 0.8668 - 72s/epoch - 63ms/step
Epoch 2/3
1132/1132 - 36s - loss: 0.3234 - accuracy: 0.8945 - val_loss: 0.3712 - val_accuracy: 0.8747 - 36s/epoch - 32ms/step
Epoch 3/3
1132/1132 - 37s - loss: 0.2221 - accuracy: 0.9306 - val_loss: 0.4349 - val_accuracy: 0.8629 - 37s/epoch - 33ms/step


In [None]:
# Training vs. validation loss per epoch for model_256.
fig = px.line(history.history,
              y=['loss', 'val_loss'],
              height=400,
              width=800,
              labels={'index': "epoch", 'value': "loss"})
fig.show()

In [None]:
# Training vs. validation accuracy per epoch for model_256.
fig = px.line(history.history,
              y=['accuracy', 'val_accuracy'],
              height=400,
              width=800,
              labels={'index': 'epoch', 'value': 'accuracy'})
fig.show()

In [None]:
# Score model_256 on the test split; the result is [loss, accuracy].
model_256.evaluate(x=Corona_test_data, y=Corona_test_labels)



[0.49179136753082275, 0.8399157524108887]

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# `evaluate` returns [loss, accuracy], not predictions. Use `predict` and take
# the arg-max over the softmax outputs to get one class id per test tweet.
pred_probs = model_256.predict(Corona_test_data)
preds = np.argmax(pred_probs, axis=1)

# Per-class precision / recall / F1 on the test split.
print(classification_report(Corona_test_labels, preds))



In [None]:
embed_dim = 16

# Same architecture as model_256 but with 64 GRU units per direction:
# embedding -> bidirectional GRU -> max over time -> dropout -> dense
# -> dropout -> 3-way softmax.
inputs1 = tf.keras.Input(shape=(largest_seq_size,), name='input_layer')

# `input_length` is intentionally not passed: the sequence length is already
# fixed by `inputs1`, and the argument is deprecated in newer Keras releases.
embedding1 = tf.keras.layers.Embedding(
    input_dim=spell_size,
    output_dim=embed_dim,
    name='word_embedding'
)(inputs1)

# return_sequences=True keeps one output per time step for the pooling layer.
gru_layer = tf.keras.layers.Bidirectional(
    tf.keras.layers.GRU(units=64, return_sequences=True, name='gru_layer'),
    name='bidirectional_layer'
)(embedding1)

max_pooling = tf.keras.layers.GlobalMaxPool1D(name='max_pooling')(gru_layer)

dropout_1 = tf.keras.layers.Dropout(0.4, name='dropout_1')(max_pooling)

dense = tf.keras.layers.Dense(64, activation='relu', name='dense')(dropout_1)

dropout_2 = tf.keras.layers.Dropout(0.4, name='dropout_2')(dense)

# One probability per sentiment class (0, 1, 2).
outputs1 = tf.keras.layers.Dense(3, activation='softmax', name='output_layer')(dropout_2)


model_64 = tf.keras.Model(inputs=inputs1, outputs=outputs1)

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line).
model_64.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 286)]             0         
                                                                 
 word_embedding (Embedding)  (None, 286, 16)           577872    
                                                                 
 bidirectional_layer (Bidire  (None, 286, 128)         31488     
 ctional)                                                        
                                                                 
 max_pooling (GlobalMaxPooli  (None, 128)              0         
 ng1D)                                                           
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 64)                8256

In [None]:
# Training hyper-parameters for this run.
batch_size = 32
epochs = 3

# Labels are integer class ids, hence the sparse categorical cross-entropy.
model_64.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# 12% of the training rows are held out for validation; verbose=2 logs one
# line per epoch.
history1 = model_64.fit(
    x=Corona_train_data,
    y=Corona_train_labels,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.12,
    verbose=2,
)

Epoch 1/3
1132/1132 - 46s - loss: 0.6344 - accuracy: 0.7309 - val_loss: 0.3755 - val_accuracy: 0.8743 - 46s/epoch - 41ms/step
Epoch 2/3
1132/1132 - 24s - loss: 0.3205 - accuracy: 0.8952 - val_loss: 0.3704 - val_accuracy: 0.8763 - 24s/epoch - 21ms/step
Epoch 3/3
1132/1132 - 21s - loss: 0.2221 - accuracy: 0.9313 - val_loss: 0.4261 - val_accuracy: 0.8589 - 21s/epoch - 19ms/step


In [None]:
# Training vs. validation loss per epoch for model_64.
fig = px.line(history1.history,
              y=['loss', 'val_loss'],
              height=400,
              width=800,
              labels={'index': "epoch", 'value': "loss"})
fig.show()

In [None]:
# Training vs. validation accuracy per epoch for model_64.
fig = px.line(history1.history,
              y=['accuracy', 'val_accuracy'],
              height=400,
              width=800,
              labels={'index': 'epoch', 'value': 'accuracy'})
fig.show()

In [None]:
# Score model_64 on the test split; the result is [loss, accuracy].
model_64.evaluate(x=Corona_test_data, y=Corona_test_labels)



[0.4841078519821167, 0.8420221209526062]

In [None]:
from sklearn.metrics import confusion_matrix, classification_report