<a href="https://colab.research.google.com/github/damsoumya/my_NLP/blob/main/Twitter_Sentiment_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

In [7]:
from google.colab import files
uploaded = files.upload()

Saving train.csv to train (1).csv


In [8]:
import io
# The upload cell reported "Saving train.csv to train (1).csv", i.e. Colab renamed the
# file on disk. NOTE(review): depending on the google.colab version the returned dict may
# be keyed by the original or the renamed filename — so prefer the exact "train.csv" key
# and otherwise fall back to the first uploaded file.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and select train.csv")
df = pd.read_csv(
    io.BytesIO(uploaded['train.csv'] if 'train.csv' in uploaded else next(iter(uploaded.values())))
)

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# Class balance: how many tweets carry each target label.
print(df["target"].eq(1).sum()) # Disaster
print(df["target"].eq(0).sum()) # No Disaster

In [None]:
# Preprocessing
import re
import string

def remove_URL(text):
    """Return `text` with every URL stripped out.

    A URL here is anything that starts with ``http://``, ``https://`` or
    ``www.`` and runs up to the next whitespace character. Surrounding
    whitespace is left as it was.
    """
    return re.sub(r"https?://\S+|www\.\S+", r"", text)

# https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate/34294022
def remove_punct(text):
    """Return `text` with every character of ``string.punctuation`` deleted."""
    # A translation table that maps a character to None deletes that character.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

string.punctuation

In [None]:
# Sanity-check the URL regex on the first tweet that contains a link.
# The pattern is the same one remove_URL uses, so this preview reflects the real
# cleaning step. It has no capturing group, so findall() yields each complete URL
# (with a group, findall() would return only the group's text), and the `www.`
# alternative also matches links written without an http(s) scheme.
pattern = re.compile(r"https?://\S+|www\.\S+")
for t in df.text:
    matches = pattern.findall(t)
    for match in matches:
        print(t)                    # original tweet
        print(match)                # URL that was found
        print(pattern.sub(r"", t))  # tweet with all URLs removed
    if matches:
        # Only the first tweet containing a URL is needed for the demonstration.
        break

In [None]:
# Strip URLs first (the URL regex relies on "://" and "." still being present),
# then delete punctuation, and write the cleaned text back onto the frame.
df["text"] = df["text"].map(remove_URL).map(remove_punct)

In [10]:
!pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine
# has been programmed to ignore, both when indexing entries for searching and when retrieving them 
# as the result of a search query.
stop = set(stopwords.words("english"))

# https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
def remove_stopwords(text):
    """Lower-case `text` and drop every whitespace-separated token found in `stop`.

    The surviving tokens are re-joined with single spaces, so any original
    spacing between words is normalised.
    """
    lowered_tokens = (token.lower() for token in text.split())
    return " ".join(token for token in lowered_tokens if token not in stop)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [12]:
df["text"] = df.text.map(remove_stopwords)

In [13]:
from collections import Counter

# Count unique words
def counter_word(text_col):
    """Tally how often each whitespace-separated token occurs in a text column.

    Args:
        text_col: object whose ``.values`` attribute is an iterable of strings
            (called below with a pandas Series).

    Returns:
        collections.Counter mapping each token to its number of occurrences.
    """
    tally = Counter()
    for sentence in text_col.values:
        # Counter.update adds one count per token in the iterable.
        tally.update(sentence.split())
    return tally


counter = counter_word(df.text)

In [14]:
len(counter)

27833

In [15]:
counter

Counter({'deeds': 1,
         'reason': 20,
         '#earthquake': 19,
         'may': 87,
         'allah': 4,
         'forgive': 2,
         'us': 99,
         'forest': 54,
         'fire': 209,
         'near': 54,
         'la': 19,
         'ronge': 1,
         'sask.': 1,
         'canada': 6,
         'residents': 7,
         'asked': 9,
         "'shelter": 1,
         "place'": 1,
         'notified': 1,
         'officers.': 1,
         'evacuation': 47,
         'shelter': 5,
         'place': 22,
         'orders': 11,
         'expected': 12,
         '13,000': 1,
         'people': 174,
         'receive': 2,
         '#wildfires': 5,
         'california': 91,
         'got': 111,
         'sent': 13,
         'photo': 13,
         'ruby': 1,
         '#alaska': 2,
         'smoke': 44,
         'pours': 1,
         'school': 63,
         '#rockyfire': 4,
         'update': 14,
         '=>': 1,
         'hwy.': 4,
         '20': 13,
         'closed': 17,
         'd

In [16]:
counter.most_common(5)

[('-', 763), ('like', 341), ('&amp;', 295), ("i'm", 237), ('get', 227)]

In [17]:
num_unique_words = len(counter)

In [18]:
# Use the first 80% of the rows for training and the remainder for validation.
# NOTE(review): rows are taken in file order with no shuffling — confirm the CSV is
# not sorted in a way that makes the validation slice unrepresentative.
train_size = int(len(df) * 0.8)

train_df = df.iloc[:train_size]
val_df = df.iloc[train_size:]

# Pull the tweet text and the target label out as separate NumPy arrays.
train_sentences, train_labels = train_df["text"].to_numpy(), train_df["target"].to_numpy()
val_sentences, val_labels = val_df["text"].to_numpy(), val_df["target"].to_numpy()

In [19]:
train_sentences.shape, val_sentences.shape

((6090,), (1523,))

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer

# vectorize a text corpus by turning each text into a sequence of integers
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences) # fit only to training

In [21]:
word_index = tokenizer.word_index

In [22]:
word_index

{'t': 1,
 'co': 2,
 'http': 3,
 'https': 4,
 'amp': 5,
 'like': 6,
 'fire': 7,
 'get': 8,
 '2': 9,
 'via': 10,
 "i'm": 11,
 'new': 12,
 'people': 13,
 'news': 14,
 'emergency': 15,
 'one': 16,
 "'": 17,
 'video': 18,
 'disaster': 19,
 'body': 20,
 'burning': 21,
 'would': 22,
 'buildings': 23,
 'police': 24,
 'u': 25,
 'us': 26,
 '3': 27,
 'day': 28,
 'crash': 29,
 'first': 30,
 'man': 31,
 'still': 32,
 '1': 33,
 'got': 34,
 'know': 35,
 'california': 36,
 'two': 37,
 'back': 38,
 'time': 39,
 'going': 40,
 'full': 41,
 'accident': 42,
 '4': 43,
 'world': 44,
 'attack': 45,
 'see': 46,
 'nuclear': 47,
 'gt': 48,
 'love': 49,
 'rt': 50,
 'youtube': 51,
 'may': 52,
 'year': 53,
 'go': 54,
 'many': 55,
 'watch': 56,
 'collapse': 57,
 'dead': 58,
 'today': 59,
 '5': 60,
 'mass': 61,
 'car': 62,
 '2015': 63,
 'life': 64,
 'want': 65,
 'hiroshima': 66,
 'years': 67,
 'work': 68,
 'train': 69,
 'last': 70,
 'fires': 71,
 'best': 72,
 'good': 73,
 'think': 74,
 'families': 75,
 'way': 76,
 'w

In [24]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)

In [26]:
print(train_sentences[10:15])
print(train_sequences[10:15])

['three people died heat wave far'
 'haha south tampa getting flooded hah- wait second live south tampa gonna gonna fvck #flooding'
 "#raining #flooding #florida #tampabay #tampa 18 19 days. i've lost count"
 '#flood bago myanmar #we arrived bago'
 'damage school bus 80 multi car crash #breaking']
[[566, 13, 432, 149, 304, 455], [799, 472, 2371, 160, 2372, 2969, 567, 698, 191, 472, 2371, 213, 213, 5924, 133], [2970, 133, 1988, 5925, 2371, 850, 1730, 539, 305, 658, 2971], [107, 3920, 659, 2373, 1528, 3920], [114, 100, 367, 3921, 2374, 62, 29, 325]]


In [27]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Every sequence is forced to exactly this many word indices.
max_length = 20

# Shorter sequences get zeros appended, longer ones lose their tail ("post" for both),
# applied identically to the training and validation sequences.
train_padded, val_padded = (
    pad_sequences(sequences, maxlen=max_length, padding="post", truncating="post")
    for sequences in (train_sequences, val_sequences)
)
train_padded.shape, val_padded.shape

((6090, 20), (1523, 20))

In [28]:
train_padded[10]

array([566,  13, 432, 149, 304, 455,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0], dtype=int32)

In [29]:
print(train_sentences[10])
print(train_sequences[10])
print(train_padded[10])

three people died heat wave far
[566, 13, 432, 149, 304, 455]
[566  13 432 149 304 455   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]


In [30]:
# Invert the tokenizer vocabulary so an integer index can be mapped back to its word.
reverse_word_index = {idx: word for word, idx in word_index.items()}

In [31]:
reverse_word_index

{1: 't',
 2: 'co',
 3: 'http',
 4: 'https',
 5: 'amp',
 6: 'like',
 7: 'fire',
 8: 'get',
 9: '2',
 10: 'via',
 11: "i'm",
 12: 'new',
 13: 'people',
 14: 'news',
 15: 'emergency',
 16: 'one',
 17: "'",
 18: 'video',
 19: 'disaster',
 20: 'body',
 21: 'burning',
 22: 'would',
 23: 'buildings',
 24: 'police',
 25: 'u',
 26: 'us',
 27: '3',
 28: 'day',
 29: 'crash',
 30: 'first',
 31: 'man',
 32: 'still',
 33: '1',
 34: 'got',
 35: 'know',
 36: 'california',
 37: 'two',
 38: 'back',
 39: 'time',
 40: 'going',
 41: 'full',
 42: 'accident',
 43: '4',
 44: 'world',
 45: 'attack',
 46: 'see',
 47: 'nuclear',
 48: 'gt',
 49: 'love',
 50: 'rt',
 51: 'youtube',
 52: 'may',
 53: 'year',
 54: 'go',
 55: 'many',
 56: 'watch',
 57: 'collapse',
 58: 'dead',
 59: 'today',
 60: '5',
 61: 'mass',
 62: 'car',
 63: '2015',
 64: 'life',
 65: 'want',
 66: 'hiroshima',
 67: 'years',
 68: 'work',
 69: 'train',
 70: 'last',
 71: 'fires',
 72: 'best',
 73: 'good',
 74: 'think',
 75: 'families',
 76: 'way',
 77

In [32]:
def decode(sequence):
    """Translate a sequence of word indices back into a space-separated string.

    Each index is looked up in `reverse_word_index`; an index with no entry
    there is rendered as "?".
    """
    words = []
    for idx in sequence:
        words.append(reverse_word_index.get(idx, "?"))
    return " ".join(words)

In [33]:
decoded_text = decode(train_sequences[10])

print(train_sequences[10])
print(decoded_text)

[566, 13, 432, 149, 304, 455]
three people died heat wave far


In [34]:
from tensorflow.keras import layers

# Embedding: https://www.tensorflow.org/tutorials/text/word_embeddings
# Turns positive integers (indexes) into dense vectors of fixed size. (other approach could be one-hot-encoding)

# Word embeddings give us a way to use an efficient, dense representation in which similar words have 
# a similar encoding. Importantly, you do not have to specify this encoding by hand. An embedding is a 
# dense vector of floating point values (the length of the vector is a parameter you specify).

# Small recurrent classifier: embedding -> LSTM -> single sigmoid unit.
model = keras.models.Sequential(
    [
        # Maps each word index to a trainable 32-dimensional vector.
        # Input: integer matrix of shape (batch, max_length); every index must be
        # strictly smaller than num_unique_words (the layer's input_dim).
        # Output shape: (None, max_length, 32), where `None` is the batch dimension.
        layers.Embedding(num_unique_words, 32, input_length=max_length),
        # Reads the embedded sequence and returns one 64-dimensional summary vector.
        layers.LSTM(64, dropout=0.1),
        # One sigmoid output: a single value in (0, 1) per tweet.
        layers.Dense(1, activation="sigmoid"),
    ]
)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 32)            890656    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 915,553
Trainable params: 915,553
Non-trainable params: 0
_________________________________________________________________


In [35]:
# The model's final Dense layer already applies a sigmoid, so the loss receives
# probabilities rather than raw logits (from_logits=False).
loss = keras.losses.BinaryCrossentropy(from_logits=False)
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that newer
# Keras versions reject.
optim = keras.optimizers.Adam(learning_rate=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [36]:
# Train for 20 epochs, evaluating on the held-out validation split after each one.
# NOTE(review): in the recorded log val_loss rises after the first epoch while the
# training loss keeps falling — the model overfits; consider fewer epochs or early stopping.
model.fit(
    train_padded,
    train_labels,
    epochs=20,
    validation_data=(val_padded, val_labels),
    verbose=2,
)

Epoch 1/20
191/191 - 35s - loss: 0.5366 - accuracy: 0.7248 - val_loss: 0.5066 - val_accuracy: 0.7669
Epoch 2/20
191/191 - 2s - loss: 0.2827 - accuracy: 0.8905 - val_loss: 0.5228 - val_accuracy: 0.7827
Epoch 3/20
191/191 - 2s - loss: 0.1377 - accuracy: 0.9525 - val_loss: 0.6269 - val_accuracy: 0.7715
Epoch 4/20
191/191 - 2s - loss: 0.0854 - accuracy: 0.9757 - val_loss: 0.6897 - val_accuracy: 0.7649
Epoch 5/20
191/191 - 2s - loss: 0.0601 - accuracy: 0.9823 - val_loss: 0.7436 - val_accuracy: 0.7722
Epoch 6/20
191/191 - 2s - loss: 0.0449 - accuracy: 0.9880 - val_loss: 0.9164 - val_accuracy: 0.7498
Epoch 7/20
191/191 - 2s - loss: 0.0343 - accuracy: 0.9921 - val_loss: 0.9768 - val_accuracy: 0.7374
Epoch 8/20
191/191 - 2s - loss: 0.0344 - accuracy: 0.9906 - val_loss: 0.8843 - val_accuracy: 0.7617
Epoch 9/20
191/191 - 2s - loss: 0.0261 - accuracy: 0.9931 - val_loss: 1.2386 - val_accuracy: 0.7413
Epoch 10/20
191/191 - 2s - loss: 0.0226 - accuracy: 0.9934 - val_loss: 0.9611 - val_accuracy: 0.759

<tensorflow.python.keras.callbacks.History at 0x7f2330515050>

In [37]:
# Threshold the model's outputs on the training data at 0.5 and flatten them into a
# plain list of 0/1 integers.
predictions = (model.predict(train_padded) > 0.5).astype(int).ravel().tolist()

In [38]:
print(train_sentences[10:20])

print(train_labels[10:20])
print(predictions[10:20])

['three people died heat wave far'
 'haha south tampa getting flooded hah- wait second live south tampa gonna gonna fvck #flooding'
 "#raining #flooding #florida #tampabay #tampa 18 19 days. i've lost count"
 '#flood bago myanmar #we arrived bago'
 'damage school bus 80 multi car crash #breaking' "what's man?"
 'love fruits' 'summer lovely' 'car fast' 'goooooooaaaaaal!!!!!!']
[1 1 1 1 1 0 0 0 0 0]
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
