<a href="https://colab.research.google.com/github/aliakbarbadri/nlp-tf/blob/master/week3/week3-exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d kazanova/sentiment140
! unzip sentiment140.zip 

Downloading sentiment140.zip to /content
 90% 73.0M/80.9M [00:01<00:00, 57.5MB/s]
100% 80.9M/80.9M [00:01<00:00, 75.8MB/s]
Archive:  sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [28]:
! kaggle datasets download -d terenceliu4444/glove6b100dtxt
! unzip glove6b100dtxt.zip

Downloading glove6b100dtxt.zip to /content
 94% 124M/131M [00:03<00:00, 65.4MB/s]
100% 131M/131M [00:03<00:00, 40.8MB/s]
Archive:  glove6b100dtxt.zip
  inflating: glove.6B.100d.txt       


In [0]:
import json
import tensorflow as tf
import pandas as pd
import random
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers

In [0]:
# Notebook-wide settings.
embedding_dim = 100       # width of each embedding vector (glove.6B.100d)
max_length = 16           # sequence length declared on the Embedding layer
trunc_type = "post"       # 'post' = act on the end of a sequence
padding_type = "post"     # 'post' = act on the end of a sequence
oov_tok = "<OOV>"         # placeholder token string for out-of-vocabulary words
training_size = 160_000   # number of rows taken from the data set
test_portion = 0.1        # fraction of those rows held out for validation

In [5]:
# Read the header-less Sentiment140 dump as Latin-1, keeping only column 0
# (the label) and column 5 (the tweet text), then preview the first rows.
csv = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    encoding="latin-1",
    header=None,
    usecols=[0, 5],
)
csv.columns = ["label", "tweet"]
csv.head()

Unnamed: 0,label,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [6]:
# Show the dtype of each column of the loaded frame (label and tweet).
csv.dtypes

label     int64
tweet    object
dtype: object

In [7]:
# List the distinct label values present in the raw data.
csv['label'].unique()

array([0, 4])

In [8]:
# Recode label 4 as 1 so the labels are binary {0, 1}.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the column selection: the chained in-place form
# operates on an intermediate object and is not guaranteed to update `csv`
# (under pandas Copy-on-Write it leaves the frame unchanged).
csv['label'] = csv['label'].replace({4: 1})
pd.unique(csv['label'])

array([0, 1])

In [0]:
# `corpus` is a second name for the same DataFrame object (no copy is made);
# `num_sentences` is its row count.
corpus = csv
num_sentences = len(corpus)

In [13]:
# Sanity check: total number of rows, then the values of the row labelled 1.
print(len(csv))
print([*csv.loc[1]])

# Expected Output (row values follow the column order: label, tweet):
# 1600000
# [0, "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"]

1600000
[0, "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"]


In [19]:
# Shuffle every row before the first `training_size` rows are taken.
# A fixed seed makes the shuffle, and everything derived from it (vocabulary,
# train/validation split, training curves), repeatable across kernel restarts.
shuffle_seed = 42
csv = csv.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)
list(csv.loc[1])

[0, 'Not happy with cold weather out side ']

In [0]:
# Working set: the first `training_size` rows of the (shuffled) frame.
# Slicing before converting avoids materialising all rows as Python lists.
sentences = csv['tweet'].iloc[:training_size].tolist()
# Labels are kept as a NumPy array so they can be passed straight to model.fit.
labels = csv['label'].iloc[:training_size].to_numpy()

# Build the vocabulary from the working set. No oov_token is passed: the
# tokenizer is fitted on the same sentences it encodes, and the check cell
# expects word_index['i'] == 1.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
vocab_size = len(word_index)

# Encode and force every sequence to exactly `max_length` tokens, so the
# array width matches the input_length declared on the Embedding layer.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# The first `test_portion` of the rows is held out for validation.
split = int(test_portion * training_size)

test_sequences = padded[:split]
training_sequences = padded[split:]
test_labels = labels[:split]
training_labels = labels[split:]

In [26]:
# Split index, followed by the sizes of the full, training and test sets.
(split, *map(len, (sentences, training_sequences, test_sequences)))

(16000, 160000, 144000, 16000)

In [25]:
# Vocabulary size and the index assigned to the token 'i', one per line.
print(vocab_size, word_index['i'], sep="\n")
# Expected Output
# 138737
# 1

138737
1


In [0]:
# Parse the GloVe text file: each line is a token followed by its vector
# components. The encoding is given explicitly so the read does not depend on
# the platform's default text encoding.
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# One row per tokenizer index. Row 0 (never assigned below) and rows for words
# that have no GloVe vector stay all-zero.
embeddings_matrix = np.zeros((vocab_size + 1, embedding_dim))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector

In [31]:
# Number of rows in the embedding matrix (vocab_size + 1).
print(embeddings_matrix.shape[0])
# Expected Output
# 138738

138738


In [0]:
# Binary sentiment classifier:
#   frozen pre-trained embedding -> dropout -> 1D convolution -> max pooling
#   -> LSTM -> single sigmoid unit.
# The embedding weights come from `embeddings_matrix` and are not trained.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length, weights=[embeddings_matrix], trainable=False),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
# Labels are 0/1, so use binary cross-entropy. The 'accuracy' metric is
# required because the plotting cell reads history.history['accuracy'] and
# ['val_accuracy'].
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

num_epochs = 50
# np.asarray is a no-op for arrays and converts plain Python lists, which
# model.fit does not accept as targets on every TensorFlow 2.x release.
history = model.fit(
    training_sequences,
    np.asarray(training_labels),
    epochs=num_epochs,
    validation_data=(test_sequences, np.asarray(test_labels)),
    verbose=2,
)

print("Training Complete")

In [0]:
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit for the training and validation data.
recorded = history.history
acc = recorded['accuracy']
val_acc = recorded['val_accuracy']
loss = recorded['loss']
val_loss = recorded['val_loss']

# One x position per completed epoch.
epochs = range(len(acc))

# Each entry: (training curve, validation curve, title, y-axis label, legend).
panels = (
    (acc, val_acc, 'Training and validation accuracy', "Accuracy",
     ["Accuracy", "Validation Accuracy"]),
    (loss, val_loss, 'Training and validation loss', "Loss",
     ["Loss", "Validation Loss"]),
)

for train_curve, val_curve, title, y_label, legend_labels in panels:
    # Training curve in red, validation curve in blue.
    plt.plot(epochs, train_curve, 'r')
    plt.plot(epochs, val_curve, 'b')
    plt.title(title)
    plt.xlabel("Epochs")
    plt.ylabel(y_label)
    plt.legend(legend_labels)

    # Open a fresh figure so the next plot does not draw over this one.
    plt.figure()


# Expected Output
# A chart where the validation loss does not increase sharply!