Accessing Sentiment140 Dataset from Google Drive

In [0]:
# DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
#from sklearn.manifold import TSNE
#from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
import keras
#from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences
#from keras.models import Sequential
#from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
#from keras import utils
#from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
#from nltk.corpus import stopwords
#from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools


Using TensorFlow backend.


In [0]:
# Mount Google Drive (where the Sentiment140 CSV is kept) and list the dataset
# folder as a quick check that the mount succeeded.
from google.colab import drive
drive.mount('/content/drive')
# Absolute path: the previous "../content/..." form only resolved when the
# working directory happened to be a direct child of "/".
print(os.listdir("/content/drive/My Drive/sentiment140"))

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
['training.1600000.processed.noemoticon.csv']


In [0]:
# Download the NLTK "stopwords" corpus; it is read later via nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Load the dataset into a DataFrame and split it into training and test sets

In [0]:
# Column names are supplied explicitly via `names=` when reading the CSV.
columns = ["target", "ids", "date", "flag", "user", "text"]
# Absolute path to the file on the mounted Drive, so the read does not depend
# on the notebook's current working directory.
dataset = pd.read_csv(
    '/content/drive/My Drive/sentiment140/training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    names=columns,
    engine='python',
)
# 80/20 train/test split; random_state=457 fixes the shuffle so the split is reproducible.
train_set, test_set = train_test_split(dataset, test_size=0.2, random_state=457)

In [0]:
# Report how many rows ended up in each split.
print('Training Dataset Size: ', train_set.shape[0])
print('Test Dataset Size: ', test_set.shape[0])

NameError: ignored

In [0]:
%%time
# Numeric target code -> human-readable sentiment label.
decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
def decode_sentiment(label):
    """Translate a numeric target code into its sentiment label.

    Args:
        label: A target code (0, 2 or 4; anything `int()` accepts), or a label
            that has already been decoded.

    Returns:
        The matching string from `decode_map`. Labels that are already decoded
        are returned unchanged, so re-running the decoding cell on converted
        data is harmless instead of failing inside `int()`.

    Raises:
        KeyError: If the numeric code is not a key of `decode_map`.
        ValueError: If `label` is neither a decoded label nor convertible to int.
    """
    if isinstance(label, str) and label in decode_map.values():
        return label
    return decode_map[int(label)]
# Swap the numeric target codes for readable labels in both splits.
train_set["target"] = train_set["target"].apply(decode_sentiment)
test_set["target"] = test_set["target"].apply(decode_sentiment)

In [0]:
# Preview of the training split after sentiment decoding (text not yet cleaned).
train_set.head(n=5)

In [0]:
# Same preview for the test split.
test_set.head(n=5)

In [0]:
%%time
# Stored as a set: it is only used for membership tests, and a set lookup
# avoids scanning the whole stop-word list for every token of every tweet.
stop_words = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.SnowballStemmer("english")
def preprocess(text, stem=False):
    """Normalise one tweet into a space-separated string of kept tokens.

    The text is converted to `str`, lower-cased, and every @mention, http(s)
    link and run of non-alphanumeric characters is replaced by a space. Tokens
    found in the module-level `stop_words` are dropped.

    Args:
        text: The raw tweet; any object is accepted and passed through `str()`.
        stem: When true, each kept token is stemmed with the module-level
            `stemmer`.

    Returns:
        The kept tokens joined by single spaces (possibly an empty string).
    """
    # Raw string literal: the regex is unchanged, but the escapes no longer
    # trigger Python's invalid-escape-sequence warning.
    # NOTE(review): the third alternative (`http?:` + one character) looks like
    # a typo of the second; kept as-is to preserve behaviour - confirm intent.
    cleaned = re.sub(r'@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+', ' ', str(text).lower()).strip()
    kept = [token for token in cleaned.split() if token not in stop_words]
    if stem:
        kept = [stemmer.stem(token) for token in kept]
    return " ".join(kept)

# Clean the tweet text of both splits (stemming left at its default, off).
train_set["text"] = train_set["text"].apply(preprocess)
test_set["text"] = test_set["text"].apply(preprocess)

In [0]:
# Preview of the training split after both sentiment decoding and text cleaning.
train_set.head(n=5)

In [0]:
# Same post-processing preview for the test split.
test_set.head(n=5)

Word2Vec - Generate word embeddings for the embedding layer of the separable CNN

In [0]:
%%time
# One whitespace-split token list per cleaned training tweet.
documents = list(map(str.split, train_set.text))

In [0]:
# Number of tokenised training documents.
print(len(documents))

In [0]:
# Untrained Word2Vec model; the vocabulary and weights are filled in by the
# build_vocab / train cells that follow.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` - confirm which gensim version this notebook targets.
w2v_model = gensim.models.word2vec.Word2Vec(
    size=300,
    window=5,
    min_count=10,
    workers=8,
)

In [0]:
# Build the Word2Vec vocabulary from the tokenised training documents.
w2v_model.build_vocab(documents)

In [0]:
# Size of the Word2Vec vocabulary.
# NOTE(review): `vocab_size` is reassigned by the Keras tokenizer cell further
# down, so this value is only meaningful for the print below.
words = w2v_model.wv.vocab.keys()
vocab_size = len(w2v_model.wv.vocab)
print("Vocab size", vocab_size)

In [0]:
%%time
# Train the embeddings for 16 epochs over the tokenised training documents.
w2v_model.train(
    documents,
    epochs=16,
    total_examples=len(documents),
)

In [0]:
# Verify word2vec: inspect the nearest neighbours of a common word.
# Queried through the KeyedVectors (`.wv`); calling most_similar on the model
# object itself is deprecated in gensim 3.x and removed in gensim 4.
w2v_model.wv.most_similar("hello")

In [0]:
# Queried through `.wv`; the model-level most_similar is deprecated in
# gensim 3.x and removed in gensim 4.
w2v_model.wv.most_similar("world")

In [0]:
# Queried through `.wv`; the model-level most_similar is deprecated in
# gensim 3.x and removed in gensim 4.
w2v_model.wv.most_similar("love")

In [0]:
%%time
# Fit a Keras tokenizer on the cleaned training text only.
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train_set.text)

# One more than the number of distinct words
# (presumably leaving index 0 free for padding - TODO confirm).
vocab_size = 1 + len(tokenizer.word_index)
print("Total words", vocab_size)

In [0]:
# Show the integer sequence for the first training tweet. Only that one text
# is tokenised, rather than the whole training set just to read element 0.
tokenizer.texts_to_sequences([train_set.text.iloc[0]])[0]

In [0]:
# Integer sequence of the first training tweet; tokenises only that one text
# instead of converting the entire training set to display a single element.
tokenizer.texts_to_sequences([train_set.text.iloc[0]])[0]

In [0]:
%%time
def _to_padded_sequences(texts):
    """Convert texts to integer sequences and pad/truncate each one to length 300."""
    return keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=300)

# Standardise the text input for model training and for test evaluation.
x_train = _to_padded_sequences(train_set.text)
x_test = _to_padded_sequences(test_set.text)
print('x_train', x_train.shape)
print('x_test', x_test.shape)

In [0]:
# Labels seen in the training targets, plus NEUTRAL (not used further in this cell).
labels = train_set.target.unique().tolist()
labels.append('NEUTRAL')

# Encode the string labels as integers; the encoder is fitted on the training targets only.
encoder = sk.preprocessing.LabelEncoder()
encoder.fit(train_set.target.tolist())

# Transform both splits and reshape to column vectors of shape (n_samples, 1).
y_train = encoder.transform(train_set.target.tolist()).reshape(-1, 1)
y_test = encoder.transform(test_set.target.tolist()).reshape(-1, 1)

print("y_train", y_train.shape)
print("y_test", y_test.shape)

In [0]:
# Shapes of the model inputs and targets, training split first, then test.
print("x_train", x_train.shape)
print("y_train", y_train.shape)
print()
print("x_test", x_test.shape)
print("y_test", y_test.shape)

In [0]:
# Build the weight matrix for the embedding layer: row i holds the Word2Vec
# vector of the tokenizer word with index i. Words without a Word2Vec vector
# keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 300))
for token, row in tokenizer.word_index.items():
    if token not in w2v_model.wv:
        continue
    embedding_matrix[row] = w2v_model.wv[token]
print(embedding_matrix.shape)

In [0]:
# Frozen embedding layer initialised from the Word2Vec matrix
# (trainable=False keeps the pretrained vectors fixed during training).
embedding_layer = keras.layers.Embedding(
    vocab_size,
    300,
    weights=[embedding_matrix],
    input_length=300,
    trainable=False,
)

In [0]:

# Hyper-parameters of the separable-CNN classifier.
kernel_size = 5
dropout_rate = 0.2
filters = 3
pool_size = 3


def _separable_conv(n_filters):
    """Build one SeparableConv1D layer with the settings shared by all four conv layers."""
    return keras.layers.SeparableConv1D(filters=n_filters,
                                        kernel_size=kernel_size,
                                        activation='relu',
                                        bias_initializer='random_uniform',
                                        depthwise_initializer='random_uniform',
                                        padding='same')


# Create the Separable CNN model.
# Every layer comes from the same `keras` package as the Sequential container
# and the embedding layer; the previous cell mixed in layers from the private
# `tensorflow.python.keras` module, which a standalone-Keras Sequential rejects.
model = keras.models.Sequential()
model.add(embedding_layer)
model.add(keras.layers.Dropout(rate=dropout_rate))

# First conv block: two separable convolutions followed by max pooling.
model.add(_separable_conv(filters))
model.add(_separable_conv(filters))
model.add(keras.layers.MaxPooling1D(pool_size=pool_size))

# Second conv block: twice the filters, followed by global average pooling.
model.add(_separable_conv(filters * 2))
model.add(_separable_conv(filters * 2))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dropout(rate=dropout_rate))

# A single sigmoid unit: the targets are one 0/1 column, the loss is binary
# cross-entropy, and predict() converts the output to one float score.
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [0]:
# Render the model architecture to a PNG and download it from the Colab runtime.
from google.colab import files
keras.utils.plot_model(
    model,
    to_file='model_plot.png',
    show_layer_names=True,
    show_shapes=True,
)
files.download('model_plot.png')

In [0]:
# Print the layer summary, then configure training: Adam optimiser,
# binary cross-entropy loss, accuracy as the reported metric.
model.summary()
model.compile(
    optimizer="adam",
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [0]:
# Training callbacks: learning-rate reduction keyed on validation loss and
# early stopping keyed on validation accuracy.
# NOTE(review): early stopping uses patience=1 while the LR schedule uses
# patience=5, so training will usually stop before the learning rate is ever
# reduced - confirm this is intended.
callbacks = [
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
    keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=1),
]

In [0]:
%%time
# Train for up to 32 epochs, holding out 10% of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=32,
    batch_size=1024,
    validation_split=0.1,
    callbacks=callbacks,
    verbose=1,
)

In [0]:
%%time
# Evaluate on the held-out test split; score[0] is printed as the loss and
# score[1] as the accuracy.
score = model.evaluate(x=x_test, y=y_test, batch_size=512)
print()
print("ACCURACY:", score[1])
print("LOSS:", score[0])

In [0]:
# Per-epoch training curves recorded by model.fit.
# Keras versions differ in how they name the accuracy metric ('accuracy' vs
# the older 'acc'), so use whichever key is present instead of raising KeyError.
_accuracy_key = 'accuracy' if 'accuracy' in history.history else 'acc'
acc = history.history[_accuracy_key]
val_acc = history.history['val_' + _accuracy_key]
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

# Accuracy curves, on an explicit figure/axes pair.
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(epochs, acc, 'b', label='Training accuracy')
ax_acc.plot(epochs, val_acc, 'r', label='Validation accuracy')
ax_acc.set_title('Training & Validation Accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

# Loss curves, on a separate figure.
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, 'b', label='Training Loss')
ax_loss.plot(epochs, val_loss, 'r', label='Validation Loss')
ax_loss.set_title('Training & Validation loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

plt.show()

In [0]:
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
def decode_sentiment(score, include_neutral=False):
    """Map a model score to a sentiment label.

    NOTE(review): this redefines the `decode_sentiment(label)` helper from the
    target-decoding cell earlier in the notebook; after this cell runs, the
    name refers to this score-based version.

    Args:
        score: The model's output score, compared against fixed thresholds.
        include_neutral: When false, the result is binary with a 0.5 cut-off.
            When true, scores <= 0.4 are NEGATIVE, scores >= 0.7 are POSITIVE
            and everything else is NEUTRAL.

    Returns:
        One of the NEGATIVE, NEUTRAL or POSITIVE label strings.
    """
    # Binary mode: a single cut-off at 0.5.
    if not include_neutral:
        return NEGATIVE if score < 0.5 else POSITIVE

    # Three-way mode: anything between the two thresholds is neutral.
    if score <= 0.4:
        return NEGATIVE
    if score >= 0.7:
        return POSITIVE
    return NEUTRAL

In [0]:
def predict(text, include_neutral=False):
    """Classify the sentiment of a single piece of text with the trained model.

    Args:
        text: The raw text to classify.
        include_neutral: Forwarded to `decode_sentiment`; when true a NEUTRAL
            label is possible.

    Returns:
        A dict with the keys "label" (the decoded sentiment), "score" (the
        model output as a float) and "elapsed_time" (seconds spent in this call).
    """
    start_at = time.time()
    # Clean the text the same way the training and test data were cleaned, so
    # the model sees input from the distribution it was trained on.
    cleaned = preprocess(text)
    # Tokenize and pad to the fixed input length used for training.
    padded = keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences([cleaned]), maxlen=300)
    # Predict: take the first output of the first (only) sample as a plain
    # float, rather than calling float() on an array.
    score = float(model.predict(padded)[0][0])
    # Decode sentiment
    label = decode_sentiment(score, include_neutral=include_neutral)

    return {"label": label, "score": score,
            "elapsed_time": time.time() - start_at}

In [0]:
# Example: a sentence mixing negative and positive wording.
predict(text="I hate how much I love the music")

In [0]:
# Example: a negated negative phrase.
predict(text='This ipad is not terrible')

In [0]:
# Example: a longer, ambivalent review.
predict(text='I dont know how to feel about this movie. Its good at some points and bad at others. I think I dont like it')