In [20]:
# Importing the necessary dependencies
import pandas as pd
import numpy as np
from copy import deepcopy
from string import punctuation
from random import shuffle

import gensim
from gensim.models.word2vec import Word2Vec
LabeledSentence = gensim.models.doc2vec.LabeledSentence

from keras.utils import np_utils
import emoji
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import pickle
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import Dense, Activation, LSTM, Embedding
from sklearn.model_selection import train_test_split

In [2]:
# get the data
# Convert the labels from str to int.
def getData():
    data = pd.read_csv("dataset/mydata.csv",error_bad_lines=False, encoding = "ISO-8859-1",header=None)
    data.columns = ["SentimentText","Sentiment"]
    data['Sentiment'] = data['Sentiment'].map(int)
    return data

data = getData()
pos = data[data.Sentiment == 1]
neg = data[data.Sentiment == 0]
print("Pos examples:",pos.shape,"Neg examples:", neg.shape)
data.head()

Pos examples: (27605, 2) Neg examples: (8395, 2)


Unnamed: 0,SentimentText,Sentiment
0,RT @Mayank_M_Joshi: Kovind will make an except...,1
1,@narendramodi @ashrafghani Hii pm modi good mo...,1
2,"And this "" love for Dalit "" emerged to sidelin...",1
3,RT @aijazzakasyed: @AijazZakaSyed writes in @S...,1
4,@soniandtv @vivekagnihotri Mamatajiððð...,1


In [50]:
# Tokenize the tweets
# Remove the tokens which start with '@' or 'http' or '#'
# Replace slangs in the tweets. Ex: k -> ok, u -> you
# Replace emojis with corresponging text. Ex: 😂 -> face with tears of joy
slangs = pickle.load(open("dataset/slang.pkl","rb"))
slangs['&'] = 'and'
def tokenize(tweet):
    try:
        tweet = str(tweet).lower()
        tokens = tokenizer.tokenize(tweet)
        tokens = [ token for token in tokens if not ( token.startswith("@") or token.startswith("http") or token.startswith("#")) ]
        final_tokens = []
        for i in range(len(tokens)):
            try:
                tokens[i] = slangs[tokens[i]]
            except:
                continue
        for i in range(len(tokens)):
            try:
                words = emoji.UNICODE_EMOJI[tokens[i]][1:-1].split("_")
                final_tokens += words
            except:
                final_tokens.append(tokens[i])
        return final_tokens
    except:
        return None

In [51]:
def postprocess(data):
    data['tokens'] = data['SentimentText'].progress_map(tokenize)
    return data

data = postprocess(data)

progress-bar: 100%|██████████| 36000/36000 [00:02<00:00, 14340.64it/s]


In [52]:
# Break the data into test train split
n = data.shape[0]
x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(n).tokens),
                                                    np.array(data.head(n).Sentiment), test_size=0.2)


In [53]:
def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')

28800it [00:00, 469126.90it/s]
7200it [00:00, 477280.81it/s]


In [47]:
# Load the google's trained word2vec model
tweet_w2v = gensim.models.KeyedVectors.load_word2vec_format("dataset/google.bin.gz", binary=True)

In [54]:
# Converting a scentence to vector
# Basically averaging the vectors of different words
def wordvector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [55]:
from sklearn.preprocessing import scale
n_dim = 300
train_vecs_w2v = np.concatenate([wordvector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([wordvector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
test_vecs_w2v = scale(test_vecs_w2v)

28800it [00:01, 16449.52it/s]
7200it [00:00, 16382.06it/s]


In [56]:
# Converting labels to one hot vector
# 0 -> [0,1]
# 1 - [1,0]
train_y = np_utils.to_categorical(y_train)
test_y = np_utils.to_categorical(y_test)

In [58]:
# building a model with 2 hidden layers
model = Sequential()
model.add(Dense(512, activation='relu', input_dim=300, kernel_initializer='normal'))
model.add(Dense(512, activation='relu', kernel_initializer='normal'))
model.add(Dense(2, activation='softmax', kernel_initializer='normal'))
model.compile(optimizer=Adam(lr=0.0001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(train_vecs_w2v, train_y, validation_data=(test_vecs_w2v, test_y), epochs=30, batch_size=100, verbose=1)

Train on 28800 samples, validate on 7200 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f44c6d6b7b8>

In [62]:
scentence = tokenize("Modi govt is doing a great job 😂😂😂😂😂😂")
print(scentence)
vec = wordvector(scentence, 300)
print(model.predict(vec))

['modi', 'govt', 'is', 'doing', 'a', 'great', 'job', 'face', 'with', 'tears', 'of', 'joy', 'face', 'with', 'tears', 'of', 'joy', 'face', 'with', 'tears', 'of', 'joy']
[[ 0.1157992   0.88420075]]


In [54]:
model.save("model1.h5")