In [43]:
# Importing the necessary dependencies
import pandas as pd
import numpy as np
from copy import deepcopy
from string import punctuation
from random import shuffle

import gensim
from gensim.models.word2vec import Word2Vec
LabeledSentence = gensim.models.doc2vec.LabeledSentence

from keras.utils import np_utils

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import Dense, Activation, LSTM, Embedding
from sklearn.model_selection import train_test_split

In [44]:
# get the data
# Convert the labels from str to int.
def getData():
    data = pd.read_csv("mydata.csv",error_bad_lines=False, encoding = "ISO-8859-1",header=None)
    data.columns = ["SentimentText","Sentiment"]
    data['Sentiment'] = data['Sentiment'].map(int)
    return data

data = getData()
pos = data[data.Sentiment == 1]
neg = data[data.Sentiment == 0]
print("Pos examples:",pos.shape,"Neg examples:", neg.shape)
data.head()

Pos examples: (27374, 2) Neg examples: (8344, 2)


Unnamed: 0,SentimentText,Sentiment
0,RT @Mayank_M_Joshi: Kovind will make an except...,1
1,@narendramodi @ashrafghani Hii pm modi good mo...,1
2,"And this "" love for Dalit "" emerged to sidelin...",1
3,RT @aijazzakasyed: @AijazZakaSyed writes in @S...,1
4,@soniandtv @vivekagnihotri Mamatajiððð...,1


In [45]:
# Tokenize the tweets
# Remove the tokens which start with '@' or 'http' or '#
def tokenize(tweet):
    try:
        tweet = str(tweet).lower()
        tokens = tokenizer.tokenize(tweet)
        tokens = [ token for token in tokens if not ( token.startswith("@") or token.startswith("http") or token.startswith("#")) ]
        return tokens
    except:
        return None

In [46]:
def postprocess(data):
    data['tokens'] = data['SentimentText'].progress_map(tokenize)
    return data

data = postprocess(data)



progress-bar:   0%|          | 0/35718 [00:00<?, ?it/s][A[A

progress-bar:   5%|▍         | 1725/35718 [00:00<00:01, 17241.18it/s][A[A

progress-bar:  10%|▉         | 3441/35718 [00:00<00:01, 17215.08it/s][A[A

progress-bar:  15%|█▍        | 5225/35718 [00:00<00:01, 17397.12it/s][A[A

progress-bar:  19%|█▉        | 6925/35718 [00:00<00:01, 17274.31it/s][A[A

progress-bar:  24%|██▍       | 8703/35718 [00:00<00:01, 17422.96it/s][A[A

progress-bar:  30%|██▉       | 10664/35718 [00:00<00:01, 18024.99it/s][A[A

progress-bar:  35%|███▍      | 12422/35718 [00:00<00:01, 17887.80it/s][A[A

progress-bar:  40%|████      | 14363/35718 [00:00<00:01, 18316.89it/s][A[A

progress-bar:  45%|████▌     | 16101/35718 [00:00<00:01, 17966.13it/s][A[A

progress-bar:  50%|████▉     | 17834/35718 [00:01<00:01, 17671.45it/s][A[A

progress-bar:  55%|█████▍    | 19558/35718 [00:01<00:00, 17245.76it/s][A[A

progress-bar:  60%|██████    | 21451/35718 [00:01<00:00, 17718.63it/s][A[A

prog

In [47]:
# Break the data into test train split
n = data.shape[0]
x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(n).tokens),
                                                    np.array(data.head(n).Sentiment), test_size=0.2)


In [48]:
def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')



0it [00:00, ?it/s][A[A

28574it [00:00, 437981.86it/s][A[A

0it [00:00, ?it/s][A[A

7144it [00:00, 466919.75it/s][A[A

In [7]:
# Load the google's trained word3vec model
tweet_w2v = gensim.models.KeyedVectors.load_word2vec_format("dataset/google.bin.gz", binary=True)

In [49]:
# Converting a scentence to vector
# Basically averaging the vectors of different words
def wordvector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [50]:
from sklearn.preprocessing import scale
n_dim = 300
train_vecs_w2v = np.concatenate([wordvector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([wordvector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
test_vecs_w2v = scale(test_vecs_w2v)



0it [00:00, ?it/s][A[A

1792it [00:00, 17916.90it/s][A[A

3550it [00:00, 17813.74it/s][A[A

5335it [00:00, 17822.43it/s][A[A

7077it [00:00, 17697.55it/s][A[A

8892it [00:00, 17830.43it/s][A[A

10644it [00:00, 17734.49it/s][A[A

12379it [00:00, 17613.72it/s][A[A

14144it [00:00, 17623.70it/s][A[A

15913it [00:00, 17643.05it/s][A[A

17651it [00:01, 17562.92it/s][A[A

19391it [00:01, 17510.32it/s][A[A

21149it [00:01, 17522.62it/s][A[A

22911it [00:01, 17549.99it/s][A[A

24683it [00:01, 17599.37it/s][A[A

26432it [00:01, 17564.31it/s][A[A

28181it [00:01, 17513.52it/s][A[A

28574it [00:01, 17601.47it/s][A[A

0it [00:00, ?it/s][A[A

1749it [00:00, 17487.85it/s][A[A

3499it [00:00, 17490.65it/s][A[A

5253it [00:00, 17502.77it/s][A[A

6943it [00:00, 17316.97it/s][A[A

7144it [00:00, 17334.22it/s][A[A

In [51]:
# Converting labels to one hot vector
# 0 -> [0,1]
# 1 - [1,0]
train_y = np_utils.to_categorical(y_train)
test_y = np_utils.to_categorical(y_test)

In [52]:
# building a model with 2 hidden layers
model = Sequential()
model.add(Dense(512, activation='relu', input_dim=300, kernel_initializer='normal'))
model.add(Dense(512, activation='relu', kernel_initializer='normal'))
model.add(Dense(2, activation='softmax', kernel_initializer='normal'))
model.compile(optimizer=Adam(lr=0.0001),
              loss='mean_squared_error',
              metrics=['accuracy'])
model.fit(train_vecs_w2v, train_y, validation_data=(test_vecs_w2v, test_y), epochs=30, batch_size=100, verbose=1)

Train on 28574 samples, validate on 7144 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fde0a69d358>

In [53]:
m = 108
print(data.SentimentText[m], data.Sentiment[m])
scentence = data.tokens[m]
scentence = tokenize("Kovind will make an exceptional president: PM Narendra Modi")
vec = wordvector(scentence, 300)
print(model.predict(vec))

To make the #InternationalYogaDay programme in #Lucknow a success, various NGOs and #Yoga gurus have camped there. https://t.co/TNuAucVF2Q 1
[[ 0.30909264  0.69090742]]


In [54]:
model.save("model1.h5")