In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
from string import punctuation
from random import shuffle

import gensim
from gensim.models.word2vec import Word2Vec
LabeledSentence = gensim.models.doc2vec.LabeledSentence

from keras.utils import np_utils

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import Dense, Activation, LSTM, Embedding
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
def ingest():
    data = pd.read_csv("mydata.csv",error_bad_lines=False, encoding = "ISO-8859-1",header=None)
    data.columns = ["SentimentText","Sentiment"]
    data['Sentiment'] = data['Sentiment'].map(int)
    return data

data = ingest()
pos = data[data.Sentiment == 1]
neg = data[data.Sentiment == 0]
print("Pos examples:",pos.shape,"Neg examples:", neg.shape)

Pos examples: (27374, 2) Neg examples: (8344, 2)


In [4]:
def tokenize(tweet):
    try:
        tweet = str(tweet).lower()
        tokens = tokenizer.tokenize(tweet)
        tokens = [ token for token in tokens if not ( token.startswith("@") or token.startswith("http") or token.startswith("#")) ]
        return tokens
    except:
        return None

In [5]:
def postprocess(data):
    data['tokens'] = data['SentimentText'].progress_map(tokenize)
    return data

data = postprocess(data)

progress-bar: 100%|██████████| 35718/35718 [00:01<00:00, 18091.23it/s]


In [6]:
n = data.shape[0]

x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(n).tokens),
                                                    np.array(data.head(n).Sentiment), test_size=0.2)


In [11]:
def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')



0it [00:00, ?it/s][A[A

28574it [00:00, 443543.23it/s][A[A

0it [00:00, ?it/s][A[A

7144it [00:00, 469414.06it/s][A[A

In [7]:
tweet_w2v = gensim.models.KeyedVectors.load_word2vec_format("dataset/google.bin.gz", binary=True)

In [8]:
def wordvector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [12]:
from sklearn.preprocessing import scale
n_dim = 300
train_vecs_w2v = np.concatenate([wordvector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([wordvector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
test_vecs_w2v = scale(test_vecs_w2v)



0it [00:00, ?it/s][A[A

1595it [00:00, 15943.06it/s][A[A

3240it [00:00, 16090.46it/s][A[A

4896it [00:00, 16228.04it/s][A[A

6570it [00:00, 16360.04it/s][A[A

8249it [00:00, 16486.61it/s][A[A

9912it [00:00, 16527.99it/s][A[A

11531it [00:00, 16424.71it/s][A[A

13179it [00:00, 16438.56it/s][A[A

14827it [00:00, 16449.59it/s][A[A

16469it [00:01, 16439.76it/s][A[A

18090it [00:01, 16369.54it/s][A[A

19737it [00:01, 16397.77it/s][A[A

21364it [00:01, 16357.36it/s][A[A

22987it [00:01, 16317.37it/s][A[A

24627it [00:01, 16341.82it/s][A[A

26271it [00:01, 16370.91it/s][A[A

27969it [00:01, 16548.76it/s][A[A

28574it [00:01, 16446.98it/s][A[A

0it [00:00, ?it/s][A[A

1647it [00:00, 16467.54it/s][A[A

3223it [00:00, 16245.43it/s][A[A

4852it [00:00, 16256.33it/s][A[A

6459it [00:00, 16197.91it/s][A[A

7144it [00:00, 16064.67it/s][A[A

In [13]:
train_y = np_utils.to_categorical(y_train)
test_y = np_utils.to_categorical(y_test)

In [20]:
model = Sequential()
model.add(Dense(512, activation='relu', input_dim=300, kernel_initializer='normal'))
model.add(Dense(512, activation='relu', kernel_initializer='normal'))
model.add(Dense(2, activation='softmax', kernel_initializer='normal'))
model.compile(optimizer=Adam(lr=0.0003),
              loss='mean_squared_error',
              metrics=['accuracy'])
model.fit(train_vecs_w2v, train_y, validation_data=(test_vecs_w2v, test_y), epochs=30, batch_size=100, verbose=1)

Train on 28574 samples, validate on 7144 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fde434f2eb8>

In [26]:
m = 108
print(data.SentimentText[m], data.Sentiment[m])
scentence = data.tokens[m]
scentence = tokenize("Kovind will make an exceptional president: PM Narendra Modi")
vec = wordvector(scentence, 300)
print(model.predict(vec))

To make the #InternationalYogaDay programme in #Lucknow a success, various NGOs and #Yoga gurus have camped there. https://t.co/TNuAucVF2Q 1
[[ 0.35325179  0.64674824]]


In [27]:
model.save("model1.h5")