In [1]:
import pandas as pd # provide sql-like data manipulation tools. very handy.
pd.options.mode.chained_assignment = None
import numpy as np # high dimensional vector computing library.
from copy import deepcopy
from string import punctuation
from random import shuffle

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
LabeledSentence = gensim.models.doc2vec.LabeledSentence # we'll talk about this down below

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk import word_tokenize
from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
def ingest():
    data = pd.read_csv('./trainingandtestdata/tweets.csv')
    data.columns=["Sentiment","ItemID","Date","Blank","SentimentSource","SentimentText"]
    data.drop(['ItemID', 'SentimentSource'], axis=1, inplace=True)
    data = data[data.Sentiment.isnull() == False]
    data['Sentiment'] = data['Sentiment'].map( {4:1, 0:0})
    data = data[data['SentimentText'].isnull() == False]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    print 'dataset loaded with shape', data.shape    
    return data

data = ingest()
data.head(5)

dataset loaded with shape (1599999, 4)


Unnamed: 0,Sentiment,Date,Blank,SentimentText
0,0,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,is upset that he can't update his Facebook by ...
1,0,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,@Kenichan I dived many times for the ball. Man...
2,0,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,my whole body feels itchy and like its on fire
3,0,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,"@nationwideclass no, it's not behaving at all...."
4,0,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,@Kwesidei not the whole crew


In [7]:
def tokenize(tweet):
    try:
        tweet = unicode(tweet.decode('utf-8').lower())
        tokens = tokenizer.tokenize(tweet)
        tokens = filter(lambda t: not t.startswith('@'), tokens)
        tokens = filter(lambda t: not t.startswith('#'), tokens)
        tokens = filter(lambda t: not t.startswith('http'), tokens)
        return tokens
    except:
        return 'NC'

In [65]:
def postprocess(data, n=1000000):
    data = data.head(n)
    data['tokens'] = data['SentimentText'].progress_map(tokenize)  ## progress_map is a variant of the map function plus a progress bar. Handy to monitor DataFrame creations.
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    data.drop('index', inplace=True, axis=1)
    return data

data = postprocess(data)
data.head(5)

progress-bar: 100%|██████████| 999597/999597 [02:09<00:00, 7734.27it/s]


Unnamed: 0,Sentiment,Date,Blank,SentimentText,tokens
0,0,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,is upset that he can't update his Facebook by ...,"[is, upset, that, he, can't, update, his, face..."
1,0,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,@Kenichan I dived many times for the ball. Man...,"[i, dived, many, times, for, the, ball, ., man..."
2,0,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its..."
3,0,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,"@nationwideclass no, it's not behaving at all....","[no, ,, it's, not, behaving, at, all, ., i'm, ..."
4,0,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,@Kwesidei not the whole crew,"[not, the, whole, crew]"


In [9]:
x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(1000000).tokens),
                                                    np.array(data.head(1000000).Sentiment), test_size=0.2)

In [10]:
def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')

  """
799677it [00:08, 89217.09it/s]
199920it [00:01, 121964.47it/s]


In [11]:
x_train[12]

TaggedDocument(words=[u'my', u'legs', u'ache'], tags=['TRAIN_12'])

In [12]:
n=1000000
n_dim = 200
tweet_w2v = Word2Vec(size=n_dim, min_count=10)
tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])

100%|██████████| 799677/799677 [00:01<00:00, 636153.19it/s]


In [13]:
tweet_w2v.train([x.words for x in tqdm(x_train)],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter) 

100%|██████████| 799677/799677 [00:01<00:00, 773219.83it/s]
  """Entry point for launching an IPython kernel.


(42963599, 59733865)

In [14]:
tweet_w2v['good']


  """Entry point for launching an IPython kernel.


array([ 0.28444976,  0.69942904,  0.14818138,  0.31924927, -0.4680742 ,
        0.9024628 ,  1.4263029 , -0.47907603, -1.332639  , -0.6724394 ,
        1.3112695 ,  2.681598  , -0.03382508, -0.35525182, -0.8161919 ,
       -0.04111078, -0.11380698,  0.64018035,  1.8797659 ,  2.4005365 ,
        0.398094  , -0.1675036 ,  2.4800208 ,  1.8750612 , -0.8103979 ,
        1.817616  ,  1.5765214 , -0.07570176, -0.95706964, -1.3454179 ,
        1.2725383 ,  0.85002136, -0.69953024, -1.2051468 , -1.0216347 ,
       -0.67992294,  0.21946803,  0.09953847, -1.5138327 , -0.5141085 ,
       -0.3735861 ,  0.7434036 ,  0.04046262,  1.2473485 ,  0.6869495 ,
        2.1402824 ,  1.8341174 ,  0.97273093, -2.7310767 , -0.49897045,
       -1.7020036 ,  0.38104835,  0.476334  ,  0.6793673 , -0.16692251,
        2.1517544 ,  0.23652887, -0.472027  , -0.95358783, -0.7426821 ,
        0.82521725, -2.129602  , -1.8416278 , -0.36993757,  0.08857165,
        1.6699846 ,  1.1030434 ,  1.9440069 , -0.39135978,  0.18

In [15]:
tweet_w2v.most_similar('good')


  """Entry point for launching an IPython kernel.


[(u'goood', 0.7055902481079102),
 (u'great', 0.6897282004356384),
 (u'pleasant', 0.6649350523948669),
 (u'rough', 0.6346817016601562),
 (u'nice', 0.606987714767456),
 (u'fabulous', 0.6029807329177856),
 (u'gd', 0.6005925536155701),
 (u'fantastic', 0.5976760983467102),
 (u'gooooood', 0.5951219797134399),
 (u'goooood', 0.5848761796951294)]

In [16]:
tweet_w2v.most_similar('bar')


  """Entry point for launching an IPython kernel.


[(u'cafe', 0.7416696548461914),
 (u'grill', 0.7310184240341187),
 (u'pub', 0.7039318680763245),
 (u'restaurant', 0.6984879970550537),
 (u'table', 0.6982424259185791),
 (u'gate', 0.6821745038032532),
 (u'club', 0.6722133159637451),
 (u'ranch', 0.6401692628860474),
 (u'lounge', 0.6189289093017578),
 (u'casino', 0.6142687797546387)]

In [17]:
tweet_w2v.most_similar('happy')


  """Entry point for launching an IPython kernel.


[(u'pleased', 0.5856743454933167),
 (u'blessed', 0.5512657761573792),
 (u'thrilled', 0.5119697451591492),
 (u'celebrating', 0.4954250454902649),
 (u'grateful', 0.49438053369522095),
 (u'excited', 0.49332529306411743),
 (u'stoked', 0.47863951325416565),
 (u'impressed', 0.47155770659446716),
 (u'thankful', 0.46986669301986694),
 (u'proud', 0.4659450948238373)]

In [18]:
tweet_w2v.most_similar('iphone')


  """Entry point for launching an IPython kernel.


[(u'itouch', 0.7675461769104004),
 (u'blackberry', 0.729372501373291),
 (u'mac', 0.704586386680603),
 (u'mms', 0.7000615000724792),
 (u'pc', 0.6999794244766235),
 (u'upgrade', 0.6975575685501099),
 (u'macbook', 0.6925462484359741),
 (u'3.0', 0.6885600090026855),
 (u'ipod', 0.6884199380874634),
 (u'jailbreak', 0.6821657419204712)]

In [19]:
# importing bokeh library for interactive dataviz
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

# defining the chart
output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

# getting a list of word vectors. limit to 10000. each is of 200 dimensions
word_vectors = [tweet_w2v[w] for w in tweet_w2v.wv.vocab.keys()[:5000]]

# dimensionality reduction. converting the vectors to 2d vectors
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

# putting everything in a dataframe
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = tweet_w2v.wv.vocab.keys()[:5000]

# plotting. the corresponding word appears when you hover on the data point.
plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)

  del sys.path[0]


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5000 samples in 0.054s...
[t-SNE] Computed neighbors for 5000 samples in 12.934s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 0.270461
[t-SNE] KL divergence after 250 iterations with early exaggeration: 90.955437
[t-SNE] Error after 1000 iterations: 2.831332


In [20]:
print 'building tf-idf matrix ...'
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in x_train])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print 'vocab size :', len(tfidf)


building tf-idf matrix ...
vocab size : 23032


In [21]:
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [23]:
from sklearn.preprocessing import scale
train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
test_vecs_w2v = scale(test_vecs_w2v)

  
100%|██████████| 799677/799677 [04:54<00:00, 2716.47it/s]
100%|██████████| 199920/199920 [01:14<00:00, 2680.92it/s]


In [25]:
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils
from keras.callbacks import Callback

model = Sequential()
model.add(Dense(32, activation='relu', input_dim=200))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_vecs_w2v, y_train, epochs=9, batch_size=32, verbose=2)

Using TensorFlow backend.


Epoch 1/9
 - 43s - loss: 0.3512 - acc: 0.8531
Epoch 2/9
 - 43s - loss: 0.3378 - acc: 0.8591
Epoch 3/9
 - 43s - loss: 0.3344 - acc: 0.8605
Epoch 4/9
 - 43s - loss: 0.3325 - acc: 0.8615
Epoch 5/9
 - 44s - loss: 0.3312 - acc: 0.8616
Epoch 6/9
 - 43s - loss: 0.3300 - acc: 0.8625
Epoch 7/9
 - 43s - loss: 0.3292 - acc: 0.8628
Epoch 8/9
 - 43s - loss: 0.3285 - acc: 0.8632
Epoch 9/9
 - 46s - loss: 0.3278 - acc: 0.8635


<keras.callbacks.History at 0x1492be5d0>

In [38]:
score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
print score[1]

0.8589885954477158


# Testing on our Query

In [66]:
def wtokenize(tweet):
    try:
        tweet = unicode(tweet.decode('utf-8').lower())
        tokens=word_tokenize(tweet)
# #         tokens = tokenizer.tokenize(tweet)
#         print(tokens)
#         tokens = list(filter(lambda t: not t.startswith('@'), tokens))
#         tokens = list(filter(lambda t: not t.startswith('#'), tokens))
#         tokens = list(filter(lambda t: not t.startswith('http'), tokens))
        return tokens
    except:
        print "reg"

In [67]:
def buildWordVector1(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size))
            count += 1
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [99]:
query=['35 crore people shun open defecation in a matter of 3.5 years! Follow this page to follow real-time updates on #SwachhBharat']
query=word_tokenize(query[0])


In [100]:
query_vecs_w2v = buildWordVector1(query, n_dim)
query_vecs_w2v

  


array([[-3.03265501e-01, -1.17426710e-01,  9.57767115e-02,
        -1.99453651e-01, -3.56789869e-01,  1.02993995e-01,
         4.50830157e-01,  8.55580411e-04, -6.55356174e-01,
        -5.02624939e-02,  3.47901658e-01, -2.59610134e-03,
         2.02734186e-01, -1.13517616e-01, -3.67891725e-01,
        -4.81679191e-01,  4.44278722e-01,  4.55893563e-02,
         7.37217411e-01, -8.53219789e-02, -1.23548228e-01,
        -1.14641660e-01,  1.24547989e-01,  3.21938319e-01,
        -3.21812696e-01,  1.97158147e-01, -5.02050223e-01,
        -5.80615336e-01,  5.96601237e-01, -1.80007264e-01,
        -4.57968782e-04, -7.95456339e-01,  4.02354458e-02,
         1.24362146e-02,  2.29759424e-01,  1.55673368e-01,
         3.41849017e-01, -2.34683940e-01, -5.61971924e-01,
         2.11887871e-01, -1.59208750e-01, -4.07688349e-01,
         1.91239728e-02, -1.66312356e-01, -1.18925300e-01,
        -2.80435296e-01,  1.92287106e-01, -1.10682053e-01,
        -4.70230873e-01, -1.73964416e-01, -1.32152184e-0

In [101]:
model.predict_classes(query_vecs_w2v)

array([[0]], dtype=int32)