In [1]:
import pandas as pd
import numpy as np
import gensim
from gensim.models.word2vec import Word2Vec
from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [33]:
# gensim deprecated LabeledSentence in favour of TaggedDocument, which exposes
# the same (words, tags) fields. The alias keeps the name labelizeTweets uses
# while avoiding the deprecation warning.
LabeledSentence = gensim.models.doc2vec.TaggedDocument
# Registers Series.progress_map / progress_apply with a labelled progress bar.
tqdm.pandas(desc="progress-bar")
# Shared tweet-aware tokenizer used by tokenize().
tokenizer = TweetTokenizer()
# Silences pandas' SettingWithCopyWarning for the whole notebook.
pd.options.mode.chained_assignment = None

In [34]:
# Both files are read as Latin-1 text without a header row, so the columns
# are addressed by integer position (0..5) in the cells below.
_csv_opts = {'encoding': 'Latin-1', 'header': None}

# Only the first 100000 rows of the training file are loaded.
df_train = pd.read_csv('data/train.csv', nrows=100000, **_csv_opts)
df_test = pd.read_csv('data/test.csv', **_csv_opts)

In [35]:
# Preview the first rows of the raw training frame.
df_train.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
# Preview the first rows of the raw test frame.
df_test.head()

Unnamed: 0,0,1,2,3,4,5
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...


In [36]:
# Column dtypes and non-null counts for the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
0    100000 non-null int64
1    100000 non-null int64
2    100000 non-null object
3    100000 non-null object
4    100000 non-null object
5    100000 non-null object
dtypes: int64(2), object(4)
memory usage: 4.6+ MB


In [37]:
# Summary statistics for the numeric columns (0 and 1).
# NOTE(review): in the recorded output column 0 has mean 0 and std 0, i.e.
# every loaded row carries the same value there. Column 0 is later used as
# the target in train_test_split, so the first 100000 rows look like a
# single-class sample -- confirm, and consider sampling across the whole file.
df_train.describe()

Unnamed: 0,0,1
count,100000.0,100000.0
mean,0.0,1643454000.0
std,0.0,94531560.0
min,0.0,1467810000.0
25%,0.0,1558188000.0
50%,0.0,1678338000.0
75%,0.0,1695002000.0
max,0.0,1793821000.0


In [8]:
# Column dtypes and non-null counts for the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498 entries, 0 to 497
Data columns (total 6 columns):
0    498 non-null int64
1    498 non-null int64
2    498 non-null object
3    498 non-null object
4    498 non-null object
5    498 non-null object
dtypes: int64(2), object(4)
memory usage: 23.4+ KB


In [38]:
# Keep only column 0 (used as the target later) and column 5 (the tweet text).
# Selecting the kept columns, rather than dropping 1-4, gives the same frame
# but stays valid when the cell is re-run: drop() would raise KeyError once
# the columns are already gone.
df_train = df_train[[0, 5]]

In [39]:
# Keep only columns 0 and 5 of the test frame. Selecting the kept columns,
# rather than dropping 1-4, gives the same frame but stays valid when the
# cell is re-run: drop() would raise KeyError once the columns are gone.
df_test = df_test[[0, 5]]

In [40]:
def tokenize(tweet):
    """Lower-case and tokenize a tweet, dropping mentions, hashtags and links.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str or str
        The tokens produced by the module-level ``tokenizer`` that do not
        start with ``'@'``, ``'#'`` or ``'http'``. The sentinel string
        ``'NC'`` is returned when the tweet cannot be tokenized (for example
        a non-string cell), so callers can filter those rows out.
    """
    # Non-string cells (e.g. NaN) have no .lower(); flag them explicitly.
    if not isinstance(tweet, str):
        return 'NC'
    try:
        tokens = tokenizer.tokenize(tweet.lower())
    except Exception:
        # Best-effort: any tokenizer failure marks the tweet as 'NC'.
        # `except Exception` rather than a bare `except` so that interrupting
        # the kernel (KeyboardInterrupt) still stops a long progress_map run.
        return 'NC'
    # str.startswith accepts a tuple, so one pass removes all three kinds.
    return [t for t in tokens if not t.startswith(('@', '#', 'http'))]

In [41]:
def postprocess(data, n=1000000):
    """Tokenize the first ``n`` rows of ``data`` and drop untokenizable rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column ``5`` holds the tweet text.
    n : int, optional
        Maximum number of leading rows to process (default 1000000).

    Returns
    -------
    pandas.DataFrame
        A new frame with an added ``tokens`` column, without the rows for
        which ``tokenize`` returned the ``'NC'`` sentinel, and with a fresh
        0..k-1 index.
    """
    # Explicit copy: the new column is added to our own frame, so the result
    # does not depend on SettingWithCopyWarning being silenced globally.
    data = data.head(n).copy()
    # progress_map is Series.map plus a tqdm progress bar.
    data['tokens'] = data[5].progress_map(tokenize)
    data = data[data.tokens != 'NC']
    # drop=True discards the old index directly instead of materialising it
    # as a column and deleting it, which also works when the index is named.
    return data.reset_index(drop=True)

# Tokenize the training tweets; `data` gains a `tokens` column and loses any
# rows that tokenize() flagged as 'NC'.
data = postprocess(df_train)

progress-bar: 100%|██████████| 100000/100000 [00:07<00:00, 14166.03it/s]


In [42]:
# 80/20 split of the token lists (features) and column 0 (target).
# A fixed random_state makes the split, and everything trained on it,
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    data['tokens'], data[0], test_size=0.2, random_state=42
)

In [43]:
def labelizeTweets(tweets, label_type):
    """Wrap each token list in a ``LabeledSentence`` with a unique tag.

    Parameters
    ----------
    tweets : iterable of list of str
        Tokenized tweets.
    label_type : str
        Prefix for the generated tags, e.g. ``'TRAIN'`` or ``'TEST'``.

    Returns
    -------
    list
        One ``LabeledSentence(tokens, ['<label_type>_<i>'])`` per tweet, where
        ``i`` is the tweet's position in ``tweets``.
    """
    # tqdm wraps the iterable itself (not enumerate) so it can read its length
    # and show a percentage/ETA instead of a bare iteration counter.
    return [
        LabeledSentence(words, ['%s_%s' % (label_type, i)])
        for i, words in enumerate(tqdm(tweets))
    ]

# Replace the token-list series with lists of labelled sentences, tagged
# 'TRAIN_<i>' / 'TEST_<i>'.
# NOTE(review): both names are rebound to the function's own output, so
# re-running this cell without re-running the split would wrap the already
# labelled objects a second time.
x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')

  """
80000it [00:00, 126837.17it/s]
20000it [00:00, 232277.47it/s]


In [44]:
# Dimensionality of the learned word vectors. The previous value,
# len(x_train), was the number of training tweets (80000 here), which made
# every word vector 80000-dimensional -- unrelated to the embedding size and
# prohibitively large in memory and training time.
# NOTE: `size` is the gensim 3.x keyword (gensim 4 renamed it `vector_size`).
N_DIM = 200
tweet_w2v = Word2Vec(size=N_DIM, min_count=1)

In [None]:
# Collect the token list of every labelled training tweet (with a progress
# bar) and let the model build its vocabulary from them.
vocab_sentences = [sentence.words for sentence in tqdm(x_train)]
tweet_w2v.build_vocab(vocab_sentences)

100%|██████████| 80000/80000 [00:00<00:00, 1419998.90it/s]


In [None]:
# Train on the same token lists used for the vocabulary. The example count
# and epoch count are read back from the model itself.
training_sentences = [sentence.words for sentence in tqdm(x_train)]
tweet_w2v.train(
    training_sentences,
    total_examples=tweet_w2v.corpus_count,
    epochs=tweet_w2v.epochs,
)

100%|██████████| 80000/80000 [00:00<00:00, 995804.58it/s]


In [None]:
# Nearest neighbours of 'good' in the learned embedding space.
# Queries go through the model's KeyedVectors (`.wv`); calling most_similar
# on the model itself is deprecated in gensim and removed in gensim 4.
tweet_w2v.wv.most_similar('good')