# Import necessary libraries

In [1]:
import pandas as pd

# Data Loading and Preprocessing

In [2]:
data = pd.read_csv('/content/tweets.csv',encoding='latin-1')

In [3]:
data

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...
...,...,...,...
7915,7916,0,Live out loud #lol #liveoutloud #selfie #smile...
7916,7917,0,We would like to wish you an amazing day! Make...
7917,7918,0,Helping my lovely 90 year old neighbor with he...
7918,7919,0,Finally got my #smart #pocket #wifi stay conne...


In [4]:
pd.set_option('display.max_colwidth',None)
df = data[['label','tweet']]
df.columns = ['label','tweet']
df

Unnamed: 0,label,tweet
0,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperiasâ¦ http://instagram.com/p/YGEt5JC6JM/
2,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!
...,...,...
7915,0,Live out loud #lol #liveoutloud #selfie #smile #sony #music #headphones https://instagram.com/p/5spiNsJ_c9/
7916,0,"We would like to wish you an amazing day! Make every minute count #tls #today #iphone #accessories #news #life February 23, 2017 at 0â¦"
7917,0,Helping my lovely 90 year old neighbor with her iPad this morning has just made me realise that 'I' don't actually need an I pad!
7918,0,"Finally got my #smart #pocket #wifi stay connected anytime,anywhere! #ipad and #samsung #s3 #gadget # http://instagr.am/p/U-53G_vJU8/"


# Tweet Cleaning Function

In [250]:
from gensim.parsing.preprocessing import STOPWORDS
import re
def clean_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r'#\w+', '', tweet)
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)
    tweet = re.sub(r'\d+', '', tweet)
    tweet = re.sub(r'[^\w\s]', '', tweet)
    tweet_tokens = gensim.utils.simple_preprocess(tweet)
    tweet_tokens = [word for word in tweet_tokens if word not in STOPWORDS]
    return tweet_tokens

df['text_clean_gensim'] = df['tweet'].apply(clean_tweet)

df

Unnamed: 0,label,tweet,text_clean_gensim
0,0,Test,[test]
1,0,Finally a transparant silicon case Thanks to my uncle,"[finally, transparant, silicon, case, thanks, uncle]"
2,0,We love this Would you go,[love]
3,0,Im wired I know Im George I was made that way,"[im, wired, know, im, george, way]"
4,1,What amazing service Apple wont even talk to me about a question I have unless I pay them 1995 for their stupid support,"[amazing, service, apple, wont, talk, question, pay, stupid, support]"
...,...,...,...
7915,0,Live out loud,"[live, loud]"
7916,0,We would like to wish you an amazing day Make every minute count February 23 2017 at 0â,"[like, wish, amazing, day, minute, count, february]"
7917,0,Helping my lovely 90 year old neighbor with her iPad this morning has just made me realise that I dont actually need an I pad,"[helping, lovely, year, old, neighbor, ipad, morning, realise, dont, actually, need, pad]"
7918,0,Finally got my stay connected anytimeanywhere and,"[finally, got, stay, connected, anytimeanywhere]"


# Apply Cleaning and Split Data

In [179]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df['text_clean_gensim'],df['label'],test_size=0.2, random_state=42)

# Word2Vec Model Training

In [215]:
w2v_model = gensim.models.Word2Vec(
    sentences=X_train,
    vector_size=200,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
    epochs=20
)

In [216]:
w2v_model.wv['hello']

array([ 1.19148165e-01, -1.44769058e-01, -2.84182820e-02,  1.93492889e-01,
        6.10716492e-02, -1.10996835e-01, -1.55935297e-02,  3.99685204e-01,
       -2.37138614e-01,  8.21976587e-02, -1.88502938e-01, -2.13031247e-01,
        2.20063645e-02,  2.19985470e-01, -2.80959206e-03, -1.03778824e-01,
       -5.24738207e-02,  9.37898159e-02,  1.40687898e-01, -7.79898837e-02,
        2.45976374e-01, -2.36970425e-01, -1.29625604e-01,  3.73394862e-02,
       -3.80969942e-02, -3.02939147e-01, -4.67449985e-02, -1.46116003e-01,
       -2.61240840e-01,  1.58837624e-02,  2.28160024e-01,  6.71339855e-02,
       -2.79964376e-02,  1.95957739e-02,  9.25354660e-02,  1.59866914e-01,
        1.66654944e-01, -1.50180832e-01, -1.03527874e-01, -1.50413796e-01,
        5.50435297e-02,  3.43930139e-03, -1.05867647e-01,  2.29341611e-01,
        1.55144632e-01, -1.72742996e-02, -6.46688715e-02,  9.99954194e-02,
        1.37524471e-01,  4.51433659e-02,  9.28630978e-02, -1.40229523e-01,
        3.01853102e-02, -

In [217]:
len(w2v_model.wv['hello'])

200

In [218]:
w2v_model.wv.most_similar('hello')

[('toy', 0.9882174730300903),
 ('brand', 0.9852010011672974),
 ('arrived', 0.9807296991348267),
 ('camera', 0.9769953489303589),
 ('tv', 0.9700777530670166),
 ('smart', 0.9673795104026794),
 ('edge', 0.9666755199432373),
 ('yay', 0.9653140306472778),
 ('stallingborough', 0.9650667905807495),
 ('inch', 0.964811384677887)]

# Vectorization

In [219]:
words = w2v_model.wv.index_to_key

In [220]:
len(words)

8023

In [221]:
import numpy as np
X_train_vect = []
X_test_vect = []

for ls in X_train:
  vectors = []
  for word in ls:
    if word in words:
      vectors.append(w2v_model.wv[word])
  X_train_vect.append(np.array(vectors))

In [222]:
X_test_vect = []
for ls in X_test:
  vectors = []
  for word in ls:
    if word in words:
      vectors.append(w2v_model.wv[word])
  X_test_vect.append(np.array(vectors))

# Averaging Vectors

In [226]:
X_train_vect_avg = []
X_test_vect_avg = []
for v in X_train_vect:
  if v.size:
    X_train_vect_avg.append(v.mean(axis=0))
  else:
    X_train_vect_avg.append(np.zeros(200,dtype=float))

for v in X_test_vect:
  if v.size:
    X_test_vect_avg.append(v.mean(axis=0))
  else:
    X_test_vect_avg.append(np.zeros(200,dtype=float))

# Model Training and Evaluation

In [227]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

In [228]:
clf.fit(X_train_vect_avg,y_train)

In [229]:
from sklearn import metrics
y_pred = clf.predict(X_train_vect_avg)
metrics.accuracy_score(y_pred,y_train)

0.9987373737373737

# Prediction

In [245]:
twt = "Contemplating giving in to the iPhone bandwagon simply because Cellcom has no new Androids "
twt = gensim.utils.simple_preprocess(twt)

In [246]:
X_test1 = np.array([w2v_model.wv[i] for i in twt if i in words]).mean(axis=0)

In [247]:
y_pred = clf.predict([X_test1])
y_pred

array([1])