In [1]:
import pandas as pd
import numpy as np
import re

import plotly.express as ex

from gensim.models.word2vec import Word2Vec
from tqdm.notebook import tqdm

from sklearn.manifold import TSNE

### Content

It contains the following 6 fields:

```
target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
ids: The id of the tweet ( 2087)
date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
flag: The query (lyx). If there is no query, then this value is NO_QUERY.
user: the user that tweeted (robotickilldozr)
text: the text of the tweet (Lyx is cool)
```

In [2]:
df = pd.read_csv("./data.csv",encoding = "ISO-8859-1",names=["target","ids","date","flag","user","text"])

In [3]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
for key in ['ids','date','flag','user']:
    del df[key]

In [5]:
df.text.head()

0    @switchfoot http://twitpic.com/2y1zl - Awww, t...
1    is upset that he can't update his Facebook by ...
2    @Kenichan I dived many times for the ball. Man...
3      my whole body feels itchy and like its on fire 
4    @nationwideclass no, it's not behaving at all....
Name: text, dtype: object

### Cleaning Text

#### Tasks

1. Removing @usernames
2. Removing urls
3. Extracting Quoted Text
4. Replacing emojis

In [6]:
sample = df.sample(n=512)['text'].copy()
sample

262394                                        cousins bday. 
890372     Today Hmmmm. ... got up got dinner done make u...
817049                 @benshephard Welcome to Scotland..  x
217083      in bed thinking about how much i miss my sister 
292664                  really dont get the design argument 
                                 ...                        
1094450    @robertkneschke A system that does not change ...
1065480    did not shower the whooooleeee day! and did no...
343160     @stevyncolgan gone stale      not the same eve...
1276694    today started poorly, and ended wonderfully.  ...
709614                                  @mcdermr I'm sorry. 
Name: text, Length: 512, dtype: object

In [7]:
USER_RE = re.compile(r'@\w+')
URL_RE = re.compile(r'(http|https)://(\w+\.)*(\w+/|\w+)*')
QUOTE_RE = re.compile(r"'([a-z ]+)'")
EMOJI_RE = re.compile(r'([-:;]+)([()DoO0]+)')
ABBR_RE = re.compile(r'\.')
SYMBOL_RE = re.compile(r'[\?\-!#$%@^&*(){}\[\],.<>\"|\\:;]')

WHITESPACE_RE = re.compile(r' +')
TOKEN_RE = re.compile(r'(token)(\w+)')

In [8]:
def apply_re(x):
    x = x.lower()
    x = USER_RE.sub('tokenusername',x)
    x = URL_RE.sub('tokenurl',x)
    x = EMOJI_RE.sub('tokenemoji',x)
    
    x = QUOTE_RE.sub(r'\g<1>',x)
    
    x = ABBR_RE.sub('',x)
    x = SYMBOL_RE.sub(' ',x)
    x = WHITESPACE_RE.sub(' ',x)
    
    return TOKEN_RE.sub(r'<\g<2>>',x)

sample.apply(apply_re)

262394                                         cousins bday 
890372     today hmmmm got up got dinner done make up hai...
817049                      <username> welcome to scotland x
217083      in bed thinking about how much i miss my sister 
292664                  really dont get the design argument 
                                 ...                        
1094450    <username> a system that does not change is dead 
1065480    did not shower the whooooleeee day and did not...
343160     <username> gone stale not the same even with j...
1276694    today started poorly and ended wonderfully i'm...
709614                                 <username> i'm sorry 
Name: text, Length: 512, dtype: object

In [9]:
%%time

df['text'] = df.text.apply(apply_re)

CPU times: user 32.8 s, sys: 295 ms, total: 33.1 s
Wall time: 33.1 s


In [10]:
df.text.head()

0    <username> <url> awww that's a bummer you shou...
1    is upset that he can't update his facebook by ...
2    <username> i dived many times for the ball man...
3      my whole body feels itchy and like its on fire 
4    <username> no it's not behaving at all i'm mad...
Name: text, dtype: object

In [11]:
df.to_csv("./clean_text.csv",index=False) 

### Word Vectors

In [46]:
sentences = df.text.apply(lambda x:x.split(" ")).values
max_len = max([len(s) for s in sentences])

In [47]:
%%time

wv = Word2Vec(sentences=sentences,size=64,workers=8).wv

CPU times: user 3min 43s, sys: 926 ms, total: 3min 44s
Wall time: 1min 28s


In [49]:
wv.most_similar('hello')

[('hi', 0.8436288237571716),
 ('howdy', 0.7102457284927368),
 ('hola', 0.7050547003746033),
 ("g'morning", 0.6639456748962402),
 ('goodmorning', 0.6483098864555359),
 ('hey', 0.6125234365463257),
 ('ello', 0.6033692359924316),
 ("g'day", 0.5862616300582886),
 ('welcome', 0.5669145584106445),
 ('hiya', 0.5635590553283691)]

### Projecting Word Embeddings

In [68]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import *

In [100]:
def dense(x,units,activation=LeakyReLU(0.5)):
    x = Dense(units,use_bias=False,)(x)
    x = BatchNormalization()(x)
    x = Activation(activation)(x)
    
    return x

emb_in = Input(shape=(64,))

e = dense(emb_in,128,)
e = dense(e,64)
e = dense(e,64)
e = dense(e,32)
e = dense(e,32)
e = dense(e,32)
e = dense(e,16)
e = dense(e,16)
e = dense(e,16)
e = dense(e,16)

emb_d = dense(e,3)

d = dense(emb_d,16)
e = dense(e,16)
e = dense(e,16)
e = dense(e,16)
d = dense(d,32)
d = dense(d,32)
d = dense(d,32)
d = dense(d,64)
d = dense(d,64)
d = dense(d,64)

model = Model(emb_in,d)
emb_gen = Model(emb_in,emb_d)

In [101]:
opt = keras.optimizers.Adam(0.00009)
loss = keras.losses.MeanAbsoluteError()

model.compile(optmizer=opt,loss=loss)

In [102]:
model.fit(x=wv.vectors,y=wv.vectors,batch_size=64,epochs=3)

Train on 58455 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f60c072ff98>

In [103]:
embeddings = emb_gen.predict(wv.vectors,batch_size=64,verbose=1)



In [104]:
fig = ex.scatter_3d(x=embeddings[:,0],y=embeddings[:,1],z=embeddings[:,2],hover_name=wv.index2word)

In [105]:
with open("./projections.html","w+") as file:
    file.write(fig.to_html())