# SF-DAT-21 |Tweets & Word2Vec (Lab)

# Gensim

Gensim (http://radimrehurek.com/gensim) is a library of language processing tools focused on latent variable models of text.

In [55]:
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import feature_extraction
from gensim import matutils, models

pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

%matplotlib inline
plt.style.use('ggplot')

The data is about sentiments on Amazon reviews.

In [56]:
content = []

with open(os.path.join('..', 'datasets', 'tweets.txt')) as f:
    for line in f.readlines():
        line = line.strip('\n')#
        content.append(line.lower())

#df = pd.DataFrame({'content': content})

In [57]:
df = pd.DataFrame({'content': content})

In [58]:
df.head()

Unnamed: 0,content
0,i made a(n) small tourmaline in paradise islan...
1,rt @purelovebeast: -เช็ครายละเอียด- 27th birth...
2,https://t.co/eofbdvqufo
3,@vigigu google it :) simple
4,nerd ass girl https://t.co/t7kdirxpel


In [59]:
df.dropna(inplace = True) # Let's drop the NaN

In [60]:
df.head()

Unnamed: 0,content
0,i made a(n) small tourmaline in paradise islan...
1,rt @purelovebeast: -เช็ครายละเอียด- 27th birth...
2,https://t.co/eofbdvqufo
3,@vigigu google it :) simple
4,nerd ass girl https://t.co/t7kdirxpel


## LDA with Gensim

### Let's first translate a set of documents (articles) into a matrix representation with a row per document and a column per feature (word or n-gram)

In [84]:
vectorizer = feature_extraction.text.CountVectorizer(stop_words = 'english')

In [85]:
documents = vectorizer.fit_transform(df.content)

In [63]:
df.content

0       i made a(n) small tourmaline in paradise islan...
1       rt @purelovebeast: -เช็ครายละเอียด- 27th birth...
2                                 https://t.co/eofbdvqufo
3                             @vigigu google it :) simple
4                  nerd ass girl  https://t.co/t7kdirxpel
                              ...                        
4888    tesla rival may be producing a batmobile-like ...
4889    mercedes-benz plans four new electric #tesla f...
4890    rt @hblodget: new tesla competitor is launchin...
4891    spacex rocket in great shape after historic la...
4892    new photos of spacex booster show sooty but un...
Name: content, dtype: object

In [78]:
import re
j = 0
for i in df.content:
    df.content[j] = re.sub(r"http\S+", "", i)
    j = j + 1

In [80]:
df.content

0       new photos of spacex booster show sooty but un...
1       rt @purelovebeast: -เช็ครายละเอียด- 27th birth...
2                                                        
3                             @vigigu google it :) simple
4                                         nerd ass girl  
                              ...                        
4888    tesla rival may be producing a batmobile-like ...
4889    mercedes-benz plans four new electric #tesla f...
4890    rt @hblodget: new tesla competitor is launchin...
4891    spacex rocket in great shape after historic la...
4892    new photos of spacex booster show sooty but un...
Name: content, dtype: object

In [66]:
text = "this is an https://URL.com blah blah blah"
result = re.sub(r"http\S+", "", text)
result

'this is an  blah blah blah'

In [81]:
df.content

0       new photos of spacex booster show sooty but un...
1       rt @purelovebeast: -เช็ครายละเอียด- 27th birth...
2                                                        
3                             @vigigu google it :) simple
4                                         nerd ass girl  
                              ...                        
4888    tesla rival may be producing a batmobile-like ...
4889    mercedes-benz plans four new electric #tesla f...
4890    rt @hblodget: new tesla competitor is launchin...
4891    spacex rocket in great shape after historic la...
4892    new photos of spacex booster show sooty but un...
Name: content, dtype: object

In [86]:
# Let's now build a mapping of numerical ID to word

id2word = dict(enumerate(vectorizer.get_feature_names()))

In [87]:
id2word

{0: u'00',
 1: u'000',
 2: u'00am',
 3: u'01',
 4: u'0161',
 5: u'01c',
 6: u'02',
 7: u'034',
 8: u'039',
 9: u'04',
 10: u'0430mayi',
 11: u'0502',
 12: u'0574',
 13: u'06',
 14: u'07',
 15: u'10',
 16: u'100',
 17: u'1000',
 18: u'100000',
 19: u'1000s',
 20: u'100bn',
 21: u'100d',
 22: u'100esquses',
 23: u'100mp',
 24: u'100s',
 25: u'1010lt',
 26: u'1028',
 27: u'1091',
 28: u'10pm',
 29: u'11',
 30: u'11000',
 31: u'110k',
 32: u'111',
 33: u'1159pm',
 34: u'1159pmpactm',
 35: u'118',
 36: u'11sixedits',
 37: u'12',
 38: u'1213',
 39: u'123000',
 40: u'125',
 41: u'127',
 42: u'1279',
 43: u'1282048b33ba483',
 44: u'12me',
 45: u'13',
 46: u'13000',
 47: u'131',
 48: u'1311',
 49: u'132k',
 50: u'14',
 51: u'1439',
 52: u'145',
 53: u'1451886241',
 54: u'148',
 55: u'14gb',
 56: u'14th',
 57: u'15',
 58: u'150',
 59: u'1540',
 60: u'1588days',
 61: u'158b78679ddc4b6',
 62: u'15gb',
 63: u'16',
 64: u'16000',
 65: u'16gb',
 66: u'16mile',
 67: u'17',
 68: u'1723091184',
 69: u'1

In [70]:
\

### We want to learn which columns are correlated (i.e., likely to come from the same topic).  This is the word distribution.  We can also determine what topics are in each document, the topic distribution.

In [88]:
# First we convert our word-matrix into gensim's format

corpus = matutils.Sparse2Corpus(documents, documents_columns = False)

(Check https://radimrehurek.com/gensim/matutils as needed)

In [89]:
corpus

<gensim.matutils.Sparse2Corpus at 0xa5f66d8>

(Check https://radimrehurek.com/gensim/models/ldamodel as needed)

In [90]:
# Then we fit an LDA model

model = models.ldamodel.LdaModel(corpus = corpus, num_topics = 15, id2word = id2word, passes = 10)

In this model, we need to explicitly specify the number of topic we want the model to uncover.  This is a critical parameter, but there isn't much guidance on how to choose it.  Try to use domain expertise where possible.

In [91]:
model

<gensim.models.ldamodel.LdaModel at 0xafc8748>

### Goodness of fit

Now we need to assess the goodness of fit for our model.  Like other unsupervised learning techniques, our validation techniques are mostly about interpretation.

Use the following questions to guide you:
- Did we learn reasonable topics?
- Do the words that make up a topic make sense?
- Is this topic helpful towards our goal?

In [92]:
model.print_topics()

[(14,
  u'0.078*just + 0.030*google + 0.023*afghanistan + 0.019*gen + 0.017*lawyer + 0.017*pinned + 0.016*rt + 0.012*android + 0.011*androidgames + 0.011*gameinsight'),
 (6,
  u'0.049*microsoft + 0.036*google + 0.019*cancer + 0.018*reason + 0.015*windows + 0.014*uk + 0.012*best + 0.012*years + 0.011*rt + 0.010*10'),
 (8,
  u'0.046*google + 0.041*today + 0.037*play + 0.027*app + 0.025*store + 0.024*rt + 0.024*darksummoner + 0.023*libya + 0.022*king + 0.022*news'),
 (5,
  u'0.069*google + 0.026*good + 0.018*rt + 0.018*deal + 0.015*nexus + 0.015*start + 0.013*data + 0.013*ve + 0.012*youtube + 0.012*solar'),
 (10,
  u'0.024*rt + 0.023*google + 0.017*bid + 0.012*ukraine + 0.012*mtp + 0.011*fns + 0.010*usa + 0.010*android + 0.010*palestine + 0.008*israel'),
 (1,
  u'0.069*iran + 0.066*rt + 0.065*saudi + 0.034*arabia + 0.020*syria + 0.019*ties + 0.014*war + 0.014*amp + 0.011*saudiarabia + 0.010*world'),
 (4,
  u'0.041*israel + 0.024*rt + 0.019*amp + 0.012*gt + 0.011*state + 0.010*google + 0.0

Some topics will be clearer than others.  The following topics represent clear concepts:
- Cooking and Recipes: 0.009 \* cup + 0.009 \* recipe + 0.007 \* make + 0.007 \* food + 0.006 \* sugar
- Cooking and recipes: 0.013 \* butter + 0.010 \* baking + 0.010 \* dough + 0.009 \* cup + 0.009 \* sugar
- Fashion and Style: 0.013 \* fashion + 0.006 \* like + 0.006 \* dress + 0.005 \* style

## Word2Vec with Gensim

In [93]:
# Setup the body text
sentences = df.content.map(lambda content: content.split())

In [94]:
sentences

0       [new, photos, of, spacex, booster, show, sooty...
1       [rt, @purelovebeast:, -เช็ครายละเอียด-, 27th, ...
2                                                      []
3                       [@vigigu, google, it, :), simple]
4                                       [nerd, ass, girl]
                              ...                        
4888    [tesla, rival, may, be, producing, a, batmobil...
4889    [mercedes-benz, plans, four, new, electric, #t...
4890    [rt, @hblodget:, new, tesla, competitor, is, l...
4891    [spacex, rocket, in, great, shape, after, hist...
4892    [new, photos, of, spacex, booster, show, sooty...
Name: content, dtype: object

In [95]:
model = models.Word2Vec(sentences, size = 100, window = 5, min_count = 5, workers = 4)



`Word2Vec` has many arguments:
- `size` represents how many concepts or topics we should use
- `window` represents how many words surrounding a sentence we should use as our original feature
- `min_count` is the number of times that context or word must appear
- `workers` is the number of CPU cores to use to speed up model training

(Check http://radimrehurek.com/gensim/models/word2vec as needed)

In [96]:
model

<gensim.models.word2vec.Word2Vec at 0xa60e6a0>

### Most similar words

The model has a `most_similar` function that helps find the words most similar to the one you queried.  This will return words that are most often used in the same context.

In [103]:
model.most_similar(positive = ['syria'])

[('over', 0.9998472929000854),
 ('after', 0.999840497970581),
 ('who', 0.999830424785614),
 ('people', 0.9998241662979126),
 ('2016', 0.9998232126235962),
 ('#news', 0.9998089671134949),
 ('has', 0.9998032450675964),
 ('via', 0.9998021125793457),
 ("i'm", 0.9997999668121338),
 ('an', 0.9997988939285278)]

In [98]:
vectorizer.get_feature_names()

[u'00',
 u'000',
 u'00am',
 u'01',
 u'0161',
 u'01c',
 u'02',
 u'034',
 u'039',
 u'04',
 u'0430mayi',
 u'0502',
 u'0574',
 u'06',
 u'07',
 u'10',
 u'100',
 u'1000',
 u'100000',
 u'1000s',
 u'100bn',
 u'100d',
 u'100esquses',
 u'100mp',
 u'100s',
 u'1010lt',
 u'1028',
 u'1091',
 u'10pm',
 u'11',
 u'11000',
 u'110k',
 u'111',
 u'1159pm',
 u'1159pmpactm',
 u'118',
 u'11sixedits',
 u'12',
 u'1213',
 u'123000',
 u'125',
 u'127',
 u'1279',
 u'1282048b33ba483',
 u'12me',
 u'13',
 u'13000',
 u'131',
 u'1311',
 u'132k',
 u'14',
 u'1439',
 u'145',
 u'1451886241',
 u'148',
 u'14gb',
 u'14th',
 u'15',
 u'150',
 u'1540',
 u'1588days',
 u'158b78679ddc4b6',
 u'15gb',
 u'16',
 u'16000',
 u'16gb',
 u'16mile',
 u'17',
 u'1723091184',
 u'1763210325',
 u'1783',
 u'17carat',
 u'18',
 u'1800',
 u'1810',
 u'1811',
 u'1862185791',
 u'1865846254',
 u'19',
 u'1942',
 u'1957_tintin_',
 u'1970s',
 u'1977755529',
 u'1979',
 u'1980s',
 u'1989',
 u'1992',
 u'1998',
 u'1999',
 u'19th',
 u'1d',
 u'1evilidiot',
 u'1gb'

In [105]:
sentences

0       [new, photos, of, spacex, booster, show, sooty...
1       [rt, @purelovebeast:, -เช็ครายละเอียด-, 27th, ...
2                                                      []
3                       [@vigigu, google, it, :), simple]
4                                       [nerd, ass, girl]
                              ...                        
4888    [tesla, rival, may, be, producing, a, batmobil...
4889    [mercedes-benz, plans, four, new, electric, #t...
4890    [rt, @hblodget:, new, tesla, competitor, is, l...
4891    [spacex, rocket, in, great, shape, after, hist...
4892    [new, photos, of, spacex, booster, show, sooty...
Name: content, dtype: object

In [107]:
sentences = list(map(lambda sentence: list(filter(lambda word: word in vectorizer.get_feature_names(), sentence)), sentences))

  if __name__ == '__main__':


In [108]:
sentences

[['new', 'photos', 'spacex', 'booster', 'sooty', 'undamaged', 'rocket'],
 ['rt', '27th', 'birthday', 'special', 'goods', '3d', 'yoseop', 'usb'],
 [],
 ['google', 'simple'],
 ['nerd', 'ass', 'girl'],
 ['leadcorp', 'media'],
 ['rt', 'chiemoney', 'use', 'google', 'learn'],
 ['morning',
  'bro',
  'new',
  'video',
  'need',
  'yr',
  'help',
  'im',
  'talented',
  'bt',
  'lack',
  'manager',
  'bro'],
 ['google', 'play', 'gift', 'card', 'code'],
 ['claim', 'google', 'play', 'gift', 'card'],
 ['devs', 'sport', 'come'],
 [],
 [],
 ['king', 'dark', 'summon', 'app', 'google'],
 ['king', 'dark', 'summon', 'app', 'google'],
 ['read', 'published', 'article', 'russell', 'simmons', 'wrote', 'google'],
 ['entered', 'win', 'google', 'nexus', '6p'],
 ['saw', 'online', 'like', 'google', 'reviews'],
 ['rt', 'entered', 'win', 'google', 'nexus', '6p'],
 ['yayyyyy', 'thanks', 'follow'],
 ['love', 'google', 'plus', 'page'],
 ['google', 'read', 'ton', 'articles', 'just', 'searched', 'youtube', 'shit'],
 [

In [109]:
model = models.Word2Vec(sentences, size = 100, window = 3, min_count = 5, workers = 4)



In [113]:
model.most_similar(positive = ['girl'])

[('million', 0.9831863641738892),
 ('money', 0.98276287317276),
 ('protect', 0.9826767444610596),
 ('oil', 0.9824026226997375),
 ('alerts', 0.9820623397827148),
 ('tehran', 0.9820042848587036),
 ('proactive', 0.9819892644882202),
 ('injured', 0.9819713830947876),
 ('nice', 0.9819350242614746),
 ('smart', 0.9818744659423828)]