# Import Libraries

In [1]:
import tensorflow as tf
print(tf.__version__)

2.4.1


In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
import nltk
import pandas as pd
import gensim
from gensim.models import Word2Vec, KeyedVectors

# Data Preprocesing

In [None]:
# get a google pretrained model

# https://www.kaggle.com/rootuser/worldnews-on-reddit

# install kaggle API
! pip install -q kaggle

In [None]:
! cd ~/

In [None]:
! ls -l

total 80248
-rw-r--r-- 1 root root       64 Mar  3 00:38 kaggle.json
-rw-r--r-- 1 root root 82161571 Mar  3 00:43 reddit_worldnews_start_to_2016-11-22.csv
drwxr-xr-x 1 root root     4096 Mar  1 14:35 sample_data


In [48]:
! pwd

/content


In [None]:
# create a directory as kaggle
! mkdir -p ~/.kaggle

In [None]:
# copy API key to kaggle directory
! cp kaggle.json ~/.kaggle

In [None]:
# disable the API key
! chmod 600 /root/.kaggle/kaggle.json

In [None]:
# to download directly
! kaggle datasets download -d rootuser/worldnews-on-reddit

In [4]:
df = pd.read_csv('/content/reddit_worldnews_start_to_2016-11-22.csv')
df.head()

FileNotFoundError: ignored

In [50]:
# take the title column
news_titles = df['title'].values

In [51]:
news_titles

array(['Scores killed in Pakistan clashes',
       'Japan resumes refuelling mission',
       'US presses Egypt on Gaza border', ...,
       'Professor receives Arab Researchers Award',
       'Nigel Farage attacks response to Trump ambassador tweet',
       'Palestinian wielding knife shot dead in West Bank: Israel police'],
      dtype=object)

In [52]:
df.shape

(509236, 8)

In [53]:
# tokenize the titles
new_vec = [nltk.word_tokenize(title) for title in news_titles]

In [54]:
new_vec[0]

['Scores', 'killed', 'in', 'Pakistan', 'clashes']

# Build the Model

In [55]:
# text, mins word count, size of each vector

model = Word2Vec(new_vec, min_count=1, size = 32)

# Make Predictions

In [56]:
# find the 10 closest words in the vector space that was created
model.wv.most_similar('man')
model.wv['man']

array([ 3.6744542 ,  0.8593975 , -2.1986523 ,  0.7193417 ,  0.3725205 ,
       -0.79489094, -2.415703  ,  0.06890036,  1.5766871 , -2.5533304 ,
       -4.2389956 , -1.0188018 ,  0.07964461,  2.052519  , -3.042821  ,
        0.94223654, -4.884871  , -4.715568  , -3.27172   ,  1.087353  ,
        1.684153  , -5.364921  ,  3.5274348 , -0.967145  ,  6.724106  ,
        1.417519  , -3.0506818 , -1.4823015 ,  1.3240402 , -0.65318   ,
        0.2262108 ,  0.5866153 ], dtype=float32)

In [57]:
model.wv.most_similar('man')

[('woman', 0.9751670360565186),
 ('girl', 0.9166860580444336),
 ('couple', 0.9041752815246582),
 ('teenager', 0.9005223512649536),
 ('boy', 0.8992596864700317),
 ('mother', 0.893301784992218),
 ('doctor', 0.8723492622375488),
 ('father', 0.8568341732025146),
 ('teacher', 0.8529676198959351),
 ('pair', 0.8410673141479492)]

In [58]:
# famous relationship should get some version of woman back
vec = model.wv['king'] - model.wv['man'] + model.wv['women']
model.wv.most_similar([vec])

[('women', 0.7090500593185425),
 ('minorities', 0.6739330887794495),
 ('freedoms', 0.670643150806427),
 ('equal', 0.6660119295120239),
 ('clerics', 0.6650686264038086),
 ('religious', 0.6606276035308838),
 ('couples', 0.6549668312072754),
 ('headscarves', 0.6513099670410156),
 ('lacy', 0.6494085192680359),
 ('LGBT', 0.6492511034011841)]

In [59]:
# another relationship should get France Back
vec = model.wv['Germany'] - model.wv['Berlin'] + model.wv['Paris']
model.wv.most_similar([vec])

[('France', 0.876397967338562),
 ('Belgium', 0.8687744140625),
 ('Paris', 0.8177279233932495),
 ('Sweden', 0.8103444576263428),
 ('Germany', 0.8079019784927368),
 ('Brussels', 0.7726388573646545),
 ('Britain', 0.7675787806510925),
 ('Turkey', 0.7575868368148804),
 ('UK', 0.7574286460876465),
 ('Norway', 0.721234142780304)]

In [60]:
# Messi is God of Footall, who is God of Cricket
vec = model.wv['Messi'] - model.wv['Football'] + model.wv['Cricket']
model.wv.most_similar([vec])

[('granddad', 0.7719060182571411),
 ('bin-Laden', 0.7424380779266357),
 ('Jacenko', 0.7353984713554382),
 ('ruptures', 0.7342279553413391),
 ('2-yr-old', 0.7298926115036011),
 ('supermodels', 0.7246538400650024),
 ('292', 0.7226129770278931),
 ('BB', 0.7193324565887451),
 ('210', 0.709683895111084),
 ('Shopping-Related', 0.709388792514801)]

In [None]:
# get a pretrained model 
# each word is a vector of size 300
# https://www.kaggle.com/umbertogriffo/googles-trained-word2vec-model-in-python
model = KeyedVectors.load_word2vec_format('/content/GoogleNews-vectors-negatve.bin', binary=True, limit=100000)


In [None]:
# find the 10 closest words to the verctore space
model.most_similar('man')

In [None]:
# see the vector
model['man']

In [None]:
# famous relationship should get some version of woman back
vec = model['king'] - model['man'] + model['women']
model.most_similar([vec])

In [None]:
# another relationship should get France Back
vec = model['Germany'] - model['Berlin'] + model['Paris']
model.most_similar([vec])

In [None]:
# Messi is God of Footall, who is God of Cricket
vec = model['Messi'] - model['Football'] + model['Cricket']
model.most_similar([vec])