### Co-Occurrence Matrix

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: five short sentences mentioning product_x / product_y.
# The strings are kept verbatim (spelling included) because their tokens
# become the row/column labels of the matrix printed further down.
docs = [
    "product_x is awesome",
    "product_x is better than product_y",
    "product_x is dissapointing",
    "product_y beats product_x by miles",
    "ill definitely recommend product_x over others",
]

In [3]:
# Count the words of every document, using scikit-learn's built-in English
# stop-word list to filter out noise words.

count_vectorizer = CountVectorizer(stop_words="english")
vectorized_matrix = count_vectorizer.fit_transform(raw_documents=docs)

In [4]:
# Multiply the transpose of the document-term matrix by the matrix itself to
# get a term-by-term matrix: entry (i, j) sums count_i * count_j over all
# documents, so the diagonal reflects a term's own counts rather than a
# co-occurrence with a different term.
# `@` is the explicit matrix-product operator; `*` only means matrix product
# for SciPy's legacy sparse-matrix classes and is element-wise for arrays.

co_occurence_matrix = vectorized_matrix.T @ vectorized_matrix

In [7]:
import pandas as pd

# Label both axes with the vocabulary terms so the matrix is readable.
# `get_feature_names()` was removed in scikit-learn 1.2 and the sparse `.A`
# shortcut is deprecated/removed in recent SciPy releases;
# `get_feature_names_out()` and `.toarray()` are the supported equivalents.
feature_names = count_vectorizer.get_feature_names_out()

print(pd.DataFrame(co_occurence_matrix.toarray(),
                   columns=feature_names,
                   index=feature_names))

               awesome  beats  better  definitely  dissapointing  ill  miles  \
awesome              1      0       0           0              0    0      0   
beats                0      1       0           0              0    0      1   
better               0      0       1           0              0    0      0   
definitely           0      0       0           1              0    1      0   
dissapointing        0      0       0           0              1    0      0   
ill                  0      0       0           1              0    1      0   
miles                0      1       0           0              0    0      1   
product_x            1      1       1           1              1    1      1   
product_y            0      1       1           0              0    0      1   
recommend            0      0       0           1              0    1      0   

               product_x  product_y  recommend  
awesome                1          0          0  
beats                



### Word2Vec

In [8]:
!pip install --upgrade gensim

Collecting gensim
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 1.4 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.2.0


In [9]:
# Word2Vec expects list of list representation of words, the outer list represents
# the sentence, while the inner list represents the individual words in a sentence
# Ex: ["I love NLP", "NLP is awesome"] -> [["I", "love", "NLP"], ["NLP", "is", "awesome"]]

import gensim
# Raw training sentences, one string each; they are tokenised in the next cell.
sentences = [
    "ML is awesome",
    "ML is a branch of AI",
    "ML and AI are used interchangably nowadays",
    "nlp is a branch and AI",
    "AI has fastforwarded nlp",
    "RL is also a branch of AI",
    "word2vec is a high dimensional vector space embedding",
    "word2vec falls under text representation for nlp",
]

In [12]:
# Tokenise every sentence into a list of words, giving the list-of-lists
# format that Word2Vec expects.
# `str.split()` with no argument splits on any run of whitespace, so repeated
# or leading/trailing spaces cannot produce empty "" tokens (which
# `split(" ")` would).

sentences_list = [sentence.split() for sentence in sentences]

print(sentences_list)

[['ML', 'is', 'awesome'], ['ML', 'is', 'a', 'branch', 'of', 'AI'], ['ML', 'and', 'AI', 'are', 'used', 'interchangably', 'nowadays'], ['nlp', 'is', 'a', 'branch', 'and', 'AI'], ['AI', 'has', 'fastforwarded', 'nlp'], ['RL', 'is', 'also', 'a', 'branch', 'of', 'AI'], ['word2vec', 'is', 'a', 'high', 'dimensional', 'vector', 'space', 'embedding'], ['word2vec', 'falls', 'under', 'text', 'representation', 'for', 'nlp']]


### GloVe

In [16]:
import gensim.downloader as api

# Fetch pre-trained GloVe vectors through gensim's downloader interface:
# "glove-twitter-25" is a 25-dimensional representation trained on 2 billion tweets.
# Details on this and other GloVe embeddings: <https://nlp.stanford.edu/projects/glove/>
# Heads-up: this downloads more than 100 MB.

twitter_glove = api.load(name="glove-twitter-25")



In [17]:
# Look up the ten words most similar to "modi".
# Note: the vocabulary is lowercase only, so a query containing upper-case
# letters fails with an out-of-vocabulary error.

twitter_glove.most_similar(positive=["modi"], topn=10)

[('kejriwal', 0.9501368403434753),
 ('bjp', 0.9385530352592468),
 ('arvind', 0.9274109601974487),
 ('narendra', 0.9249325394630432),
 ('nawaz', 0.9142388105392456),
 ('pmln', 0.9120966792106628),
 ('rahul', 0.9069461822509766),
 ('congress', 0.904523491859436),
 ('zardari', 0.8963413238525391),
 ('gujarat', 0.8910367488861084)]

In [18]:
# Show the raw embedding vector stored for "modi"
twitter_glove.get_vector("modi")

array([ 0.45668  ,  0.31166  , -0.61599  , -0.81725  , -1.1438   ,
       -1.943    , -1.1992   , -0.40083  ,  0.35442  ,  0.28049  ,
       -0.57553  , -0.10921  , -2.6641   ,  0.99871  , -0.0043145,
       -0.65479  , -0.59466  ,  0.50801  , -0.17169  , -0.21874  ,
        0.32743  ,  1.1411   , -0.62828  , -1.3242   , -1.7793   ],
      dtype=float32)

In [19]:
# Similarity score between the vectors of the two words
twitter_glove.similarity(w1="modi", w2="india")

0.7346285