### NLP Naive BoW, TF, TF-IDF and Word & sentence Embeddings using IMDB dataset from Kaggle 

In [1]:
import warnings
# Silence every warning for the rest of the session to keep cell output compact.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below — confirm that is intended.
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import nltk

In [2]:
# Load the IMDB movie dataset; the relative path is resolved against the current working directory.
df = pd.read_csv("IMDB-Movie-Data.csv")
df.head(2)  # Preview the first two rows

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0


#### BoW using TransactionEncoder() from Machine Learning Extensions (mlxtend)

In [3]:
# Use Genre column: split each comma-separated genre string into a list of genre tokens
data = df['Genre'].str.split(',')
data.loc[0:5] # Display the first 6 rows (.loc label slicing includes both endpoints)

0     [Action, Adventure, Sci-Fi]
1    [Adventure, Mystery, Sci-Fi]
2              [Horror, Thriller]
3     [Animation, Comedy, Family]
4    [Action, Adventure, Fantasy]
5    [Action, Adventure, Fantasy]
Name: Genre, dtype: object

In [4]:
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
# Fit and encode once, then reuse the array for both views instead of re-fitting per print.
genre_onehot = te.fit_transform(data)
print(genre_onehot)               #Returns a boolean table
print(genre_onehot.astype(int))   #Returns int table
# 0 --> Absence of a feature
# 1 --> Presence

[[ True  True False ... False False False]
 [False  True False ... False False False]
 [False False False ...  True False False]
 ...
 [False False False ... False False False]
 [False  True False ... False False False]
 [False False False ... False False False]]
[[1 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [5]:
te.columns_ #Unique column names (all genres determined from the input); used below as the DataFrame column labels

['Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Thriller',
 'War',
 'Western']

In [6]:
# Wrap the encoded genre table in a DataFrame labelled with the genre names
te.fit(data)
genre_flags = te.transform(data)
tr_data = pd.DataFrame(genre_flags.astype(int), columns=te.columns_)
tr_data

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
3,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0
998,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


> Note: TransactionEncoder() returns only the presence (True) or absence (False) for each column. Even if a word is repeated multiple times in a piece of text, it will just return a True for the word.

#### Term Frequency (TF)

> To make a term frequency (count of each word) matrix, you can use CountVectorizer() from sklearn

> It is important to note that the CountVectorizer() expects the input as a list of strings or an array of strings.

In [7]:
# Use Description column
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary and count word occurrences for every description
descriptions = df['Description'].values
cv = CountVectorizer()
term_counts = cv.fit_transform(descriptions)
term_counts.todense()  # Convert the sparse result to a dense representation for display

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [8]:
cv.vocabulary_  # The unique words are stored in a dictionary
# The key is the word, the value is the index of that word's column in the count matrix

{'group': 2331,
 'of': 3643,
 'intergalactic': 2751,
 'criminals': 1276,
 'are': 345,
 'forced': 2094,
 'to': 5288,
 'work': 5839,
 'together': 5291,
 'stop': 4944,
 'fanatical': 1954,
 'warrior': 5698,
 'from': 2166,
 'taking': 5107,
 'control': 1194,
 'the': 5212,
 'universe': 5513,
 'following': 2085,
 'clues': 1049,
 'origin': 3706,
 'mankind': 3230,
 'team': 5141,
 'finds': 2038,
 'structure': 4979,
 'on': 3669,
 'distant': 1551,
 'moon': 3442,
 'but': 812,
 'they': 5231,
 'soon': 4826,
 'realize': 4216,
 'not': 3599,
 'alone': 240,
 'three': 5259,
 'girls': 2256,
 'kidnapped': 2933,
 'by': 818,
 'man': 3218,
 'with': 5807,
 'diagnosed': 1492,
 '23': 75,
 'distinct': 1552,
 'personalities': 3870,
 'must': 3499,
 'try': 5410,
 'escape': 1822,
 'before': 571,
 'apparent': 319,
 'emergence': 1732,
 'frightful': 2164,
 'new': 3563,
 '24th': 78,
 'in': 2658,
 'city': 1006,
 'humanoid': 2575,
 'animals': 288,
 'hustling': 2595,
 'theater': 5213,
 'impresario': 2652,
 'attempt': 428,
 's

In [9]:
# You can see that the dictionary above is not ordered by its values (indexes).
# sorted() on a dict returns its keys (the words) in ascending order.
sorted(cv.vocabulary_)

['000',
 '007',
 '10',
 '100',
 '10th',
 '12',
 '12th',
 '13',
 '1408',
 '15',
 '1630s',
 '17',
 '17th',
 '1820',
 '1820s',
 '19',
 '1920',
 '1920s',
 '1930s',
 '1931',
 '1942',
 '1944',
 '1945',
 '1949',
 '1950s',
 '1951',
 '1952',
 '1954',
 '1956',
 '1960s',
 '1962',
 '1965',
 '1967',
 '1969',
 '1970',
 '1970s',
 '1972',
 '1974',
 '1978',
 '1979',
 '1980',
 '1980s',
 '1982',
 '1984',
 '1985',
 '1987',
 '1988',
 '1990',
 '1990s',
 '1992',
 '1996',
 '1997',
 '1998',
 '19th',
 '20',
 '200',
 '2000s',
 '2001',
 '2004',
 '2005',
 '2008',
 '2009',
 '2010',
 '2011',
 '2013',
 '2014',
 '2018',
 '2027',
 '2028',
 '2057',
 '2074',
 '2084',
 '20th',
 '21',
 '2154',
 '23',
 '2307',
 '24',
 '24th',
 '25',
 '27',
 '300',
 '40',
 '400',
 '47',
 '480',
 '51',
 '59',
 '600',
 '60s',
 '70',
 '70s',
 '74th',
 '80',
 '90',
 'aang',
 'aaron',
 'abandoned',
 'abandons',
 'abba',
 'abby',
 'abducted',
 'abilities',
 'ability',
 'aboard',
 'about',
 'above',
 'abroad',
 'absence',
 'absolutely',
 'absorbed'

In [10]:
# Putting everything in a dataframe: one row per description, one column per vocabulary word
words_by_column = sorted(cv.vocabulary_, key=cv.vocabulary_.get)  # words ordered by their column index
count_matrix = cv.transform(df['Description'].values).todense()
str_df = pd.DataFrame(count_matrix, columns=words_by_column)
str_df

Unnamed: 0,000,007,10,100,10th,12,12th,13,1408,15,...,zamperini,zealand,zero,zeus,zodiac,zombie,zombies,zone,zubrowka,zuckerberg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Using TF-IDF
> CountVectorizer simply counts the number of times a word appears in a document (using a bag-of-words approach), while TF-IDF Vectorizer takes into account not only how many times a word appears in a document but also how important that word is to the whole corpus.

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Only the first six descriptions (.loc[:5] is inclusive) are used to learn the vocabulary and IDF weights
sample_descriptions = df.loc[:5, 'Description'].values
fidf = TfidfVectorizer()
fidf.fit(sample_descriptions)
fidf.transform(sample_descriptions).todense()  # Dense view of the TF-IDF scores for display

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.17765115,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.25660546, 0.        ,
         0.25660546, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.25660546, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.25660546, 0.        , 0.        , 0.21042015, 0.        ,
         0.        , 0.        , 0.        , 0.25660546, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.25660546, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0

In [12]:
#Putting everything in a dataframe
# NOTE(review): fidf was fitted on rows 0-5 only (see the cell above), so every description is scored
# against that small vocabulary and its IDF weights; words outside it are ignored — confirm this is intended.
tfidf_df = pd.DataFrame(fidf.transform(df['Description'].values).todense(), columns = sorted(fidf.vocabulary_))
tfidf_df

Unnamed: 0,23,24th,against,agency,alone,animals,anticipates,apocalypse,apparent,are,...,together,try,universe,villains,wall,warrior,will,with,work,world
0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.177651,...,0.256605,0.000000,0.256605,0.000000,0.0,0.256605,0.000000,0.000000,0.256605,0.000000
1,0.000000,0.000000,0.0,0.000000,0.230476,0.000000,0.000000,0.000000,0.000000,0.159562,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.220967,0.220967,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.220967,0.152978,...,0.000000,0.220967,0.000000,0.000000,0.0,0.000000,0.000000,0.181196,0.000000,0.000000
3,0.000000,0.000000,0.0,0.000000,0.000000,0.175828,0.175828,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.175828,0.144181,0.000000,0.000000
4,0.000000,0.000000,0.0,0.211999,0.000000,0.000000,0.000000,0.211999,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.211999,0.0,0.000000,0.000000,0.000000,0.000000,0.211999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.313937,0.000000,0.000000
996,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.497202,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
997,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
998,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.340820,0.000000,0.000000


#### Word Embeddings (Word2Vec) - skip-gram and CBOW

 Word2vec accepts several parameters that affect both training speed and quality.

One of them is for pruning the internal dictionary. Words that appear only once or twice in a billion-word corpus are probably uninteresting typos and garbage. In addition, there’s not enough data to make any meaningful training on those words, so it’s best to ignore them:
```python
model = Word2Vec(sentences, min_count=10)  # default value is 5
```

A reasonable value for min_count is between 0-100, depending on the size of your dataset.

Another parameter is the size of the NN layers, which corresponds to the “degrees of freedom” the training algorithm has:

```python
model = Word2Vec(sentences, vector_size=200)  # default value is 100
```

Bigger size values require more training data, but can lead to better (more accurate) models. Reasonable values are in the tens to hundreds.

Other hyper-parameters:

>   window: The context window size, i.e. the maximum distance between the target word and the surrounding words used as its context

>   sample: The threshold for configuring which higher-frequency words are randomly down sampled, useful range is (0, 1e-5)

>   workers: Use this many worker threads to train the model (faster training on multicore machines)

>   sg: Training algorithm: skip-gram if sg=1, otherwise CBOW.

>   epochs: Number of iterations (epochs) over the corpus (this parameter was named `iter` before gensim 4.0).

* https://radimrehurek.com/gensim/models/word2vec.html

In [13]:
# Use the Genre column data: each row is a list of genre tokens and is passed to Word2Vec below as one "sentence"
data

0       [Action, Adventure, Sci-Fi]
1      [Adventure, Mystery, Sci-Fi]
2                [Horror, Thriller]
3       [Animation, Comedy, Family]
4      [Action, Adventure, Fantasy]
                   ...             
995         [Crime, Drama, Mystery]
996                        [Horror]
997         [Drama, Music, Romance]
998             [Adventure, Comedy]
999       [Comedy, Family, Fantasy]
Name: Genre, Length: 1000, dtype: object

In [14]:
from gensim.models import Word2Vec
# Train word embeddings on the genre lists; min_count=1 keeps every genre, including rare ones.
# NOTE(review): no seed is passed and workers=4, so the embeddings will likely differ between runs — confirm whether reproducibility matters here.
model = Word2Vec(sentences=data, vector_size=10, window=5, min_count=1, workers=4) # vector_size = 10 denotes the length of embedding
#model.train(data, total_examples=model.corpus_count, epochs=100) # Optional further training; total_examples must be the number of sentences in the corpus, not 1
model.save("word2vec.model") # Save the model
model = Word2Vec.load("word2vec.model") # load the saved model

In [15]:
embedding = model.wv['Action']  # Look up the trained vector for the 'Action' genre token
print(embedding)
print(len(embedding))  # Matches the vector_size the model was built with

[ 0.07751261 -0.01359703 -0.04042795  0.06635427 -0.04243213 -0.01777572
  0.04671292  0.01964733 -0.09431129 -0.09352869]
10


In [16]:
# Get the most similar words (having the most similar embeddings)
similar_words = model.wv.most_similar('Action',topn = 3) #topn = 3 returns the 3 closest words as (word, similarity) pairs
print(similar_words)

[('Drama', 0.5938251614570618), ('Biography', 0.43530386686325073), ('Comedy', 0.4127630889415741)]


In [17]:
# Store just the words + their trained embeddings.
# model.wv is the same lookup object used above for model.wv['Action'] and most_similar().
word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")

In [18]:
# Load back with memory-mapping = read-only, shared across processes.
from gensim.models import KeyedVectors
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')
wv['Action']  # Get numpy vector embedding for 'Action'

array([ 0.07751261, -0.01359703, -0.04042795,  0.06635427, -0.04243213,
       -0.01777572,  0.04671292,  0.01964733, -0.09431129, -0.09352869],
      dtype=float32)

#### Gensim pre-trained models, from Gensim-data repository

In [19]:
import gensim.downloader
# Show all available models in gensim-data
# (info() returns a dict; the keys of its 'models' entry are the model names)
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [20]:
# Download the "glove-twitter-25" embeddings
# Pre-trained glove vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased.
# The loaded object is a KeyedVectors instance, as the repr displayed below shows.
glove_vectors = gensim.downloader.load('glove-twitter-25')
glove_vectors

<gensim.models.keyedvectors.KeyedVectors at 0x2cc6cdc40>

In [21]:
# Use the downloaded vectors as usual:
# with no topn argument the call returns 10 (word, similarity) pairs, as in the output below
glove_vectors.most_similar('twitter')

[('facebook', 0.948005199432373),
 ('tweet', 0.9403423070907593),
 ('fb', 0.9342358708381653),
 ('instagram', 0.9104824066162109),
 ('chat', 0.8964965343475342),
 ('hashtag', 0.8885937333106995),
 ('tweets', 0.8878158330917358),
 ('tl', 0.8778461217880249),
 ('link', 0.877821147441864),
 ('internet', 0.8753896355628967)]

#### Sentence/Document Embeddings (Doc2vec)
Paragraph, Sentence, and Document embeddings

* https://radimrehurek.com/gensim/models/doc2vec.html

In [22]:
# Use 3 Descriptions for Doc2Vec embedding
# (.loc[:2] is label-based and inclusive, so rows 0, 1 and 2 are selected)
sentences = df.loc[:2,'Description'].values
print(sentences)
print(len(sentences))

['A group of intergalactic criminals are forced to work together to stop a fanatical warrior from taking control of the universe.'
 'Following clues to the origin of mankind, a team finds a structure on a distant moon, but they soon realize they are not alone.'
 'Three girls are kidnapped by a man with a diagnosed 23 distinct personalities. They must try to escape before the apparent emergence of a frightful new 24th.']
3


In [23]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Whitespace-tokenise each description once; both training and inference take a list of words
# (nltk.word_tokenize() would be an alternative tokenizer).
tokenized = [text.split() for text in sentences]

# Tag every token list with its position (as a string) so Doc2Vec can identify the documents
tagged_data = [
    TaggedDocument(words=tokens, tags=[str(position)])
    for position, tokens in enumerate(tokenized)
]

# Train the model, then infer one embedding per sentence
model = Doc2Vec(tagged_data, vector_size=10, window=2, min_count=1, workers=4)
sentence_vectors = [model.infer_vector(tokens) for tokens in tokenized]

print("Sentence Embeddings:")
print(sentence_vectors) # Embeddings of the sentences

import numpy as np
print("\nShape:")
print(np.array(sentence_vectors).shape)

Sentence Embeddings:
[array([-0.03488031,  0.04760626,  0.00438918, -0.0438889 ,  0.00671681,
        0.01807399,  0.00505884, -0.03431821,  0.0026237 , -0.00580727],
      dtype=float32), array([-0.01203412,  0.03905879, -0.04048118, -0.03343662, -0.02399926,
       -0.04903608,  0.02998802,  0.00076669,  0.03936699, -0.01577356],
      dtype=float32), array([-0.01994767, -0.00748677, -0.05321181,  0.00212716,  0.00250726,
       -0.05625424, -0.00501994,  0.03474486,  0.0284529 , -0.01321101],
      dtype=float32)]

Shape:
(3, 10)


In [24]:
# Find Cosine similarity between sentence embeddings
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity works on 2-D inputs, so each embedding is reshaped into a single-row matrix
first_vec = sentence_vectors[0].reshape(1, -1)
second_vec = sentence_vectors[1].reshape(1, -1)
cosine_similarity(first_vec, second_vec)[0, 0]

0.336813

In [25]:
# Find the similarity between all the sentences
# Called with a single argument, every embedding is compared with every other one (square matrix below)
similarity = cosine_similarity(sentence_vectors)
similarity

array([[ 0.99999994,  0.33681303, -0.26270026],
       [ 0.33681303,  1.        ,  0.6336158 ],
       [-0.26270026,  0.6336158 ,  1.0000001 ]], dtype=float32)

In [26]:
# Find the most similar sentence to the first sentence (at index = 0)
ind = 0  # The index of the sentence for which you want to find the most similar sentence
print("Input Sentence -->", sentences[ind])

# Without a second sentence there is nothing to compare against; fail with a clear message
# instead of hitting an undefined result index further down.
if len(sentence_vectors) < 2:
    raise ValueError("Need at least two sentences to find the most similar one.")

# Score the query embedding against every embedding in a single call (one row of similarities).
scores = cosine_similarity(sentence_vectors[ind].reshape(1, -1), np.array(sentence_vectors))[0]
scores[ind] = -np.inf  # Exclude the query sentence itself from the search

# argmax returns the first index on ties, matching a left-to-right scan with a strict comparison.
s_ind = int(np.argmax(scores))
best_similarity = scores[s_ind]  # Named so the builtin max() is not shadowed

print("Most Similar Sentence -->", sentences[s_ind])
print("Cosine Similarity:", best_similarity)

Input Sentence --> A group of intergalactic criminals are forced to work together to stop a fanatical warrior from taking control of the universe.
Most Similar Sentence --> Following clues to the origin of mankind, a team finds a structure on a distant moon, but they soon realize they are not alone.
Cosine Simialrity: 0.336813
