# Overview
In this tutorial, we are going to build a Word2Vec model from scratch to predict keyword similarities in Udemy course titles. Note that we will be using the `raw_udemy_databse.csv` file.

# Steps

1.   Clean the database
2.   Build a corpus
3.   Create and Train a Word2Vec Model
4.   Visualize the model
5.   Predict a word similarity

# Import the libraries

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk
import re

from gensim.models import Word2Vec
from sklearn.manifold import TSNE
pd.options.mode.chained_assignment = None 


# Read the database
Note: we are using the raw database in this tutorial. Make sure the file is present in your directory.

In [None]:
# Load the raw Udemy export and keep a reproducible random sample of at most
# 9000 rows (asking sample() for more rows than exist raises ValueError).
data = pd.read_csv('/content/raw_udemy_databse.csv')
data = data.sample(n=min(9000, len(data)), random_state=23)

# Clean the database
1.   Remove the stop words from the ***title*** column of the database
2.   Remove rows that contain NaN values from the database



In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# words() is called without a language, so the stop words of every language
# shipped with the corpus are included. A set is used because every token of
# every title is tested for membership, and a set lookup does not scan the
# whole collection the way a list lookup does.
STOP_WORDS = set(stopwords.words())
# Numbers are not included in the NLTK stop words, so add the common ones.
STOP_WORDS.update(['2022', '2021', '2020', '2019', '2018', '2', '1', '0'])


def clean_sentence(val):
    """Normalise a sentence: strip punctuation, lowercase, drop stop words.

    Args:
        val: The raw sentence as a string.

    Returns:
        The remaining words joined by single spaces. The result is an empty
        string when nothing is left.
    """
    # Raw string: in a plain literal '\s' and '\w' are invalid escape sequences.
    # Everything that is neither whitespace nor a letter/digit is removed, and
    # so are underscores.
    stripped = re.sub(r'([^\s\w]|_)+', '', val).lower()
    # split() without an argument splits on runs of whitespace, so removed
    # punctuation (e.g. " - ") cannot leave empty-string tokens behind.
    kept = [word for word in stripped.split() if word not in STOP_WORDS]
    return " ".join(kept)

def clean_dataframe(data, columns=('title',)):
    """Drop incomplete rows, then clean the text columns.

    Args:
        data: The source DataFrame. It is left unmodified.
        columns: Names of the text columns to pass through ``clean_sentence``.

    Returns:
        A new DataFrame without the rows that contain NaN in any column, and
        with every column listed in ``columns`` cleaned.
    """
    # Take an explicit copy: assigning a column into the frame returned by
    # dropna() can otherwise be reported as a chained assignment.
    cleaned = data.dropna(how="any").copy()

    for col in columns:
        cleaned[col] = cleaned[col].apply(clean_sentence)

    return cleaned

# Replace the raw sample with its cleaned version, then preview the first rows.
data = clean_dataframe(data)
data.head(5)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,id,title,is_paid,price,currency,price_string,currency_symbol,url
2118,1407900.0,docker swarm handson devops,1.0,89.99,EUR,€89.99,€,https://www.udemy.com/course/learn-docker-adva...
8603,1261458.0,speak medical spanish patients clients,1.0,49.99,EUR,€49.99,€,https://www.udemy.com/course/medical-spanish/
7660,1212996.0,instagram growth academy learn grow monetize ig,1.0,19.99,EUR,€19.99,€,https://www.udemy.com/course/instagram-growth-...
5811,2433792.0,hardware asset management servicenow,1.0,34.99,EUR,€34.99,€,https://www.udemy.com/course/hardware-asset-ma...
3523,604972.0,hyperfocus selfcontrol productivity masterclass,1.0,94.99,EUR,€94.99,€,https://www.udemy.com/course/unlimited-success...


# Build Corpus
In simple words, a corpus is an entire set of a particular linguistic element within a language, such as words or sentences.

In [None]:
def build_corpus(data, columns=('title',)):
    """Create a list of lists containing the words of each sentence.

    Args:
        data: DataFrame holding the text to tokenise.
        columns: Names of the text columns to read.

    Returns:
        A list with one list of words for every row of every column in
        ``columns``, in column-then-row order. A sentence without any words
        contributes an empty list, so there is still one entry per row.
    """
    corpus = []
    for col in columns:
        # Iterate over the values directly: Series.iteritems() was removed in
        # pandas 2.0, and the index label it yielded was never used.
        for sentence in data[col]:
            # split() without an argument never yields empty-string tokens,
            # even when words are separated by several spaces.
            corpus.append(sentence.split())

    return corpus

# Tokenise the cleaned titles, then preview the first five token lists.
corpus = build_corpus(data)        
corpus[0:5]

[['docker', '', 'swarm', '', 'handson', '', 'devops'],
 ['speak', 'medical', 'spanish', 'patients', 'clients'],
 ['instagram', 'growth', 'academy', '', 'learn', 'grow', 'monetize', 'ig'],
 ['hardware', 'asset', 'management', 'servicenow'],
 ['hyperfocus', 'selfcontrol', 'productivity', 'masterclass']]

# Create the Word2Vec model
The Word2Vec model produces a vocabulary in which each word is represented by an n-dimensional numpy array (100 values in this example).

In [None]:
# Train a first model with 100-dimensional vectors. min_count=200 restricts
# the vocabulary to frequent words.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.x renamed it to
# `vector_size` — confirm which version is installed.
model = Word2Vec(corpus, size=100, window=20, min_count=200, workers=4)
# Display the learned vector for a single word.
model.wv['python']

array([-0.00535747,  0.01567873, -0.02968977,  0.01315348, -0.00211317,
        0.00248521, -0.03548419,  0.02273437, -0.02435968, -0.02424064,
       -0.07162066, -0.0327274 , -0.0485316 , -0.04129174, -0.05568272,
       -0.02767501, -0.00269501,  0.00130491,  0.0055487 ,  0.02364198,
       -0.00549363,  0.00716766, -0.01056529, -0.02693963,  0.0064958 ,
        0.00300804, -0.03261491, -0.00408113, -0.02235338,  0.0002432 ,
       -0.03063837, -0.01756367, -0.00366217,  0.01738412, -0.01192252,
       -0.02318337, -0.03590273,  0.01029773,  0.00406185,  0.0262019 ,
        0.01532762, -0.01856728, -0.01219605, -0.00107666, -0.00214153,
       -0.00618159, -0.00345814,  0.02123312, -0.00346965,  0.00767859,
        0.02705146, -0.00556246, -0.04543208, -0.02513365,  0.00884293,
       -0.02527453, -0.0271896 ,  0.00273305, -0.02912466, -0.01212348,
        0.01702892, -0.04724373,  0.00137648, -0.01705699,  0.02936824,
        0.03060568, -0.02207456, -0.0224853 ,  0.02486581,  0.05

# Visualise the model
Using matplotlib, we can visualise the relatedness of words on a two-dimensional plane (the closer they are to one another, the more related they are).

In [None]:
def tsne_plot(model):
    """Project the model's word vectors onto a 2-D plane with t-SNE and plot them.

    Args:
        model: A trained Word2Vec model.

    Raises:
        ValueError: If the model's vocabulary is empty.
    """
    word_vectors = model.wv
    # gensim 4.x exposes the vocabulary as `key_to_index`, gensim 3.x as `vocab`.
    vocabulary = getattr(word_vectors, "key_to_index", None)
    if vocabulary is None:
        vocabulary = word_vectors.vocab

    labels = list(vocabulary)
    if not labels:
        raise ValueError("the model vocabulary is empty; nothing to plot")

    # Look the vectors up through `model.wv`; indexing the model object itself
    # is deprecated. They are stacked into one 2-D array because TSNE expects
    # an array rather than a list of vectors.
    tokens = np.asarray([word_vectors[word] for word in labels])

    # Recent scikit-learn releases reject a perplexity that is not smaller than
    # the number of samples, so cap it for small vocabularies.
    perplexity = min(40, max(1, len(labels) - 1))
    # NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` —
    # confirm against the installed version.
    tsne_model = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # One scatter call per point, so that every word gets its own colour from
    # the default colour cycle.
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

In [None]:
# Plot the vocabulary of the model trained above.
tsne_plot(model)

# Train the model
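The "workers" parameter sets the number of worker threads used for training. Lowering it may lead to more accurate results, and a single worker also helps make runs reproducible.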

In [None]:
# Retrain on the same corpus with a much lower frequency cut-off (min_count=2
# instead of 200).
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4.x: `vector_size`),
# and workers=50 requests 50 training threads — confirm both are intended.
model = Word2Vec(corpus, size=100, window=20, min_count=2, workers=50)

# Predict a word's similarities
Instead of a single word, you may also pass a list of keywords such as `['python', 'r', 'programming']`.

In [None]:
# Ask the model for the five entries most similar to 'python'.
similarities = model.wv.most_similar('python', topn=5)

The second element of each tuple is the similarity score, which is at most 1.

In [None]:
# Each entry is a (word, score) tuple; print one per line.
for word in similarities:
  print(word)

('learning', 0.9997335076332092)
('deep', 0.9996577501296997)
('r', 0.9996521472930908)
('bootcamp', 0.9995537996292114)
('programming', 0.9994513988494873)


# Save the model
You may need to use the same model in your future projects.

In [None]:
# Replace the placeholder with your own file name, ending in the .model extension.
model.save("NAME OF YOUR MODEL") 

# Load the model
You may load your pre-trained model using the code snippet shown below.

In [None]:
# Use the same file name (with the .model extension) that was passed to save().
model = Word2Vec.load("NAME OF YOUR MODEL")

In [None]:
# Run the same query against the reloaded model and print each (word, score) tuple.
sims = model.wv.most_similar('python', topn=5)
for sim in sims:
  print(sim)

('learning', 0.999753475189209)
('r', 0.9997533559799194)
('bootcamp', 0.9996874928474426)
('deep', 0.9996565580368042)
('data', 0.9996529817581177)
