# This notebook will be about training new embedders, and fine-tuning pre-trained Embedders


## CBOW architecture has already been tested in previous Part, the embedders below will be on Skip-gram

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
df= pd.read_csv("./Data/Preprocessed/Cleaned_Nepali_dataset.csv")
df_copy= df.copy()
df_copy.head()

Unnamed: 0,Text,Target
0,गुठी विधेक ल्याएर ठमेल राज गुठि जग्गा छाया सेन...,0
1,दले देश सकेछन बेचे खान सुरू गरेछन दले लखेटनु पछ ।,1
2,नेपाल ससकृती ध्वस्त पार्ने योजना !,1
3,मठ मन्दिर गुम्बा जग्गा हरु भुमाफिया नजर परे हु...,1
4,नेपाल कल कर्खाना नदि नाला बेची सके मठ मन्दीर ब...,1


In [3]:
output_path = "./Data/Embeddings/"

In [4]:
# Tokenizing using nltk and creating a corpus for models to train on

import nltk
from nltk.tokenize import word_tokenize

sentences= df_copy['Text'].apply(lambda x: word_tokenize(x)).tolist()


## Word2Vec scratch 

In [5]:
from gensim.models import Word2Vec

word2vec_scratch = Word2Vec(sentences, vector_size= 100,sg=1 ,window= 6, min_count=1, workers=4, seed= 42)  #sg=1 means using Skip-gram


In [6]:
print(word2vec_scratch.wv.most_similar('दलाल'))

[('नेता', 0.9978492259979248), ('हरु', 0.9974229335784912), ('झोले', 0.9968293309211731), ('प्रचण्ड', 0.9968018531799316), ('पार्टी', 0.9965466856956482), ('भ्रष्ट', 0.9964882731437683), ('सब', 0.996459424495697), ('जनता', 0.9963836073875427), ('सरकार', 0.9963688850402832), ('नेपाली', 0.9962961077690125)]


दलाल being most similar to भ्रष्ट and चोर, really shows the current context of the word in social media

In [7]:
print(word2vec_scratch.wv.most_similar('मुजि'))

[('गर्दा', 0.9988914132118225), ('ओलि', 0.9988536238670349), ('भन्छ', 0.9988439083099365), ('पछी', 0.9988129138946533), ('यहि', 0.9987989068031311), ('नेकपा', 0.9987727999687195), ('अनुहार', 0.9987668991088867), ('नभए', 0.9987576007843018), ('ए', 0.9987449049949646), ('बहस', 0.998744010925293)]


this profane word is most semantically similar to these words according to the model


In [8]:
print(word2vec_scratch.wv.most_similar('रन्डि'))

[('अनुहार', 0.9985929727554321), ('मुजि', 0.9985781311988831), ('गरिब', 0.9985146522521973), ('भन्न', 0.9984620809555054), ('राम', 0.9984380602836609), ('पनी', 0.9984345436096191), ('बेला', 0.9984260201454163), ('लगाएर', 0.9984238147735596), ('हुदा', 0.9984168410301208), ('साझा', 0.9984106421470642)]


In [9]:
print(word2vec_scratch.wv.most_similar('चिक्ने'))

[('छोरा', 0.9976100921630859), ('कुन', 0.9976083040237427), ('भारत', 0.9975743889808655), ('यार', 0.9975153803825378), ('ऋषि', 0.997500479221344), ('की', 0.9974930882453918), ('दिएर', 0.9974638819694519), ('कस्तो', 0.9974360466003418), ('चै', 0.9974192380905151), ('तेरो', 0.997410774230957)]


## Saving the word vector

In [10]:
# Specify the output file path
word2vec_scratch_output_file = os.path.join(output_path, 'Word2Vec_scratch_embeddings.txt')

word2vec_scratch.wv.save_word2vec_format(word2vec_scratch_output_file, binary=False)

## Loading using KeyedVectors


In [11]:
# from gensim.models import KeyedVectors

# word2vec_scratch_loaded= KeyedVectors.load_word2vec_format(word2vec_scratch_output_file, binary=False)

# We already made generate_embeddings script that we can use to load and generate embeddings from the loaded model

In [12]:
from generate_embeddings_weighted import generate_weighted_word2vec_embeddings, load_word2vec_model

word2vec_scratch_loaded= load_word2vec_model('./Data/Embeddings/Word2Vec_scratch_embeddings.txt')

df_word2vec= df_copy.copy()
df_word2vec = generate_weighted_word2vec_embeddings(df_word2vec, 'Text', word2vec_scratch_loaded)
df_word2vec.head()

Unnamed: 0,Text,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,Target
0,गुठी विधेक ल्याएर ठमेल राज गुठि जग्गा छाया सेन...,-0.097175,0.037607,-0.112759,0.028592,-0.099917,0.122079,0.137278,0.161555,-0.05604,...,0.029141,0.086424,-0.177104,0.134328,0.020842,-0.03032,0.060714,-0.139892,-0.084863,0
1,दले देश सकेछन बेचे खान सुरू गरेछन दले लखेटनु पछ ।,-0.060358,0.025221,-0.071019,0.012363,-0.053112,0.073997,0.081764,0.093119,-0.03376,...,0.013916,0.054754,-0.105563,0.080853,0.010461,-0.021752,0.03492,-0.084815,-0.050754,1
2,नेपाल ससकृती ध्वस्त पार्ने योजना !,-0.111043,0.038801,-0.114933,0.029722,-0.095024,0.130788,0.142487,0.173299,-0.060231,...,0.026738,0.094869,-0.183841,0.144524,0.019272,-0.028467,0.060803,-0.153386,-0.092006,1
3,मठ मन्दिर गुम्बा जग्गा हरु भुमाफिया नजर परे हु...,-0.107331,0.040279,-0.119178,0.031434,-0.10763,0.128477,0.146452,0.177857,-0.059065,...,0.031872,0.095145,-0.192617,0.143582,0.026082,-0.033372,0.06633,-0.151109,-0.095058,1
4,नेपाल कल कर्खाना नदि नाला बेची सके मठ मन्दीर ब...,-0.08984,0.032169,-0.096967,0.027235,-0.085516,0.105764,0.121383,0.146297,-0.054604,...,0.028413,0.080607,-0.156071,0.118282,0.01993,-0.029502,0.057832,-0.130237,-0.079404,1


In [13]:
# Creating new csv file of Word2Vec vectors

df_word2vec.drop("Text", axis=1, inplace=True)
df_word2vec.to_csv("./Data/Preprocessed/Word2vec_scratch_dataset.csv", index=False)

## Weighing the embeddings with tf-idf scores didn't perform as hoped, let's generate embeddings by simply mean averaging

In [14]:
df_copy.head()

Unnamed: 0,Text,Target
0,गुठी विधेक ल्याएर ठमेल राज गुठि जग्गा छाया सेन...,0
1,दले देश सकेछन बेचे खान सुरू गरेछन दले लखेटनु पछ ।,1
2,नेपाल ससकृती ध्वस्त पार्ने योजना !,1
3,मठ मन्दिर गुम्बा जग्गा हरु भुमाफिया नजर परे हु...,1
4,नेपाल कल कर्खाना नदि नाला बेची सके मठ मन्दीर ब...,1


In [15]:
from generate_embeddings import generate_word2vec_embeddings

df_word2vec_no_weight= df_copy.copy()
df_word2vec_no_weight= generate_word2vec_embeddings(df_word2vec_no_weight, 'Text', word2vec_scratch_loaded)
df_word2vec_no_weight.head(1)

Unnamed: 0,Text,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,Target
0,गुठी विधेक ल्याएर ठमेल राज गुठि जग्गा छाया सेन...,-0.11756,0.04505,-0.133415,0.030642,-0.115353,0.149121,0.165624,0.193793,-0.069321,...,0.035498,0.105436,-0.211654,0.162046,0.022811,-0.03604,0.06833,-0.170917,-0.101775,0


In [18]:
df_word2vec_no_weight.drop("Text", axis=1, inplace= True)
df_word2vec_no_weight.to_csv('./Data/Preprocessed/Word2vec_scratch_no_weight_dataset.csv', index=False)

# Fasttext


In [19]:
from gensim.models import FastText

fasttext= FastText(sentences=sentences, vector_size=100, min_count=1, window=6, seed=42, sg=1, workers=4)

In [20]:
print(fasttext.wv.most_similar('दलाल'))

[('भ्रष्ट', 0.9997515678405762), ('भ्रष्टाचरि', 0.9996430277824402), ('भ्रष्टाचार', 0.9996086359024048), ('दलाल्', 0.9995893836021423), ('भ्रष्टाचारी', 0.9995713233947754), ('भ्रष्टाचारीकाे', 0.999549388885498), ('भ्रष्टाचारि', 0.9994513392448425), ('जन्ता', 0.9994223713874817), ('भ्रष्टचार', 0.9994112253189087), ('नेताकाे', 0.9994082450866699)]


In [21]:
print(fasttext.wv.most_similar('मुजि'))

[('मुनि', 0.9999259114265442), ('जिल्ला', 0.9999111294746399), ('मुद्धा', 0.9999097585678101), ('धमलालाइ', 0.9999095797538757), ('आइसक्यो', 0.9999081492424011), ('कुर्सी', 0.9999070763587952), ('बोक्न', 0.9999068379402161), ('ठोक्ने', 0.9999058842658997), ('मुलालाइ', 0.9999054670333862), ('मुलुक', 0.9999041557312012)]


In [22]:
print(fasttext.wv.most_similar('रन्डि'))

[('इन्डियन', 0.999956488609314), ('ख्रिस्चियन', 0.9999527335166931), ('टुंन्डिखेल', 0.999952495098114), ('संरक्षण', 0.9999513030052185), ('भर्सटचारि', 0.9999498724937439), ('उत्तरदायित्व', 0.9999492168426514), ('उन्मुक्ती', 0.9999468922615051), ('रन्डी', 0.9999452829360962), ('क्रिश्चियन', 0.9999450445175171), ('पाखन्डि', 0.9999449253082275)]


In [23]:
# Specify the output file path
fasttext_output_file = os.path.join(output_path, 'fasttext_embeddings.txt')

fasttext.wv.save_word2vec_format(fasttext_output_file, binary=False)

In [24]:
# Also saving the model

fasttext.save(os.path.join(output_path, 'fasttext_model'))

In [3]:
# loading model

from generate_embeddings import load_fasttext_model, generate_fasttext_embeddings

fasttext_loaded= load_fasttext_model('./Data/Embeddings/fasttext_model')
df_fasttext= df_copy.copy()
df_fasttext= generate_fasttext_embeddings(df_fasttext, "Text", fasttext_loaded)
df_fasttext.head()

Unnamed: 0,Text,0,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,Target
0,गुठी विधेक ल्याएर ठमेल राज गुठि जग्गा छाया सेन...,0.332578,-0.102626,0.37407,-0.147144,0.024742,0.078759,-0.111112,0.234482,-0.094451,...,0.533157,-0.355296,-0.710457,0.126522,-0.145634,-0.085188,0.256028,-0.271981,0.150822,0
1,दले देश सकेछन बेचे खान सुरू गरेछन दले लखेटनु पछ ।,0.302399,-0.093528,0.341173,-0.13157,0.023275,0.071763,-0.101102,0.216264,-0.086953,...,0.483671,-0.322421,-0.643361,0.117718,-0.132818,-0.076983,0.231773,-0.250802,0.137868,1
2,नेपाल ससकृती ध्वस्त पार्ने योजना !,0.300474,-0.096678,0.344405,-0.136217,0.022518,0.072743,-0.096957,0.219617,-0.087057,...,0.478695,-0.323674,-0.641463,0.129422,-0.13207,-0.077451,0.233735,-0.255186,0.146775,1
3,मठ मन्दिर गुम्बा जग्गा हरु भुमाफिया नजर परे हु...,0.289442,-0.088763,0.327231,-0.124751,0.022739,0.069775,-0.097282,0.202963,-0.079635,...,0.462768,-0.307386,-0.616574,0.107359,-0.125195,-0.073896,0.221342,-0.235846,0.129929,1
4,नेपाल कल कर्खाना नदि नाला बेची सके मठ मन्दीर ब...,0.307251,-0.095812,0.349455,-0.135839,0.023597,0.073046,-0.101997,0.219555,-0.087869,...,0.493153,-0.331401,-0.658701,0.122008,-0.136398,-0.080337,0.239287,-0.257596,0.141716,1


In [4]:
df_fasttext.drop('Text', axis=1, inplace=True)
df_fasttext.to_csv('./Data/Preprocessed/fasttext_scratch_dataset.csv', index=False)

# Load Pre-trained Glove using KeyedVectors

In [5]:
from gensim.scripts.glove2word2vec import glove2word2vec
from generate_embeddings import load_glove_embeddings, generate_glove_embeddings

glove_file= './Data/Embeddings/processed_tokenized_stemmed.glove.txt'
glove_word2vec_format= './Data/Embeddings/glove_word2vec_format.txt'

desc= glove2word2vec(glove_file, glove_word2vec_format)



  desc= glove2word2vec(glove_file, glove_word2vec_format)


TypeError: cannot unpack non-iterable int object

In [6]:
print(desc)

(302596, 300)


In [7]:
# Now loading Glove

glove= load_glove_embeddings(glove_word2vec_format)
print(glove.most_similar('दलाल'))

[('बिचौल', 0.6204380393028259), ('पुँजीपति', 0.6165770888328552), ('दलालहर', 0.6044521331787109), ('ोकरशाह', 0.5913015604019165), ('पुजीपत', 0.581291675567627), ('पूँजीपतिवर्ग', 0.5632365345954895), ('विचौल', 0.5593226552009583), ('पुँजिपत', 0.5415458083152771), ('तस्कर', 0.5345401167869568), ('ोकरसाह', 0.5341207385063171)]


In [8]:
print(glove.most_similar('मुजि'))

KeyError: "Key 'मुजि' not present in vocabulary"

In [9]:
print(glove.most_similar('रन्डि'))

KeyError: "Key 'रन्डि' not present in vocabulary"

## Profanity Words not being in vocabuly could result in poor performance in out classification Task