In [1]:
import pandas as pd
import numpy as np
import nltk
import gensim 
from gensim.models.phrases import Phrases, Phraser
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from collections import Counter 

In [2]:
# Load the song dataset from a CSV path relative to the working directory.
# The lyrics processed in the following cells come from its `songLyrics` column.
data=pd.read_csv("Indian_songs.csv")


In [3]:
data.shape


(3027, 6)

In [4]:
data.head()

Unnamed: 0,movieName,songName,songSinger,songMusic,songLyricist,songLyrics
0,Bala,Don’t Be Shy,"Badshah, Shalmali Kholgade, Gurdeep Mehendi",Sachin-Jigar,"Mellow D, Badshah","Sun, main hoon thoda sanki\nKarun mann ki\nBab..."
1,Laal Kaptaan,Lahu Ka Rang Kara,Samira Koppikar,Samira Koppikar,Sahib,Morey.. lahu ka rang kara\nMorey lahu ka rang ...
2,Laal Kaptaan,Red Red Najariya,Shreya Ghoshal,Samira Koppikar,Saurabh Jain,"Badnaam shehar, badnaam gully\nIsme har raat h..."
3,Laal Kaptaan,Kaal Kaal,"Brijesh Shandilya, Dino James",Samira Koppikar,Saurabh Jain,"Kaal kaal, kaal kaal, jo sapaat chal raha\nWo ..."
4,Laal Kaptaan,Taandav,"Kailash Kher, Brijesh Shandilya",Samira Koppikar,Puneet Sharma,Shor hai andher mein\nJo dher murda pedon ka\n...


# simple lyrics doc

In [5]:
# Build `doc`: one flat list of lower-cased word tokens per song.
#
# Fixes relative to the previous version:
# * The character class was `[^a-zA-z\s]`. The range `A-z` also spans the ASCII
#   characters [ \ ] ^ _ and `, so those survived cleaning and could end up
#   inside tokens. The text is lower-cased before the substitution, so keeping
#   only `a-z` and whitespace is what was intended.
# * The pattern is now a raw string, so `\s` is not an invalid string escape.
# * The column is iterated directly instead of being indexed by the labels
#   0..n-1, so the cell no longer depends on the frame having a default index.
doc = [
    nltk.word_tokenize(re.sub(r'[^a-z\s]', '', lyrics.lower()))
    for lyrics in data['songLyrics']
]

In [6]:
doc[0]

['sun',
 'main',
 'hoon',
 'thoda',
 'sanki',
 'karun',
 'mann',
 'ki',
 'baby',
 'gaana',
 'lagade',
 'thoda',
 'funky',
 'nahi',
 'mann',
 'ki',
 'nahi',
 'dhan',
 'ki',
 'ye',
 'baat',
 'hai',
 'tere',
 'tann',
 'ki',
 'paagal',
 'ho',
 'jaaun',
 'jab',
 'tu',
 'ho',
 'rubaru',
 'na',
 'lamba',
 'ocha',
 'gora',
 'chitta',
 'phir',
 'bhi',
 'dil',
 'mein',
 'tu',
 'ishq',
 'ne',
 'tere',
 'kiya',
 'hai',
 'bekaboo',
 'jaisa',
 'hai',
 'waisa',
 'hi',
 'pasand',
 'mujhko',
 'tu',
 'jaanu',
 'i',
 'just',
 'wan',
 'na',
 'feel',
 'your',
 'body',
 'saanu',
 'kare',
 'ishaare',
 'touch',
 'my',
 'body',
 'dont',
 'be',
 'shy',
 'my',
 'honey',
 'saanu',
 'kare',
 'ishaare',
 'i',
 'wan',
 'na',
 'feel',
 'your',
 'body',
 'saanu',
 'kare',
 'ishaare',
 'touch',
 'my',
 'body',
 'dont',
 'be',
 'shy',
 'my',
 'honey',
 'saanu',
 'kare',
 'ishaare',
 'touch',
 'my',
 'body',
 'fly',
 'my',
 'honey',
 'dont',
 'be',
 'shy',
 'my',
 'honey',
 'befikar',
 'fly',
 'my',
 'honey',
 'dont',
 '

# without repetition

In [7]:
# Build `doc_sent` (cleaned, lower-cased lyric lines per song) and `nonrep_doc`
# (the same lines with repeats removed, e.g. choruses).
#
# Fixes relative to the previous version:
# * The character class was `[^a-zA-z\s]`. The range `A-z` also keeps the ASCII
#   characters [ \ ] ^ _ and `. The text is already lower-cased, so only `a-z`
#   and whitespace are kept now. Newlines are whitespace, so the split on "\n"
#   still sees the original line structure.
# * The pattern is a raw string, so `\s` is not an invalid string escape.
# * The column is iterated directly rather than indexed by the labels 0..n-1.
doc_sent = [
    re.sub(r'[^a-z\s]', '', lyrics.lower()).split("\n")
    for lyrics in data['songLyrics']
]

# dict.fromkeys keeps the first occurrence of each line in its original order
# and de-duplicates in linear time (the old `line not in d` list scan was
# quadratic per song). As before, a single empty string is kept when a song
# contains blank lines.
nonrep_doc = [list(dict.fromkeys(song_lines)) for song_lines in doc_sent]
        

In [8]:
nonrep_doc[0]

['sun main hoon thoda sanki',
 'karun mann ki',
 'baby gaana lagade thoda funky',
 'nahi mann ki',
 'nahi dhan ki',
 'ye baat hai tere tann ki',
 '',
 'paagal ho jaaun jab tu ho rubaru',
 'na lamba ocha gora chitta',
 'phir bhi dil mein tu',
 'ishq ne tere kiya hai bekaboo',
 'jaisa hai waisa hi pasand mujhko tu jaanu',
 'i just wanna feel your body',
 'saanu kare ishaare',
 'touch my body',
 'dont be shy my honey',
 'i wanna feel your body',
 'fly my honey',
 'befikar fly my honey',
 'kuch bhi bole bina',
 'aankhon se tu baat kare',
 'ye shaitani jaan bujhke mere saath kare',
 'tujhko dekhe bina dil mera ab kahin lage nahi',
 'tere husn pe aankhein sek sek kar thake nahi',
 'upar se raat hoti ja rahi hai naughty naughty naughty']

# line wise doc

In [9]:
# Flatten the per-song unique lines into one corpus of "sentences":
# each entry of `lyrics_sent` is the token list of a single lyric line.
# Blank lines are tokenised too and yield empty lists.
lyrics_sent = [
    nltk.word_tokenize(line)
    for song_lines in nonrep_doc
    for line in song_lines
]

In [10]:
lyrics_sent[0]

['sun', 'main', 'hoon', 'thoda', 'sanki']

# creating the transliteration dictionary (dictt)

In [11]:
# Transliteration pairs: a tab-separated file without a header row.
# The first column is named `Eng` and the second `Hin`.
# NOTE(review): read_csv's default NA parsing turns cells such as "nan" or
# "null" into NaN -- confirm no word in the list is affected
# (keep_default_na=False would disable that).
hindidictt = pd.read_csv(
    "Hindi - Word Transliteration Pairs 1.txt",
    sep='\t',
    header=None,
)
hindidictt.columns = ['Eng', 'Hin']

In [12]:
hindidictt.head()

Unnamed: 0,Eng,Hin
0,khushboo,खुशबू
1,khushbuu,खुशबू
2,khushbu,खुशबू
3,khusbhu,खुशबू
4,tera,तेरा


In [13]:
# Forward map: `Eng` spelling -> `Hin` word.
# Rows are consumed top to bottom, so if a spelling occurs more than once
# the last row wins.
EtoH = dict(zip(hindidictt['Eng'], hindidictt['Hin']))

In [14]:
# Reverse map: `Hin` word -> one `Eng` spelling.
# Rows are consumed bottom to top, so later assignments come from earlier rows:
# for a Hindi word with several spellings, the spelling listed first in the
# file is the one that remains in the dict.
HtoE = dict(zip(hindidictt['Hin'].iloc[::-1], hindidictt['Eng'].iloc[::-1]))

# word2vec on simple lyrics

without dictt

In [15]:
# Baseline embedding: trained on `doc` (all tokens of each song as one
# sentence) as it stands at this point in the notebook, i.e. before the
# dictionary-based rewrite further down.
# NOTE(review): `size` and `iter` are the keyword names used by gensim releases
# before 4.0 -- confirm the installed gensim version.
# NOTE(review): with workers > 1 training is not guaranteed to be reproducible
# run to run, so neighbour lists may shift slightly between executions.
w2v_model = gensim.models.Word2Vec(
    doc,
    size=300,
    window=15,
    min_count=2,
    workers=10,
    iter=10,
)

In [16]:
# 30 most similar words to "ladki" in the baseline model `w2v_model`.
w2v_model.wv.most_similar(positive=["ladki"],topn=30)

[('bhadki', 0.7658559083938599),
 ('bholi', 0.7557170391082764),
 ('nakhrezi', 0.7509742975234985),
 ('nagin', 0.7416472434997559),
 ('pagli', 0.7378005981445312),
 ('bolti', 0.7344050407409668),
 ('kadki', 0.7296107411384583),
 ('uski', 0.7186790704727173),
 ('chull', 0.7177284955978394),
 ('tezi', 0.7129014134407043),
 ('ladka', 0.7054895758628845),
 ('lakdi', 0.7050880193710327),
 ('tabiyat', 0.6971628665924072),
 ('shaamat', 0.6957572102546692),
 ('kharaab', 0.6894088983535767),
 ('gharwaali', 0.6842488646507263),
 ('kiski', 0.6826586723327637),
 ('sharaab', 0.6816666126251221),
 ('sanwali', 0.680025577545166),
 ('jamaaye', 0.6791594624519348),
 ('thenga', 0.6790791749954224),
 ('naadaan', 0.6780648231506348),
 ('padti', 0.6755293607711792),
 ('komalkomal', 0.6725174188613892),
 ('niklegi', 0.6678140759468079),
 ('mastani', 0.6676426529884338),
 ('hasina', 0.6643911004066467),
 ('jungli', 0.6633493304252625),
 ('taisi', 0.6632841229438782),
 ('iaditi', 0.6602733135223389)]

In [61]:
# 30 most similar words to the English token "look" in the baseline model `w2v_model`.
w2v_model.wv.most_similar(positive=["look"],topn=30)

[('wikipedia', 0.7981844544410706),
 ('at', 0.7961621284484863),
 ('babuji', 0.7953083515167236),
 ('boy', 0.7578973770141602),
 ('criminal', 0.7533833980560303),
 ('blaring', 0.7497865557670593),
 ('girl', 0.7464805841445923),
 ('killar', 0.7429620623588562),
 ('been', 0.7390487194061279),
 ('your', 0.7331137657165527),
 ('talk', 0.7304757237434387),
 ('tryna', 0.7227292060852051),
 ('smile', 0.7219364047050476),
 ('getting', 0.7210797071456909),
 ('adayein', 0.7206111550331116),
 ('cheers', 0.7191591262817383),
 ('stop', 0.7178800702095032),
 ('sirens', 0.717401921749115),
 ('see', 0.7167708277702332),
 ('ego', 0.7154247760772705),
 ('eyes', 0.7130755186080933),
 ('janhit', 0.7108755707740784),
 ('shubrubabru', 0.7101786732673645),
 ('dream', 0.7093521356582642),
 ('angel', 0.7074534893035889),
 ('lets', 0.7052059173583984),
 ('youre', 0.7045307159423828),
 ('everybody', 0.7038923501968384),
 ('by', 0.7031441330909729),
 ('looking', 0.6993191242218018)]

using dictt

In [17]:
# Rewrite `doc` in place: every token that has an entry in EtoH is replaced by
# the single spelling HtoE stores for its Hindi form, so spelling variants of
# the same word collapse into one token. Tokens without an entry are untouched.
for tokens in doc:
    for pos, token in enumerate(tokens):
        if token in EtoH:
            tokens[pos] = HtoE[EtoH[token]]
   

In [18]:
# Same hyper-parameters as the baseline model, trained on `doc` after its
# tokens were rewritten through the transliteration dictionary.
# NOTE(review): `size` and `iter` are pre-4.0 gensim keyword names -- confirm
# the installed gensim version.
w2v_model1 = gensim.models.Word2Vec(
    doc,
    size=300,
    window=15,
    min_count=2,
    workers=10,
    iter=10,
)

In [19]:
# 30 most similar words to "ladki" in `w2v_model1` (full lyrics, dictionary-normalised tokens).
w2v_model1.wv.most_similar(positive=["ladki"],topn=30)

[('bholi', 0.740206241607666),
 ('kadaki', 0.7264542579650879),
 ('chull', 0.7113338708877563),
 ('pagali', 0.7093102931976318),
 ('ekdam', 0.707396388053894),
 ('sharaab', 0.6981011033058167),
 ('marey', 0.697447657585144),
 ('sabaah', 0.6855283975601196),
 ('beautiful', 0.6849833130836487),
 ('bhadki', 0.6770075559616089),
 ('ladka', 0.672040581703186),
 ('tabiyat', 0.6692796349525452),
 ('shaamat', 0.6682986617088318),
 ('garm', 0.6662520170211792),
 ('komalkomal', 0.6658504009246826),
 ('taisi', 0.6654826998710632),
 ('hirani', 0.664230465888977),
 ('maserati', 0.6613790392875671),
 ('strongraftaarstrong', 0.6604737043380737),
 ('buddhi', 0.659834623336792),
 ('saanvali', 0.6598261594772339),
 ('padati', 0.6539756059646606),
 ('lagti', 0.6511334776878357),
 ('dheeli', 0.6490904092788696),
 ('aadmi', 0.6471246480941772),
 ('bolti', 0.6463819742202759),
 ('muskurayenge', 0.6436578035354614),
 ('jhulaaye', 0.6422997713088989),
 ('naadaan', 0.6364174485206604),
 ('banegi', 0.6322613954

In [36]:
# 30 most similar words to "ladka" in `w2v_model1` (full lyrics, dictionary-normalised tokens).
w2v_model1.wv.most_similar(positive=["ladka"],topn=30)

[('komalkomal', 0.8082857131958008),
 ('ghaas', 0.8002029657363892),
 ('naadaan', 0.7983545660972595),
 ('dkeh', 0.7945951223373413),
 ('tabiyat', 0.7943574786186218),
 ('pataaye', 0.79245924949646),
 ('affair', 0.7872856855392456),
 ('buddha', 0.7823206186294556),
 ('rakkhi', 0.7790163159370422),
 ('fillam', 0.7766212821006775),
 ('kukkad', 0.7764028310775757),
 ('aatish', 0.7762962579727173),
 ('teeno', 0.7740685939788818),
 ('alignment', 0.7727333903312683),
 ('mahadev', 0.7720894813537598),
 ('dheeli', 0.7716362476348877),
 ('badneeyat', 0.7701950073242188),
 ('unknown', 0.769494891166687),
 ('dhani', 0.7693310976028442),
 ('gundo', 0.7689433097839355),
 ('beatles', 0.7678130865097046),
 ('gannnakshatra', 0.7673767805099487),
 ('bombai', 0.7630811929702759),
 ('halava', 0.7617206573486328),
 ('kheenchon', 0.7615123987197876),
 ('snaan', 0.7607138156890869),
 ('buddhi', 0.7586770057678223),
 ('alaav', 0.7577664852142334),
 ('pakkad', 0.7571420073509216),
 ('effect', 0.75693166255950

# word2vec using non repeating lyrics doc

without dictt

In [20]:
# Per-song token lists without repeated lines: the unique lines of each song
# are joined with single spaces and tokenised as one text.
doc1 = [
    nltk.word_tokenize(" ".join(unique_lines))
    for unique_lines in nonrep_doc
]

In [21]:
# Embedding trained on `doc1` (one sentence per song, repeated lines removed),
# before the dictionary-based rewrite of `doc1` further down.
# NOTE(review): `size` and `iter` are pre-4.0 gensim keyword names -- confirm
# the installed gensim version.
w2v_model2 = gensim.models.Word2Vec(
    doc1,
    size=300,
    window=15,
    min_count=2,
    workers=10,
    iter=10,
)

In [22]:
# 30 most similar words to "ladki" in `w2v_model2` (de-duplicated lyrics, raw spellings).
w2v_model2.wv.most_similar(positive=["ladki"],topn=30)

[('uski', 0.8006220459938049),
 ('isliye', 0.7830924391746521),
 ('dikhlati', 0.7764444947242737),
 ('philosophy', 0.7728574275970459),
 ('maasa', 0.7676728963851929),
 ('chhooti', 0.7670385837554932),
 ('nagin', 0.7654126286506653),
 ('bhadki', 0.7606104612350464),
 ('aandhi', 0.7570255994796753),
 ('lagti', 0.7569928765296936),
 ('sharaab', 0.7562861442565918),
 ('po', 0.7557681202888489),
 ('adayein', 0.753081202507019),
 ('bholi', 0.7506587505340576),
 ('shauq', 0.7502321600914001),
 ('sanwali', 0.7468936443328857),
 ('chull', 0.7431329488754272),
 ('bolti', 0.7426114082336426),
 ('hindustani', 0.7411711812019348),
 ('tola', 0.739315927028656),
 ('imliyon', 0.7383874654769897),
 ('nivaran', 0.7383432984352112),
 ('lakdi', 0.7328816652297974),
 ('tikhi', 0.7315342426300049),
 ('tezz', 0.730852484703064),
 ('babum', 0.7300989627838135),
 ('besharmi', 0.7271488904953003),
 ('tapori', 0.727026641368866),
 ('fuljhadi', 0.7241003513336182),
 ('khaamiyan', 0.7223764061927795)]

In [38]:
# 30 most similar words to "ladka" in `w2v_model2` (de-duplicated lyrics, raw spellings).
w2v_model2.wv.most_similar(positive=["ladka"],topn=30)

[('tabiyat', 0.9394675493240356),
 ('udati', 0.9370902180671692),
 ('sharab', 0.9344600439071655),
 ('nadaani', 0.9282681941986084),
 ('isi', 0.9280832409858704),
 ('baantein', 0.9238791465759277),
 ('sharaba', 0.9229364395141602),
 ('teeno', 0.922819972038269),
 ('kheli', 0.9218839406967163),
 ('vaani', 0.9202572107315063),
 ('shama', 0.9202229976654053),
 ('tedhi', 0.9197221398353577),
 ('peena', 0.9191327095031738),
 ('ikki', 0.9177263975143433),
 ('taakat', 0.9174764752388),
 ('unknown', 0.9173531532287598),
 ('handa', 0.9165998697280884),
 ('khatra', 0.9148816466331482),
 ('copy', 0.913922905921936),
 ('insaani', 0.9138121604919434),
 ('dhareya', 0.9137499928474426),
 ('kijiye', 0.9134860634803772),
 ('behisab', 0.9132664799690247),
 ('dehki', 0.9123424291610718),
 ('bhasa', 0.9122605323791504),
 ('byopari', 0.9110100269317627),
 ('khauf', 0.9109174013137817),
 ('dhaba', 0.9095838665962219),
 ('loss', 0.9095075726509094),
 ('uthaya', 0.9094421863555908)]

using dictt

In [23]:
# Rewrite `doc1` in place: a token found in EtoH is replaced by the spelling
# HtoE stores for its Hindi form; all other tokens are left as they are.
for tokens in doc1:
    for pos, token in enumerate(tokens):
        if token in EtoH:
            tokens[pos] = HtoE[EtoH[token]]
     

In [24]:
# Embedding trained on `doc1` after its tokens were rewritten through the
# transliteration dictionary; hyper-parameters match the other per-song models.
# NOTE(review): `size` and `iter` are pre-4.0 gensim keyword names -- confirm
# the installed gensim version.
w2v_model3 = gensim.models.Word2Vec(
    doc1,
    size=300,
    window=15,
    min_count=2,
    workers=10,
    iter=10,
)

In [34]:
# 30 most similar words to "ladki" in `w2v_model3` (de-duplicated lyrics, dictionary-normalised tokens).
w2v_model3.wv.most_similar(positive=["ladki"],topn=30)

[('bhasad', 0.8294181823730469),
 ('isliye', 0.7912216186523438),
 ('margarita', 0.7850803732872009),
 ('tikhi', 0.7811753153800964),
 ('bhaalu', 0.7800151109695435),
 ('shaamat', 0.7738759517669678),
 ('upar', 0.7734025716781616),
 ('besharmi', 0.7685476541519165),
 ('bolti', 0.7656598687171936),
 ('ladka', 0.7649540901184082),
 ('sherni', 0.7605345249176025),
 ('sabaki', 0.7581216096878052),
 ('fullon', 0.7565603852272034),
 ('imliyon', 0.7562714219093323),
 ('animal', 0.7561612129211426),
 ('zulfen', 0.7523071765899658),
 ('gandi', 0.7517340779304504),
 ('gully', 0.7509292364120483),
 ('mastiyaan', 0.7475036978721619),
 ('rokta', 0.7474570274353027),
 ('bachane', 0.7430034875869751),
 ('frustration', 0.7405591011047363),
 ('joote', 0.7362022399902344),
 ('aiyyo', 0.7331588268280029),
 ('ghooma', 0.7327808141708374),
 ('ac', 0.732765793800354),
 ('sharmaati', 0.7327355742454529),
 ('york', 0.7319541573524475),
 ('dirty', 0.7307860851287842),
 ('bop', 0.7298811674118042)]

In [40]:
# 30 most similar words to "ladka" in `w2v_model3` (de-duplicated lyrics, dictionary-normalised tokens).
w2v_model3.wv.most_similar(positive=["ladka"],topn=30)

[('tabiyat', 0.9475221037864685),
 ('kijiye', 0.9408711194992065),
 ('gharavaali', 0.9392458200454712),
 ('affair', 0.9354391098022461),
 ('gardan', 0.9318481683731079),
 ('isi', 0.9308705925941467),
 ('inki', 0.9298444390296936),
 ('ujala', 0.9296845197677612),
 ('vaani', 0.9249085187911987),
 ('cheete', 0.9240995645523071),
 ('bheetar', 0.9225527048110962),
 ('sharaba', 0.9214366674423218),
 ('nange', 0.9210196733474731),
 ('bheje', 0.9207580089569092),
 ('kutegi', 0.9207130074501038),
 ('peekar', 0.920386791229248),
 ('citylights', 0.9203657507896423),
 ('mukshiya', 0.9203606843948364),
 ('boti', 0.9202898144721985),
 ('pujari', 0.920116662979126),
 ('scooter', 0.9192569255828857),
 ('gandi', 0.9191246032714844),
 ('karli', 0.9187825918197632),
 ('juuta', 0.9186747670173645),
 ('teeno', 0.9178236722946167),
 ('everyday', 0.9166399240493774),
 ('banti', 0.916577160358429),
 ('omkar', 0.915634274482727),
 ('tiya', 0.9150502681732178),
 ('dhandha', 0.9147684574127197)]

# word2vec doc-line wise 

without dictt

In [26]:
# Line-level embedding: every unique lyric line in `lyrics_sent` is its own
# sentence. This model uses window=10, whereas the per-song models use 15.
# Trained before the dictionary-based rewrite of `lyrics_sent` further down.
# NOTE(review): `size` and `iter` are pre-4.0 gensim keyword names -- confirm
# the installed gensim version.
w2v_model4 = gensim.models.Word2Vec(
    lyrics_sent,
    size=300,
    window=10,
    min_count=2,
    workers=10,
    iter=10,
)

In [33]:
# 30 most similar words to "ladki" in `w2v_model4` (line-wise sentences, raw spellings).
w2v_model4.wv.most_similar(positive=["ladki"],topn=30)

[('shabnami', 0.8927114605903625),
 ('ilteja', 0.8895827531814575),
 ('ladka', 0.8880984783172607),
 ('meethi', 0.8847825527191162),
 ('adhuri', 0.8837752342224121),
 ('khhal', 0.8831612467765808),
 ('lagti', 0.8790958523750305),
 ('adaa', 0.8722476959228516),
 ('hansi', 0.8680562973022461),
 ('mulaqaat', 0.864368200302124),
 ('narm', 0.8643609285354614),
 ('sargoshi', 0.8632267713546753),
 ('shartein', 0.8614993095397949),
 ('halki', 0.8607339859008789),
 ('talab', 0.8603284358978271),
 ('sheet', 0.8599220514297485),
 ('teekhi', 0.8589272499084473),
 ('aurat', 0.8581842184066772),
 ('tabahi', 0.8578122854232788),
 ('dabi', 0.8572455644607544),
 ('uski', 0.8572390079498291),
 ('achhi', 0.856391429901123),
 ('hasrat', 0.8563069105148315),
 ('saanwali', 0.8560760021209717),
 ('chingari', 0.8556385040283203),
 ('bigdi', 0.8552053570747375),
 ('rookhi', 0.854721188545227),
 ('leher', 0.8521286249160767),
 ('peedhi', 0.8510965704917908),
 ('jiski', 0.8492339849472046)]

In [42]:
# 30 most similar words to "ladka" in `w2v_model4` (line-wise sentences, raw spellings).
w2v_model4.wv.most_similar(positive=["ladka"],topn=30)

[('dilnasheen', 0.963772177696228),
 ('kaafi', 0.9495877027511597),
 ('sitara', 0.9483656883239746),
 ('tabahi', 0.9462727904319763),
 ('chhudaye', 0.9456583261489868),
 ('chumma', 0.944084644317627),
 ('disha', 0.9364385008811951),
 ('lehar', 0.9350032210350037),
 ('chingari', 0.9309958219528198),
 ('pyaasi', 0.930656909942627),
 ('judi', 0.9302205443382263),
 ('tika', 0.9301836490631104),
 ('mukhtasar', 0.930181622505188),
 ('chaaron', 0.9297844171524048),
 ('nashaa', 0.9263249635696411),
 ('bisra', 0.9255631566047668),
 ('maarpeet', 0.9254401922225952),
 ('leher', 0.9251890778541565),
 ('bhaagta', 0.9248265027999878),
 ('manto', 0.9247971773147583),
 ('sehra', 0.9247340559959412),
 ('guzra', 0.924484372138977),
 ('barsun', 0.924392819404602),
 ('mulaqaat', 0.9242676496505737),
 ('dhadakta', 0.9240180253982544),
 ('angle', 0.923642635345459),
 ('besabar', 0.9235888123512268),
 ('maange', 0.9234976768493652),
 ('makhmali', 0.9228940010070801),
 ('malaal', 0.9227485060691833)]

using dictt

In [28]:
# Rewrite `lyrics_sent` in place: a token found in EtoH is replaced by the
# spelling HtoE stores for its Hindi form; all other tokens are left as they are.
for tokens in lyrics_sent:
    for pos, token in enumerate(tokens):
        if token in EtoH:
            tokens[pos] = HtoE[EtoH[token]]
          

In [29]:
# Line-level embedding trained on `lyrics_sent` after its tokens were rewritten
# through the transliteration dictionary (window=10, as for the other
# line-wise model).
# NOTE(review): `size` and `iter` are pre-4.0 gensim keyword names -- confirm
# the installed gensim version.
w2v_model5 = gensim.models.Word2Vec(
    lyrics_sent,
    size=300,
    window=10,
    min_count=2,
    workers=10,
    iter=10,
)

In [30]:
# 30 most similar words to "ladki" in `w2v_model5` (line-wise sentences, dictionary-normalised tokens).
w2v_model5.wv.most_similar(positive=["ladki"],topn=30)

[('chingari', 0.9099149703979492),
 ('mulaaqaat', 0.9093437194824219),
 ('daasataan', 0.9052655696868896),
 ('adhuri', 0.9049338102340698),
 ('langoti', 0.9030437469482422),
 ('judi', 0.8964478969573975),
 ('nadi', 0.891637921333313),
 ('ulajhi', 0.8915126919746399),
 ('ladka', 0.8896576166152954),
 ('jhalak', 0.889411211013794),
 ('nadaani', 0.8869551420211792),
 ('lambi', 0.886256992816925),
 ('tishnagi', 0.886091947555542),
 ('lagti', 0.8856196999549866),
 ('nashaa', 0.8854144215583801),
 ('dikhi', 0.8851754665374756),
 ('hasrat', 0.8848868608474731),
 ('khamoshi', 0.8845192193984985),
 ('gaadi', 0.8843827247619629),
 ('rango', 0.88355553150177),
 ('aarzoo', 0.8833932876586914),
 ('ret', 0.8828492164611816),
 ('hansi', 0.8814917802810669),
 ('udaati', 0.8814011216163635),
 ('demand', 0.880613386631012),
 ('tamanaa', 0.877042829990387),
 ('ilteja', 0.8765590190887451),
 ('chuppi', 0.876535177230835),
 ('rishton', 0.8759307265281677),
 ('lipti', 0.8757636547088623)]

In [43]:
# 30 most similar words to "ladka" in `w2v_model5` (line-wise sentences, dictionary-normalised tokens).
w2v_model5.wv.most_similar(positive=["ladka"],topn=30)

[('banjar', 0.9644784927368164),
 ('chhudaye', 0.9529496431350708),
 ('lahar', 0.9516887664794922),
 ('imtehaan', 0.9489978551864624),
 ('toofaani', 0.9480761885643005),
 ('chumma', 0.9477107524871826),
 ('tadpaaye', 0.9475594162940979),
 ('rozana', 0.9459986090660095),
 ('chingari', 0.9453579783439636),
 ('hasne', 0.9439098834991455),
 ('chuppi', 0.9438514709472656),
 ('chowkidari', 0.9437969923019409),
 ('muskuraane', 0.9422505497932434),
 ('sehra', 0.9420530796051025),
 ('samandar', 0.940314531326294),
 ('kadaki', 0.9400314688682556),
 ('jismein', 0.9397870302200317),
 ('muskaan', 0.9393460750579834),
 ('kashti', 0.9389660358428955),
 ('nashaa', 0.9388132095336914),
 ('nikli', 0.938430666923523),
 ('hairat', 0.9377894997596741),
 ('makhmali', 0.93769371509552),
 ('ajeebsi', 0.9374241828918457),
 ('thandi', 0.9367092847824097),
 ('utni', 0.936041533946991),
 ('kaafi', 0.9357544183731079),
 ('khaatir', 0.9356319904327393),
 ('ulajhti', 0.93547523021698),
 ('ummeed', 0.9352965354919434

In [98]:
# Round-trip "sharab" through both maps to see which spelling the
# normalisation cells rewrite it to. Raises KeyError if "sharab" is not a key of EtoH.
HtoE[EtoH['sharab']]

'sharab'

In [70]:
# Print every tokenised lyric line that contains the token "saawala".
for tokens in filter(lambda line: "saawala" in line, lyrics_sent):
    print(tokens)

['saakoon', 'nah', 'meet', 'bana', 'we', 'saawala']
['sakun', 'nah', 'meet', 'bana', 'we', 'saawala']


In [100]:
# Similarity score between "ladka" and "sharab" in `w2v_model3`.
w2v_model3.wv.similarity(w1="ladka",w2="sharab")

0.897436793825698

In [88]:
# Similarity score between "ladka" and "saawala".
# NOTE(review): this probe queries `w2v_model5`, while the neighbouring
# similarity cells query `w2v_model3` -- the scores are not directly comparable.
w2v_model5.wv.similarity(w1="ladka",w2="saawala")

0.7234995355469539

In [99]:
# Similarity score between "ladki" and "sharab" in `w2v_model3`.
w2v_model3.wv.similarity(w1="ladki",w2="sharab")

0.6330556310961375

In [31]:
# w2v_model.wv.similarity(w1="ladki",w2="makhmali")

In [32]:
# w2v_model.wv.similarity(w1="ladka",w2="makhmali")