In [1]:
# to mount my drive for simplicity
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import nltk
import string
from nltk.stem import PorterStemmer

w2v = {}
df = pd.read_csv('drive/MyDrive/dataset_lyrics.csv')

nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))
docs_without_punctuation = []
ps = PorterStemmer()

docs_without_stopwords = df['lyrics'].apply(lambda x: ' '.join([w for w in x.split() if w not in (stop_words)]))
for row in docs_without_stopwords:
  docs_without_punctuation.append(row.translate(str.maketrans('','', string.punctuation)))
stemmed = [[ps.stem(w) for w in row.split(" ")] for row in docs_without_punctuation]
print(len(stemmed))



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


28372


In [3]:
from gensim.models import Word2Vec
model1 = Word2Vec(stemmed, min_count = 1,size = 10, window = 5, sg=0)
model1.wv.vocab

model2 = Word2Vec(stemmed, min_count = 1,size = 30, window = 10, sg=0)



In [5]:
import numpy as np

# avgerage pooling implementation
def avg_pool(doc, model):
    pool = np.array([0.0]*model.vector_size)
    
    for idx, word in enumerate(doc):
      vec = model.wv[word]
      pool += vec[0]

    pool = pool / len(doc)

        
    return pool

# maximum pooling implementation
def max_pool(doc, model):
    pool = np.array([0.0]*model.vector_size)
    
    for idx, word in enumerate(doc):
      vec = model.wv[word]
      for i, val in np.ndenumerate(vec):
        if abs(val) > abs(pool[i[1]]):
          pool[i[1]] = val
        
    return pool

# avgerage pool list of wvs
def avg_pool_wvs(wvs):
    pool = np.array([0.0]*len(wvs[0]))
    
    for idx, song in enumerate(wvs):
      pool += song

    pool = pool / len(wvs)
        
    return pool

# maxiumu pool list of wvs
def max_pool_wvs(wvs):
    pool = np.array([0.0]*len(wvs[0]))
    
    for idx, song in enumerate(wvs):
      for i, val in np.ndenumerate(song):
        if abs(val) > abs(pool[i[0]]):
          pool[i[0]] = val
        
    return pool

# cosine similarity implementation
def cosine_sim(a, b):
  dot = np.dot(a, b)
  aNorm = np.linalg.norm(a)
  bNorm = np.linalg.norm(b)

  denom = (aNorm * bNorm)

  if denom > 0.0:
    return dot / denom
  elif dot == 0.0:
    return 0.0

In [6]:
track_to_lyrics = dict()

track_to_wv = dict()
artist_to_wv = dict()
artist_to_wvs = dict()
genre_to_wv = dict()
genre_to_wvs = dict()
track_to_wv_test = dict()

track_to_wv_max = dict()
artist_to_wv_max = dict()
artist_to_wvs_max = dict()
genre_to_wv_max = dict()
genre_to_wvs_max = dict()
track_to_wv_test_max = dict()

def reset_globals():
  # reset globals 
  global track_to_lyrics

  global track_to_wv
  global artist_to_wv
  global artist_to_wvs
  global genre_to_wv
  global genre_to_wvs
  global track_to_wv_test

  global track_to_wv_max
  global artist_to_wv_max
  global artist_to_wvs_max
  global genre_to_wv_max
  global genre_to_wvs_max
  global track_to_wv_test_max
  
  track_to_lyrics = dict()

  track_to_wv = dict()
  artist_to_wv = dict()
  artist_to_wvs = dict()
  genre_to_wv = dict()
  genre_to_wvs = dict()
  track_to_wv_test = dict()

  track_to_wv_max = dict()
  artist_to_wv_max = dict()
  artist_to_wvs_max = dict()
  genre_to_wv_max = dict()
  genre_to_wvs_max = dict()
  track_to_wv_test_max = dict()

def extract_data(model, test_size):
  reset_globals()

  test = test_size
  cnt = 0

  # split dataset into test, train data
  for index, row in df.iterrows():
    # prepare train data
    if cnt < test:
      docs_without_punctuation = []
      docs_without_stopwords = [w for w in row['lyrics'].split() if w not in (stop_words)]
      docs_without_punctuation = [w for w in docs_without_stopwords if w not in (string.punctuation)]

      stemmed = [[ps.stem(w) for w in roww.split(" ")] for roww in docs_without_punctuation]

      track_to_lyrics[str(row['artist_name']) + " " + str(row['track_name']) + " " + str(row['release_date'])] = stemmed
      track_to_wv[str(row['artist_name']) + " " + str(row['track_name']) + " " + str(row['release_date'])] = avg_pool(stemmed, model)
      track_to_wv_max[str(row['artist_name']) + " " + str(row['track_name']) + " " + str(row['release_date'])] = max_pool(stemmed, model)

      if str(row['artist_name']) not in artist_to_wvs:
        artist_to_wvs[str(row['artist_name'])] = []
        artist_to_wvs_max[str(row['artist_name'])] = []
      if str(row['genre']) not in genre_to_wvs:
        genre_to_wvs[str(row['genre'])] = []
        genre_to_wvs_max[str(row['genre'])] = []
      artist_to_wvs[str(row['artist_name'])].append(avg_pool(stemmed, model))
      artist_to_wvs_max[str(row['artist_name'])].append(max_pool(stemmed, model))
      genre_to_wvs[str(row['genre'])].append(avg_pool(stemmed, model))
      genre_to_wvs_max[str(row['genre'])].append(max_pool(stemmed, model))

    # or prepare test data
    elif cnt >= test:
      docs_without_punctuation = []
      docs_without_stopwords = [w for w in row['lyrics'].split() if w not in (stop_words)]
      docs_without_punctuation = [w for w in docs_without_stopwords if w not in (string.punctuation)]

      stemmed = [[ps.stem(w) for w in roww.split(" ")] for roww in docs_without_punctuation]

      track_to_lyrics[str(row['artist_name']) + " " + str(row['track_name']) + " " + str(row['release_date'])] = stemmed
      track_to_wv_test[str(row['artist_name']) + " " + str(row['track_name']) + " " + str(row['release_date'])] = avg_pool(stemmed, model)
      track_to_wv_test_max[str(row['artist_name']) + " " + str(row['track_name']) + " " + str(row['release_date'])] = max_pool(stemmed, model)
    
    if cnt == int(test*1.2):
      break
    cnt = cnt + 1

  for name, songs in artist_to_wvs.items():
    artist_to_wv[name] = avg_pool_wvs(songs)
    artist_to_wv_max[name] = max_pool_wvs(songs)

  for name, songs in genre_to_wvs.items():
    genre_to_wv[name] = avg_pool_wvs(songs)
    genre_to_wv_max[name] = max_pool_wvs(songs)

extract_data(model1, 10000)

print(len(artist_to_wv))
print(artist_to_wv['ricky skaggs'])
print(artist_to_wv_max['ricky skaggs'])

2047
[-0.57010402  2.8910937   0.47800374  1.00027635  1.95123399 -0.38375959
 -0.4560154  -1.37622476  0.95674472  2.22095609]
[-2.44391821  4.05838157  2.43635092  2.26530496  3.01332528 -2.23334832
 -1.58690384 -2.63245206  2.12948244  4.07038303]


In [7]:
# remove stopwords, punctuaction, and do stemming.
# returns stemmed query and average pooled word vector
def preprocess_query(user_query, model, pooling='avg'):
  user_query_without_stopwords = []
  user_query_without_punctuation = []
  user_query_without_stopwords = [w for w in user_query.split() if w not in (stop_words)]
  user_query_without_punctuation = [w for w in user_query_without_stopwords if w not in (string.punctuation)]

  user_query_stemmed = [[ps.stem(w) for w in roww.split(" ")] for roww in user_query_without_punctuation]
  if pooling == 'avg':
    return user_query_stemmed, avg_pool(user_query_stemmed, model)
  elif pooling == 'max':
    return user_query_stemmed, max_pool(user_query_stemmed, model)

In [8]:
# the doors - hello, i love you
the_doors_user_query = "hello i love you"

user_query_stemmed, user_query_stemmed_wv = preprocess_query(the_doors_user_query, model1)

In [9]:
import torch
from torch import nn

# Create model
device = "cpu"

# Define model
class LyricsSimilarityModel1(nn.Module):
    def __init__(self):
        super(LyricsSimilarityModel1, self).__init__()
        self.linear = torch.nn.Sequential(
            torch.nn.Linear(10, 10),
            torch.nn.ReLU(),
            torch.nn.Linear(10, 1)
            
        )

    def forward(self, inputs_):
        x = self.linear(inputs_)
        return x

lyricsSimModel1 = LyricsSimilarityModel1().to(device)
lyricsSimModel1 = lyricsSimModel1.float()

In [10]:
def train(data, user_query_stemmed, model, loss_fn, optimizer):
  model.train()

  for key, item in data.items():
    # predict output using one hot encoding of word

    pred = model(torch.tensor(item).float())

    target = cosine_sim(avg_pool(user_query_stemmed, model1), item)

    #print(pred)
    #print(torch.tensor([target]).float())

    # Compute prediction error
    loss = loss_fn(pred, torch.tensor([target]).float())

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [11]:
def test(data, user_query_stemmed, model, loss_fn, optimizer):
  model.eval()

  threshold = 0.01
  correct = 0

  for key, item in data.items():
    # predict output using one hot encoding of word

    pred = model(torch.tensor(item).float())

    target = cosine_sim(avg_pool(user_query_stemmed, model1), item)

    #print(pred)
    #print(torch.tensor([target]).float())

    # Compute prediction error
    loss = loss_fn(pred, torch.tensor([target]).float())
    #print(loss)
    #print()

    if (target.item() - pred.item()) <= threshold:
      correct += 1
  
  print("accuracy: " + str(correct / len(data.items())))
  return

In [12]:
# Baseline LyricsModel1 

optimizer = torch.optim.Adagrad(lyricsSimModel1.parameters(), lr=0.2)
loss_fn = torch.nn.BCEWithLogitsLoss()

epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(track_to_wv, user_query_stemmed, lyricsSimModel1, loss_fn, optimizer)
    test(track_to_wv_test, user_query_stemmed, lyricsSimModel1, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------
accuracy: 0.6401799100449775
Epoch 2
-------------------------------
accuracy: 0.6421789105447276
Epoch 3
-------------------------------
accuracy: 0.6516741629185407
Epoch 4
-------------------------------
accuracy: 0.6531734132933533
Epoch 5
-------------------------------
accuracy: 0.6516741629185407
Done!


In [13]:
# LyricsModel1 with other loss / optimizer

lyricsSimModel1 = LyricsSimilarityModel1().to(device)
lyricsSimModel1 = lyricsSimModel1.float()

# use mseloss because this is not a classification task
# also change optimizer for testing
optimizer = torch.optim.SGD(lyricsSimModel1.parameters(), lr=0.05)
loss_fn = torch.nn.MSELoss()

epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(track_to_wv, user_query_stemmed, lyricsSimModel1, loss_fn, optimizer)
    test(track_to_wv_test, user_query_stemmed, lyricsSimModel1, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------
accuracy: 0.6891554222888556
Epoch 2
-------------------------------
accuracy: 0.6926536731634183
Epoch 3
-------------------------------
accuracy: 0.68015992003998
Epoch 4
-------------------------------
accuracy: 0.6796601699150425
Epoch 5
-------------------------------
accuracy: 0.688655672163918
Done!


In [14]:
# Create model
device = "cpu"

# Define 2nd model with more layers
class LyricsSimilarityModel2(nn.Module):
    def __init__(self):
        super(LyricsSimilarityModel2, self).__init__()
        self.linear = torch.nn.Sequential(
            torch.nn.Linear(10, 20),
            torch.nn.ReLU(),
            torch.nn.Linear(20, 10),
            torch.nn.ReLU(),
            torch.nn.Linear(10, 1)
        )

    def forward(self, inputs_):
        x = self.linear(inputs_)
        return x

lyricsSimModel2 = LyricsSimilarityModel2().to(device)
lyricsSimModel2 = lyricsSimModel2.float()

In [15]:
# use mseloss because this is not a classification task
# also change optimizer for testing
optimizer = torch.optim.SGD(lyricsSimModel2.parameters(), lr=0.05)
loss_fn = torch.nn.MSELoss()

epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(track_to_wv, user_query_stemmed, lyricsSimModel2, loss_fn, optimizer)
    test(track_to_wv_test, user_query_stemmed, lyricsSimModel2, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------
accuracy: 0.6141929035482259
Epoch 2
-------------------------------
accuracy: 0.6621689155422289
Epoch 3
-------------------------------
accuracy: 0.7251374312843578
Epoch 4
-------------------------------
accuracy: 0.7941029485257372
Epoch 5
-------------------------------
accuracy: 0.8470764617691154
Done!


In [16]:
# Create model
device = "cpu"

# Define 2nd model with more layers
class LyricsSimilarityModel3(nn.Module):
    def __init__(self):
        super(LyricsSimilarityModel3, self).__init__()
        self.linear = torch.nn.Sequential(
            torch.nn.Linear(10, 20),
            torch.nn.ReLU(),
            torch.nn.Linear(20, 30),
            torch.nn.ReLU(),
            torch.nn.Linear(30, 20),
            torch.nn.ReLU(),
            torch.nn.Linear(20, 10),
            torch.nn.ReLU(),
            torch.nn.Linear(10, 1),
        )

    def forward(self, inputs_):
        x = self.linear(inputs_)
        return x

lyricsSimModel3 = LyricsSimilarityModel3().to(device)
lyricsSimModel3 = lyricsSimModel3.float()

In [17]:
# use mseloss because this is not a classification task
# also change optimizer for testing
optimizer = torch.optim.SGD(lyricsSimModel3.parameters(), lr=0.05)
loss_fn = torch.nn.MSELoss()

epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(track_to_wv, user_query_stemmed, lyricsSimModel3, loss_fn, optimizer)
    test(track_to_wv_test, user_query_stemmed, lyricsSimModel3, loss_fn, optimizer)
print("Done!")

# LyricsModel3 seems to work fine for now

Epoch 1
-------------------------------
accuracy: 0.5417291354322838
Epoch 2
-------------------------------
accuracy: 0.7131434282858571
Epoch 3
-------------------------------
accuracy: 0.7446276861569215
Epoch 4
-------------------------------
accuracy: 0.7766116941529235
Epoch 5
-------------------------------
accuracy: 0.783608195902049
Done!


In [18]:
# word2vec model results
query = []
for item in user_query_stemmed:
  query.append(item[0])
print(model1.most_similar(query)[:10])

[('sickasick', 0.88801109790802), ('chalet', 0.887222409248352), ('darl', 0.88182532787323), ('romanc', 0.8809159994125366), ('melook', 0.8808699250221252), ('belong', 0.8768244385719299), ('miss', 0.8751754760742188), ('awhil', 0.8751081824302673), ('youx', 0.8743777871131897), ('sweet', 0.8687548637390137)]


  print(model1.most_similar(query)[:10])


In [19]:
# add some more user queries based on song lyrics

# Godsmack - Good day to die
godsmack_user_query = "Today is a good day to die"
user_query_stemmed_godsmack, user_query_stemmed_wv_godsmack = preprocess_query(godsmack_user_query, model1)

# this song is not in the dataset!
# Nirvana - Frances farmer will have her revenge on seattle
nirvana_user_query = "I miss the comfort in being sad"
user_query_stemmed_nirvana, user_query_stemmed_wv_nirvana = preprocess_query(nirvana_user_query, model1)

In [20]:
# put all user queries in a list for simplicity
user_queries = [the_doors_user_query, godsmack_user_query, nirvana_user_query]

In [21]:
optimizer = torch.optim.SGD(lyricsSimModel3.parameters(), lr=0.05)
loss_fn = torch.nn.MSELoss()

epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(track_to_wv, user_query_stemmed,lyricsSimModel3, loss_fn, optimizer)
    train(track_to_wv, user_query_stemmed_godsmack,lyricsSimModel3, loss_fn, optimizer)
    test(track_to_wv_test, user_query_stemmed, lyricsSimModel3, loss_fn, optimizer)
    test(track_to_wv_test, user_query_stemmed_godsmack, lyricsSimModel3, loss_fn, optimizer)

    # test one query which is not in the dataset and not trained
    test(track_to_wv_test, user_query_stemmed_nirvana, lyricsSimModel3, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------
accuracy: 0.7391304347826086
accuracy: 0.9035482258870565
accuracy: 0.31284357821089454
Epoch 2
-------------------------------
accuracy: 0.7436281859070465
accuracy: 0.9295352323838081
accuracy: 0.31334332833583206
Epoch 3
-------------------------------
accuracy: 0.7376311844077961
accuracy: 0.92103948025987
accuracy: 0.3058470764617691
Epoch 4
-------------------------------
accuracy: 0.735632183908046
accuracy: 0.9180409795102449
accuracy: 0.3073463268365817
Epoch 5
-------------------------------
accuracy: 0.729135432283858
accuracy: 0.889055472263868
accuracy: 0.2988505747126437
Done!


In [22]:
extract_data(model1, 100000)

# get most similar songs to all given user queries
def get_x_most_similar_songs_to(q, x, model, pooling='avg'):
  q_stemmed, q_stemmed_wv = preprocess_query(q, model, pooling)

  most_sim = dict()
  for name, wv in track_to_wv.items():
    most_sim[name] = cosine_sim(wv, q_stemmed_wv)
  print("The 10 most similar (cosine_sim) songs by lyrics to the user query '" + q + "':")
  return sorted(most_sim.items(), key=lambda item: item[1], reverse=True)[:x]

for q in user_queries:
  print(get_x_most_similar_songs_to(q, 10, model1))

The 10 most similar (cosine_sim) songs by lyrics to the user query 'hello i love you':
[('the meters be my lady 1977', 0.9685838989570518), ('ohio players sweet sticky thing 1975', 0.9679238176530158), ("the beach boys i'll bet he's nice 1977", 0.9610735591303842), ('jerry garcia how sweet it is 1988', 0.9577514677803745), ('chuck berry sweet sixteen 1961', 0.9573350192311404), ('fred astaire he loves and she loves 1957', 0.956496143432206), ('karen dalton how sweet it is 1971', 0.9556027122653002), ('the everly brothers some sweet day 1960', 0.9555265891824459), ('mickey & sylvia forever and a day 1957', 0.9553664859839853), ('aretha franklin love is the only thing 1961', 0.9529284150757323)]
The 10 most similar (cosine_sim) songs by lyrics to the user query 'Today is a good day to die':
[('gladys knight & the pips the way we were / try to remember 1974', 0.9669285163705549), ('tony bennett anything goes 1959', 0.9581328719669597), ('count basie anything goes 1959', 0.9576195842307492

In [23]:
# get most similar artists to a given artist
def get_x_most_similar_artists_to(artist_name, artist_to_wv, x):
  artist_wv = artist_to_wv[artist_name]
  most_sim = dict()
  for name, wv in artist_to_wv.items():
    most_sim[name] = cosine_sim(wv, artist_wv)
  print("The 10 most similar (cosine_sim) songs by lyrics to the user query '" + artist_name + "':")

  return sorted(most_sim.items(), key=lambda item: item[1], reverse=True)[:x]

# test model1
print(get_x_most_similar_artists_to('snoop dogg', artist_to_wv, 10))
print(get_x_most_similar_artists_to('nirvana', artist_to_wv, 10))
print(get_x_most_similar_artists_to('snoop dogg', artist_to_wv_max, 10))


# test model2
#extract_data(model2)
#print(get_x_most_similar_artists_to('snoop dogg', 10))

The 10 most similar (cosine_sim) songs by lyrics to the user query 'snoop dogg':
[('snoop dogg', 1.0000000000000002), ('greyboy', 0.9981214070669466), ('jeezy', 0.9956759299983424), ('lloyd banks', 0.9939393843877524), ('yung joc', 0.9934436160058329), ('t.k. soul', 0.9934060719934), ('mack 10', 0.9932159525178528), ('mike will made-it', 0.9923732173693108), ('a$ap rocky', 0.9921382152636611), ('kodak black', 0.9919490329894536)]
The 10 most similar (cosine_sim) songs by lyrics to the user query 'nirvana':
[('nirvana', 1.0), ('hazhe', 0.99614489491428), ('the dillinger escape plan', 0.9958584116267591), ('part one tribe', 0.9957346557973328), ('the 1975', 0.9942684539386565), ('motörhead', 0.9933760632090229), ('paul bley', 0.9922637141731121), ('j. roddy walston & the business', 0.9915604309184929), ('marcus johnson', 0.9915319760662453), ('deftones', 0.9910861908705328)]
The 10 most similar (cosine_sim) songs by lyrics to the user query 'snoop dogg':
[('snoop dogg', 0.999999999999999

In [24]:
# get artist which is most likely to write these words
def get_x_artists_most_likely_to_write(user_input, x, wvs, model, pooling='avg'):
  user_input_stemmed, user_input_wv = preprocess_query(user_input, model, pooling)
  most_sim = dict()
  for name, wv in wvs.items():
    most_sim[name] = cosine_sim(wv, user_input_wv)
  print("The 10 artists which write most similar (cosine_sim) songs to the user query '" + user_input + "':")

  return sorted(most_sim.items(), key=lambda item: item[1], reverse=True)[:x]

# test model1
print(get_x_artists_most_likely_to_write(the_doors_user_query, 10, artist_to_wv,  model1))
print(get_x_artists_most_likely_to_write(godsmack_user_query, 10, artist_to_wv, model1))
print(get_x_artists_most_likely_to_write(nirvana_user_query, 10, artist_to_wv, model1))
print()
print(get_x_artists_most_likely_to_write(the_doors_user_query, 10, artist_to_wv_max, model1, 'max'))
print(get_x_artists_most_likely_to_write(godsmack_user_query, 10, artist_to_wv_max, model1, 'max'))
print(get_x_artists_most_likely_to_write(nirvana_user_query, 10, artist_to_wv_max, model1, 'max'))


# test model2
#extract_data(model2)
#print(get_x_most_similar_artists_to('snoop dogg', 10))

The 10 artists which write most similar (cosine_sim) songs to the user query 'hello i love you':
[('mickey & sylvia', 0.9553664859839853), ('the del-vikings', 0.9314836247192461), ('daniel johnston', 0.923066733050798), ('stampeders', 0.9200642885668358), ('harry james', 0.9167082561653859), ('the elgins', 0.914193131484855), ('the spaniels', 0.9114387664364751), ('betty everett', 0.9112145178517222), ('scouting for girls', 0.9095968921248402), ('the academy is...', 0.9094988246339704)]
The 10 artists which write most similar (cosine_sim) songs to the user query 'Today is a good day to die':
[('the walkmen', 0.9517135177951018), ('walt weiskopf', 0.9457664623063651), ('eddie "cleanhead" vinson', 0.9430593206323605), ('girl in red', 0.9313983480973819), ("the mowgli's", 0.9310143459036847), ('five for fighting', 0.928601026835882), ('grandaddy', 0.9266626419250261), ('guadalcanal diary', 0.9235787622343925), ('brenda boykin', 0.922697154099902), ('glenn medeiros', 0.9213058853368987)]
T

In [25]:
# get genre which is most likely to use these words
def get_x_most_similar_genres_to(user_input, x, genre_to_wv, model, pooling='avg'):
  user_input_stemmed, user_input_wv = preprocess_query(user_input, model, pooling)
  most_sim = dict()
  for name, wv in genre_to_wv.items():
    most_sim[name] = cosine_sim(wv, user_input_wv)
  print("The 10 genre which are most likely (cosine_sim) to use the words in the user query '" + user_input + "':")

  return sorted(most_sim.items(), key=lambda item: item[1], reverse=True)[:x]
# test model1
print(get_x_most_similar_genres_to(the_doors_user_query, 10, genre_to_wv,  model1))
print(get_x_most_similar_genres_to(godsmack_user_query, 10, genre_to_wv, model1))
print(get_x_most_similar_genres_to(nirvana_user_query, 10, genre_to_wv, model1))
print()
print(get_x_most_similar_genres_to(the_doors_user_query, 10, genre_to_wv_max, model1, 'max'))
print(get_x_most_similar_genres_to(godsmack_user_query, 10, genre_to_wv_max, model1, 'max'))
print(get_x_most_similar_genres_to(nirvana_user_query, 10, genre_to_wv_max, model1, 'max'))


# test model2
#extract_data(model2)
#print(get_x_most_similar_artists_to('snoop dogg', 10))

The 10 genre which are most likely (cosine_sim) to use the words in the user query 'hello i love you':
[('country', 0.7780120620734993), ('jazz', 0.7701153848455616), ('pop', 0.7642455802757708), ('blues', 0.7634944823781573), ('rock', 0.7148250768789185), ('reggae', 0.7095055821485913), ('hip hop', 0.5734617973002161)]
The 10 genre which are most likely (cosine_sim) to use the words in the user query 'Today is a good day to die':
[('jazz', 0.8269915699383722), ('country', 0.822279643485579), ('reggae', 0.8158668028183959), ('rock', 0.814141134860063), ('pop', 0.8122578268520283), ('blues', 0.8121397485940219), ('hip hop', 0.6629795633834208)]
The 10 genre which are most likely (cosine_sim) to use the words in the user query 'I miss the comfort in being sad':
[('country', 0.9150427381933491), ('jazz', 0.9110325771603796), ('blues', 0.8974738392354822), ('pop', 0.8971890869156524), ('rock', 0.8942365689826056), ('reggae', 0.8471966256328908), ('hip hop', 0.6214625754938634)]

The 10 gen

In [26]:
# get genre which is most likely to use these words
def get_x_most_similar_genres_for_artist(artist_name, x, artist_to_wv, genre_to_wv, model, pooling='avg'):
  artist_wv = artist_to_wv[artist_name]
  most_sim = dict()
  for name, wv in genre_to_wv.items():
    most_sim[name] = cosine_sim(wv, artist_wv)
  print("The 10 genre which are most likely (cosine_sim) to use the words in the user query '" + artist_name + "':")

  return sorted(most_sim.items(), key=lambda item: item[1], reverse=True)[:x]
# test model1
print(get_x_most_similar_genres_for_artist('snoop dogg', 10, artist_to_wv, genre_to_wv,  model1))
print(get_x_most_similar_genres_for_artist('rick astley', 10, artist_to_wv, genre_to_wv,  model1))
print(get_x_most_similar_genres_for_artist('godsmack', 10, artist_to_wv, genre_to_wv,  model1))
print(get_x_most_similar_genres_for_artist('johnny cash', 10, artist_to_wv, genre_to_wv,  model1))
print(get_x_most_similar_genres_for_artist('inner circle', 10, artist_to_wv, genre_to_wv,  model1))
print(get_x_most_similar_genres_for_artist('rihanna', 10, artist_to_wv, genre_to_wv,  model1))


The 10 genre which are most likely (cosine_sim) to use the words in the user query 'snoop dogg':
[('hip hop', 0.974797194168287), ('reggae', 0.8312961911698123), ('blues', 0.7755715162410641), ('pop', 0.770683782203893), ('rock', 0.7641106599353487), ('jazz', 0.7519213400278768), ('country', 0.7328287776666594)]
The 10 genre which are most likely (cosine_sim) to use the words in the user query 'rick astley':
[('pop', 0.9665704375226811), ('country', 0.9648340192092438), ('blues', 0.9629008304345614), ('jazz', 0.956754528328227), ('rock', 0.956034484592843), ('reggae', 0.9378184923661248), ('hip hop', 0.7940250035515559)]
The 10 genre which are most likely (cosine_sim) to use the words in the user query 'godsmack':
[('rock', 0.9851672722585053), ('reggae', 0.9787976697165468), ('pop', 0.9760834786604645), ('blues', 0.9732512376037322), ('jazz', 0.967869451348358), ('country', 0.9666460865402586), ('hip hop', 0.8773036228719463)]
The 10 genre which are most likely (cosine_sim) to use the

In [27]:
extract_data(model2, 100000)

# test model2

for q in user_queries:
  print(get_x_most_similar_songs_to(q, 10, model2))
print()
print()
print(get_x_most_similar_artists_to('snoop dogg', artist_to_wv, 10))
print(get_x_most_similar_artists_to('nirvana', artist_to_wv, 10))
print(get_x_most_similar_artists_to('snoop dogg', artist_to_wv_max, 10))
print()
print()
print(get_x_artists_most_likely_to_write(the_doors_user_query, 10, artist_to_wv,  model2))
print(get_x_artists_most_likely_to_write(godsmack_user_query, 10, artist_to_wv, model2))
print(get_x_artists_most_likely_to_write(nirvana_user_query, 10, artist_to_wv, model2))
print()
print(get_x_artists_most_likely_to_write(the_doors_user_query, 10, artist_to_wv_max, model2, 'max'))
print(get_x_artists_most_likely_to_write(godsmack_user_query, 10, artist_to_wv_max, model2, 'max'))
print(get_x_artists_most_likely_to_write(nirvana_user_query, 10, artist_to_wv_max, model2, 'max'))
print()
print()
print(get_x_most_similar_genres_to(the_doors_user_query, 10, genre_to_wv,  model2))
print(get_x_most_similar_genres_to(godsmack_user_query, 10, genre_to_wv, model2))
print(get_x_most_similar_genres_to(nirvana_user_query, 10, genre_to_wv, model2))
print()
print(get_x_most_similar_genres_to(the_doors_user_query, 10, genre_to_wv_max, model2, 'max'))
print(get_x_most_similar_genres_to(godsmack_user_query, 10, genre_to_wv_max, model2, 'max'))
print(get_x_most_similar_genres_to(nirvana_user_query, 10, genre_to_wv_max, model2, 'max'))
print()
print()
print(get_x_most_similar_genres_for_artist('snoop dogg', 10, artist_to_wv, genre_to_wv,  model2))
print(get_x_most_similar_genres_for_artist('rick astley', 10, artist_to_wv, genre_to_wv,  model2))
print(get_x_most_similar_genres_for_artist('godsmack', 10, artist_to_wv, genre_to_wv,  model2))
print(get_x_most_similar_genres_for_artist('johnny cash', 10, artist_to_wv, genre_to_wv,  model2))
print(get_x_most_similar_genres_for_artist('inner circle', 10, artist_to_wv, genre_to_wv,  model2))
print(get_x_most_similar_genres_for_artist('rihanna', 10, artist_to_wv, genre_to_wv,  model2))

The 10 most similar (cosine_sim) songs by lyrics to the user query 'hello i love you':
[('chubby checker lovely, lovely (loverly, loverly) 1964', 0.860622306730079), ('colin blunstone she loves the way they love her 1971', 0.8502115483223693), ('michael franks loving you more and more 1980', 0.8468794115459548), ('fred astaire he loves and she loves 1957', 0.8413914606670269), ("ella fitzgerald it's a lovely day today 1958", 0.8403506935329526), ('michael franks i love lucy 1993', 0.8371556378641917), ('love hummingbirds 1967', 0.8368101729568674), ('cocoa tea she loves me now 1994', 0.8292443336916949), ('jackie wilson to be loved 1958', 0.8273730440766199), ('george strait all of me (loves all of you) 1992', 0.8268668433755615)]
The 10 most similar (cosine_sim) songs by lyrics to the user query 'Today is a good day to die':
[('rosemary clooney anything goes 1982', 0.8599961416636793), ('count basie anything goes 1959', 0.8595572266509675), ("merle haggard i think we're livin' in the 