<a href="https://colab.research.google.com/github/abdurrafiarief/Song-Chatbot/blob/main/Song_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Sumber dataset https://www.kaggle.com/imuhammad/audio-features-and-lyrics-of-spotify-songs
# https://docs.google.com/document/d/1DPdsfIHn1nFRBsNYk1oId3KooaP8uQfek6noMv_3g3g/edit
import numpy as np
import nltk
import pandas as pd
import re

In [7]:
!pip install pyspellchecker



In [8]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
print(open('spotify_songs.csv'))

<_io.TextIOWrapper name='spotify_songs.csv' mode='r' encoding='UTF-8'>


# Pre-processing

## **[Filter based on language]**
---

In [10]:
df = pd.read_csv('spotify_songs.csv', encoding='UTF-8')
sorted = df.loc[df['language'] == 'en']
sorted = sorted[['track_id','track_name','track_artist','lyrics','playlist_genre','playlist_subgenre','track_popularity']]
sorted = sorted.dropna()
sorted.to_csv('data.csv',index=False)

## **[Tokenization]**
---

### Cleansing lyric from contractions

In [11]:
# Dictionary of english Contractions from https://www.analyticsvidhya.com/blog/2020/08/information-retrieval-using-word2vec-based-vector-space-model/ 
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not","can't": "can not","can't've": "cannot have",
"'cause": "because","could've": "could have","couldn't": "could not","couldn't've": "could not have",
"didn't": "did not","doesn't": "does not","don't": "do not","hadn't": "had not","hadn't've": "had not have",
"hasn't": "has not","haven't": "have not","he'd": "he would","he'd've": "he would have","he'll": "he will",
"he'll've": "he will have","how'd": "how did","how'd'y": "how do you","how'll": "how will","i'd": "i would",
"i'd've": "i would have","i'll": "i will","i'll've": "i will have","i'm": "i am","i've": "i have",
"isn't": "is not","it'd": "it would","it'd've": "it would have","it'll": "it will","it'll've": "it will have",
"let's": "let us","ma'am": "madam","mayn't": "may not","might've": "might have","mightn't": "might not",
"mightn't've": "might not have","must've": "must have","mustn't": "must not","mustn't've": "must not have",
"needn't": "need not","needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
"oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
"shan't've": "shall not have","she'd": "she would","she'd've": "she would have","she'll": "she will",
"she'll've": "she will have","should've": "should have","shouldn't": "should not",
"shouldn't've": "should not have","so've": "so have","that'd": "that would","that'd've": "that would have",
"there'd": "there would","there'd've": "there would have",
"they'd": "they would","they'd've": "they would have","they'll": "they will","they'll've": "they will have",
"they're": "they are","they've": "they have","to've": "to have","wasn't": "was not","we'd": "we would",
"we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have",
"weren't": "were not","what'll": "what will","what'll've": "what will have","what're": "what are",
"what've": "what have","when've": "when have","where'd": "where did",
"where've": "where have","who'll": "who will","who'll've": "who will have","who've": "who have",
"why've": "why have","will've": "will have","won't": "will not","won't've": "will not have",
"would've": "would have","wouldn't": "would not","wouldn't've": "would not have","y'all": "you all",
"y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
"you'd": "you would","you'd've": "you would have","you'll": "you will","you'll've": "you will have",
"you're": "you are","you've": "you have"}
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

def cleansed_docs(doc):
  x = pd.Series([i.lower() for i in doc])
  return x.apply(lambda x:expand_contractions(x))

lyrics = cleansed_docs(sorted['lyrics'])
titles = cleansed_docs(sorted['track_name'])
lyrics

0        the trees, are singing in the wind the sky blu...
1        na yeah, spyderman and freeze in full effect u...
2        i really can not stay baby it is cold outside ...
3        get up out of my business you do not keep me f...
4        hold your breath, do not look down, keep tryin...
                               ...                        
15400    yeah, nah, nah, nah, nah, nah, nah, nah, nah n...
15401    caught up in such a head rush, wide-eyed latel...
15402    my anaconda do not, my anaconda do not my anac...
15403    bound, bound bound, bound bound to fall in lov...
15404    (would you do for me) sweetheart (would you do...
Length: 15405, dtype: object

In [12]:
def tokenize_docs(doc):
  output = []
  for i in doc:
    token = nltk.tokenize.word_tokenize(i)
    clean_token = []
    for i in token:
      if str.isalnum(i):
        clean_token.append(i)
    output.append(clean_token)
  return output

### Tokenize lyric and titles

In [13]:
tok_lyric = tokenize_docs(lyrics)
tok_lyric[0]

['the',
 'trees',
 'are',
 'singing',
 'in',
 'the',
 'wind',
 'the',
 'sky',
 'blue',
 'only',
 'as',
 'it',
 'can',
 'be',
 'and',
 'the',
 'angels',
 'smiled',
 'at',
 'me',
 'i',
 'saw',
 'you',
 'in',
 'that',
 'lonely',
 'bench',
 'at',
 'half',
 'past',
 'four',
 'i',
 'kissed',
 'your',
 'soft',
 'soft',
 'hands',
 'and',
 'at',
 '6',
 'i',
 'kissed',
 'your',
 'lips',
 'and',
 'the',
 'angels',
 'smiled',
 'i',
 'thought',
 'hey',
 'i',
 'feel',
 'alive',
 'the',
 'park',
 'sign',
 'said',
 'it',
 'was',
 'closed',
 'and',
 'we',
 'jumped',
 'that',
 'fence',
 'with',
 'no',
 'cares',
 'at',
 'all',
 'and',
 'we',
 'kissed',
 'under',
 'a',
 'tree',
 'we',
 'danced',
 'under',
 'the',
 'midnight',
 'sun',
 'and',
 'i',
 'loved',
 'you',
 'without',
 'knowing',
 'you',
 'at',
 'all',
 'and',
 'we',
 'laughed',
 'and',
 'felt',
 'so',
 'free',
 'and',
 'the',
 'angels',
 'they',
 'smiled',
 'i',
 'thought',
 'hey',
 'i',
 'feel',
 'alive']

In [14]:
tok_title = tokenize_docs(titles)
tok_title[0]

['i', 'feel', 'alive']

### Tokenize and cleansing artist name

In [15]:
subt_dict = { "\$": "s","&":" and ","-": " ","!": "i","\/":" ", "'n": " and", "n'": "and", "\.":""}
temp_artist = sorted[['track_id','track_artist']].values.tolist()
new_artists = [] 
for words in temp_artist:
  result = words[1]
  for keys in subt_dict.keys():
    result = re.sub(r'{}'.format(keys),subt_dict[keys],result)
  new_artists.append({words[0]: result})

In [16]:
tok_artist = []
for dict_word in new_artists:
  for key,values in dict_word.items():
    token = nltk.tokenize.word_tokenize(values.lower())
    clean_token = []
    for i in token:
      if str.isalnum(i):
        clean_token.append(i)
    tok_artist.append({key: clean_token})

In [17]:
tok_artist

[{'004s3t0ONYlzxII9PLgU6z': ['steady', 'rollin']},
 {'00chLpzhgVjxs1zKC9UScL': ['bell', 'biv', 'devoe']},
 {'00cqd6ZsSkLZqGMlQCR0Zo': ['ceelo', 'green']},
 {'00emjlCv9azBN0fzuuyLqy': ['kard']},
 {'00f9VGHfQhAHMCQ2bSjg3D': ['james', 'tw']},
 {'00FROhC5g4iJdax5US8jRr': ['diddy']},
 {'00GfGwzlSB8DoA0cDP2Eit': ['babyface']},
 {'00Gu3RMpDW2vO9PjlMVFDL': ['blasterjaxx']},
 {'00HIh9mVUQQAycsQiciWsh': ['magic', 'city', 'hippies']},
 {'00i2HU7TEzzftShjRrDSEF': ['2pac']},
 {'00ITtxUozN0vifE2uYvtqn': ['queen']},
 {'00LfFm08VWeZwB0Zlm24AT': ['baby', 'bash']},
 {'00lNx0OcTJrS3MKHcB80HY': ['jax', 'jones']},
 {'00NAQYOP4AmWR549nnYJZu': ['the', 'weeknd']},
 {'00OKDIsSQbx8rd5Al7fKNw': ['sabrina', 'carpenter']},
 {'00p85inzGeXRXgqDVn7Ftq': ['zotiyac']},
 {'00PLtXXER1XcTRZvs3LioS': ['the', 'brothers', 'johnson']},
 {'00ppj2gGs8oxhGnTdejkzD': ['saymyname']},
 {'00qOE7OjRl0BpYiCiweZB2': ['foreigner']},
 {'00QyLmjxaSEE8qIZQjBXBj': ['2', 'chainz']},
 {'00ReeHCY0FQUyuAUyPJdnk': ['mc', 'breed']},
 {'00S35gEf40

# IR Modelling

## **[Non Boolean Model for lyrics]**
---

In [18]:
from gensim.models import Word2Vec
import multiprocessing
from time import time
print(multiprocessing.cpu_count())

2


### Import an existing model or train a new one

In [19]:
try:
  w2v_model = Word2Vec.load('spotify_songs_en.model')
  print('Existing model detected')
except (FileNotFoundError):
  w2v_model = Word2Vec(min_count=1, window=10, size=300, workers=2, sg=1) 
  t = time()
  w2v_model.build_vocab(tok_lyric)
  print('Time elapsed: {} mins'.format(round((time() - t) / 60, 2)))
  t = time()
  w2v_model.train(tok_lyric, total_examples=w2v_model.corpus_count, epochs=10, report_delay=1)
  w2v_model.save('spotify_songs_en.model')
  print('Time elapsed: {} mins'.format(round((time() - t) / 60, 2)))

Time elapsed: 0.17 mins
Time elapsed: 10.55 mins


### Word Embeddings for lyric

In [20]:
# Get the embeddings for lyrics
def get_embeddings(tokens):
  tok_val = []
  if len(tokens) < 1:
    return np.zeros(300)
  for tok in tokens:
    if tok in w2v_model.wv.vocab:
      tok_val.append(w2v_model.wv.word_vec(tok))
    else:
      tok_val.append(np.random.rand(300))
  return np.mean(tok_val, axis=0)

### Finding song based on lyrics

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

def preprocess_input(q):
  query = q.lower()
  q_tok = nltk.tokenize.word_tokenize(query)
  q_tok = pd.Series(q_tok).apply(lambda x:expand_contractions(x))
  q_clean = []
  for i in q_tok:
    if str.isalnum(i):
      q_clean.append(i)
  
  return q_clean

def find_lyrics(lyric, doc=None):
  # Get embeddings for input
  q_vec = preprocess_input(lyric)
  q_vec = get_embeddings(q_vec)

  # Get lyric|embed
  embed_lyric = sorted.copy()
  embed = []
  for i in tok_lyric:
    embed_val = get_embeddings(i)
    embed.append(embed_val)
  embed_lyric['embed_val'] = embed
  
  # Get Cosine Similarity
  sim = []
  for i in embed_lyric['embed_val']:
    similar = cosine_similarity(q_vec.reshape(1,-1), i.reshape(1, -1))
    sim.append(similar)

  embed_lyric['similarity'] = sim 

  if doc is not None:
    new_doc = pd.DataFrame(columns=doc.columns)
    for i in doc['track_id']:
      ranked_el = embed_lyric.query("track_id == '{}'".format(str(i)))
      new_doc = new_doc.append(ranked_el)
      
    new_doc.sort_values(by='similarity', ascending=False, inplace=True)
    return new_doc.head(10).reset_index(drop=True)

  embed_lyric.sort_values(by='similarity',ascending=False, inplace=True)


  top_songs = embed_lyric.head(10).reset_index(drop=True)
  top_songs.sort_values(by='track_popularity', ascending=False, inplace=True)
  return top_songs.head(10).reset_index(drop=True)


In [22]:
find_lyrics("hey, soul sister hey mister mister stereo radio the way you move ain't fair you know ")

Unnamed: 0,track_id,track_name,track_artist,lyrics,playlist_genre,playlist_subgenre,track_popularity,embed_val,similarity
0,4HlFJV71xXKIGcU3kRyttv,"Hey, Soul Sister",Train,Heyy Heeey Heey Your lipstick stains On the fr...,pop,post-teen pop,82,"[0.03525572, -0.053815506, 0.098430775, 6.5559...",[[0.80473423]]
1,1gzIbdFnGJ226LTl0Cn2SX,White Wedding - Pt. 1,Billy Idol,"Hey little sister, what have you done? Hey lit...",rock,hard rock,68,"[0.015389303, -0.07939862, 0.075831264, -0.027...",[[0.76938564]]
2,57trQKFZdJxHia4sMJioWk,Sister Christian,Night Ranger,"Sister Christian, oh, the time has come And yo...",rock,album rock,60,"[0.05037777, -0.104029454, 0.055772122, -0.046...",[[0.77208793]]
3,4lYxJPxy3jMEADaFhtcWRV,Mister Glassman,Scotty Sire,Let's paint a picture: you're in class The tea...,pop,electropop,57,"[0.011404457, -0.033813287, 0.11119126, -0.012...",[[0.76641995]]
4,6LoSKNsYkRQbyuiReooMjG,Sister Christian,Night Ranger,"Sister Christian, oh, the time has come And yo...",rock,classic rock,55,"[0.05037777, -0.1040294, 0.055772174, -0.04634...",[[0.77208805]]
5,27FnDAhPtmDJkY1GAbYv6B,Whatchugot (Pisk Remix),Caro Emerald,"Hey Toscanini, can you make me swoon? I like t...",edm,electro house,38,"[0.056979626, 0.00333551, 0.12750477, 0.051265...",[[0.7774255]]
6,0KpfYajJVVGgQ32Dby7e9i,"Hey, Soul Sister",Train,Heyy Heeey Heey Your lipstick stains On the fr...,pop,dance pop,27,"[0.035255726, -0.05381548, 0.09843079, 6.55594...",[[0.8047341]]
7,3u18DwDsN8r88M3Tf0McAc,Hey Mister,Tujamo,Hey Mister! Hey Mister! Hey Mister! Ok! Ok! Ok...,edm,progressive electro house,23,"[-0.07528543, -0.013754112, 0.03356644, 0.0532...",[[0.883813]]
8,4zmx3vEPgRPjRT2CszUsZo,White Wedding - Part 1 - 2001- Remaster,Billy Idol,"Hey little sister, what have you done? Hey lit...",rock,classic rock,12,"[0.015389302, -0.07939862, 0.07583125, -0.0275...",[[0.76938564]]
9,5aJVMQMvdAYYkdMjNnPXLo,Mr. Banker - Single Version,Lynyrd Skynyrd,"Mister Banker Mister please, how much does mon...",rock,classic rock,0,"[0.044068508, -0.030765997, 0.05879561, 0.0150...",[[0.82245386]]


## **[Boolean Model for artists]**
---

### Checking artist name's mispelling

In [23]:
'''
Spell Checker using norvig spell checker for artist name, some problems:
Its accuracy is worrying
'''
from spellchecker import SpellChecker

def build_frequency_list_artist():
  freq = {}
  for list_a in tok_artist:
      for key,values in list_a.items():
        for value in values:
          if value not in freq.keys():
            freq[value] = 1
          else :
            freq[value] += 1

  with open('freq_list_artis.txt','w') as file:
    for key,values in freq.items():
      file.write('{} {}\n'.format(key, values))
build_frequency_list_artist()

def spell_check_artist(kalimat):
  # turn off loading a built language dictionary
  spell = SpellChecker(language=None)
  spell.word_frequency.load_text_file('freq_list_artis.txt')

  # find those words that may be misspelled
  result = []
  for word in kalimat:
    misspelled = spell.unknown([word])
    if misspelled:
      for mis_words in misspelled:
        result.append(spell.correction(mis_words))
    else:
      result.append(word)

  separator = ' '
  result = separator.join(result)
  return result

spell_check_artist(['ed', 'sheerann'])

'ed sheeran'

### Indexing artists' name

In [24]:
def create_index(token):
  hasil = {}
  for docs in token:
    i = list(docs.keys())[0]
    for j,word in enumerate(docs[i]):
      if word in hasil.keys():
        if i not in hasil[word].keys():
          hasil[word][i] = [j]  
        else:
          hasil[word][i].append(j)
      else:
        hasil[word] = {i:[j]}
  return hasil

In [25]:
index_artist = create_index(tok_artist)
index_artist

{'steady': {'004s3t0ONYlzxII9PLgU6z': [0]},
 'rollin': {'004s3t0ONYlzxII9PLgU6z': [1]},
 'bell': {'00chLpzhgVjxs1zKC9UScL': [0],
  '06yHcjr58IKJnfEK7ko3sD': [0],
  '0RnDu3eYJqbFKz6MHv2ajd': [0],
  '0dDifktDnbzmGwliFzI0Ld': [0],
  '15dAoYrYVbGB7PiSE45j1Y': [0],
  '1TsVEyryDtlexOZFXyVYxy': [0],
  '1oNHw7yf4uvsUQeTaT2Dz7': [0],
  '231mi5BkHyJHxQre25S6oS': [0],
  '2donM8sTsqYtRpSAwRdPJQ': [0],
  '3RB1O5rR8artbci5fuNK1C': [0],
  '3VzVTs2qNdmNrCI1S9iixK': [0],
  '3an1aoJKoBXJKoIfdDbVwf': [0],
  '5tKenaJVfb2YP6imRnDDrN': [0],
  '6dxOG7bXlN9wK9LXw59KWF': [1],
  '6m59VvDUi0UQsB2eZ9wVbH': [0],
  '70EYRjEmK12XiCX9E98upr': [0]},
 'biv': {'00chLpzhgVjxs1zKC9UScL': [1],
  '06yHcjr58IKJnfEK7ko3sD': [1],
  '0RnDu3eYJqbFKz6MHv2ajd': [1],
  '0dDifktDnbzmGwliFzI0Ld': [1],
  '15dAoYrYVbGB7PiSE45j1Y': [1],
  '1TsVEyryDtlexOZFXyVYxy': [1],
  '1oNHw7yf4uvsUQeTaT2Dz7': [1],
  '231mi5BkHyJHxQre25S6oS': [1],
  '2donM8sTsqYtRpSAwRdPJQ': [1],
  '3RB1O5rR8artbci5fuNK1C': [1],
  '3VzVTs2qNdmNrCI1S9iixK': [1],
  '3a

### Find song based on artist

In [26]:
def positional_intersect(prev_word,next_word):
  intersect_id = set()
  for doc_id_1 in prev_word.keys():
    if doc_id_1 in next_word.keys():
      for doc_pos in prev_word[doc_id_1]:
        if doc_pos+1 in next_word[doc_id_1]:
          intersect_id.add(doc_id_1)
  return intersect_id

def get_query(kalimat,token_dict):
  token = nltk.tokenize.word_tokenize(kalimat.lower())
  if len(token) == 1:
    hasil = token_dict[token[0]].keys()
    return hasil  
  token_hasil = token[1:]
  hasil = token_dict[token[0]].keys()
  temp_word = token[0]
  for word in token_hasil:
      hasil = hasil & positional_intersect(token_dict[temp_word],token_dict[word])
      temp_word = word
  return hasil

def retrieve_result_artist(kalimat,token=index_artist, lyrics=False):
  kalimat = spell_check_artist(nltk.tokenize.word_tokenize(kalimat))
  id_set = get_query(kalimat,token)
  artist_result = pd.DataFrame()
  artist_result = sorted[sorted['track_id'].isin(id_set)]
  artist_result = artist_result.sort_values(by='track_popularity', ascending=False)
  if lyrics:
    return artist_result
  return artist_result.head(10)

In [27]:
get_query("suicideboys",index_artist)

dict_keys(['0fyBYsrmpihh1mfalssDlB', '0kCfKToIFZFQQAaSQHcjRL', '0NurtsK6qk4qwDfxmfC2YL', '2Bkfvi3pCcwoJwnmd45OnA', '2I12vOWeJU5ayhr6ha6esf', '3bNCifwGPjqeeqzpjrFG9k', '3mRBSk7h2Vr0D0yott8VhC', '3NSgQjYihV40pOmn8AQevz', '3SCVHoGHV7GL0hmqcOIpqh', '4irYeuAi87yyGHcI4h9s0x', '4LfZGbBkFJW0KF44vNoVjU', '4owRIrDAJPqNWGf2SGC3H8', '4XQDrALwqj4J1YPqg58sDV', '4yORBk6ZyYsJpnJchyZevc', '5XAPpyIoYF3QXP34Hv8Pvx', '6oyeeA0sHwHM1pPpZaIsrD', '6yK9kwo8cBoVRNOylq0yNc', '70nmZhHZLNVYWP4NON41Zw', '7ae2GdeXS5Tvo5RvdNwgaz', '7pu5jnOlS1k4fmigtRaXvG'])

In [28]:
x = retrieve_result_artist("ed sheeran")
x

Unnamed: 0,track_id,track_name,track_artist,lyrics,playlist_genre,playlist_subgenre,track_popularity
11508,4vUmTMuQqjdnvlZmAH61Qk,South of the Border (feat. Camila Cabello & Ca...,Ed Sheeran,"She got the mmm, brown eyes, caramel thighs Lo...",pop,post-teen pop,91
8064,3HVWdVOQ0ZA45FuZGSfvns,I Don't Care (with Justin Bieber),Ed Sheeran,I'm at a party I don't wanna be at And I don't...,pop,indie poptimism,90
10213,4evmHXcjt3bTUHD1cvny97,Beautiful People (feat. Khalid),Ed Sheeran,"We are, we are, we are L.A. on a Saturday nigh...",pop,post-teen pop,89
1871,0tgVpDi06FyKpA1z0VMD4v,Perfect,Ed Sheeran,"I found a love for me Oh darling, just dive ri...",latin,latin hip hop,86
18071,7qiZfU4dY1lWllzX7mPBI3,Shape of You,Ed Sheeran,The club isn't the best place to find a lover ...,pop,dance pop,86
16549,70eFcWOvlMObDhURTqT4Fv,Beautiful People (feat. Khalid),Ed Sheeran,"We are, we are, we are L.A. on a Saturday nigh...",pop,post-teen pop,85
3334,1HNkqx9Ahdgi1Ixy2xkKkL,Photograph,Ed Sheeran,"Loving can hurt, loving can hurt sometimes But...",pop,post-teen pop,84
965,0hVXuCcriWRGvwMV1r5Yn9,I Don't Care (with Justin Bieber),Ed Sheeran,I'm at a party I don't wanna be at And I don't...,pop,post-teen pop,84
7236,34gCuhDGsG4bRPIf9bb02f,Thinking out Loud,Ed Sheeran,When your legs don't work like they used to be...,latin,latin pop,83
379,0afhq8XCExXpqazXczTSve,Galway Girl,Ed Sheeran,She played the fiddle in an Irish band But she...,pop,electropop,80


## **[Boolean Model for song title]**

In [29]:
'''
What to do:
1. Tokenization
2. Indexing
3. Spell correction with jarrad coeff and edit distance
4. Creating a boolean model -> Using position of index to search certain query
'''


'\nWhat to do:\n1. Tokenization\n2. Indexing\n3. Spell correction with jarrad coeff and edit distance\n4. Creating a boolean model -> Using position of index to search certain query\n'

### Indexing song title

In [30]:
def indexer(list_of_lists):
  #for making a index with the type of dictionary, made for title
  index = dict()
  for i in range(len(list_of_lists)):
    title = list_of_lists[i]
    for no in range(len(title)):
      token = title[no]
      if token not in index.keys():
        temp = dict()
        temp[i] = [no] 
        index[token] = temp
      else:
        if i in index[token].keys():
          index[token][i] += [no]
        else:
          index[token][i] = [no]
  return index

In [31]:
title_index = indexer(tok_title)  

### Spelling correction for song title

In [32]:
!pip install autocorrect

Collecting autocorrect
[?25l  Downloading https://files.pythonhosted.org/packages/35/31/aa5d4b54baafed2d0eef47e30d527ea60eb7357f11c3b5adc58262a3c693/autocorrect-2.5.0.tar.gz (622kB)
[K     |▌                               | 10kB 20.6MB/s eta 0:00:01[K     |█                               | 20kB 26.9MB/s eta 0:00:01[K     |█▋                              | 30kB 24.6MB/s eta 0:00:01[K     |██                              | 40kB 27.7MB/s eta 0:00:01[K     |██▋                             | 51kB 26.5MB/s eta 0:00:01[K     |███▏                            | 61kB 28.8MB/s eta 0:00:01[K     |███▊                            | 71kB 18.9MB/s eta 0:00:01[K     |████▏                           | 81kB 19.7MB/s eta 0:00:01[K     |████▊                           | 92kB 18.8MB/s eta 0:00:01[K     |█████▎                          | 102kB 18.8MB/s eta 0:00:01[K     |█████▉                          | 112kB 18.8MB/s eta 0:00:01[K     |██████▎                         | 122kB 18.8MB

In [33]:
from autocorrect import Speller
spell = Speller(lang='en')

def preprocess_input_with_spelling(q):
  query = q.lower()
  q_tok = nltk.tokenize.word_tokenize(query)
  q_tok = pd.Series(q_tok).apply(lambda x:expand_contractions(x))
  q_clean = []
  for i in q_tok:
    if str.isalnum(i):
      autocorrect = spell(i)
      q_clean.append(autocorrect)
  
  return q_clean

### Find the intersection between query and title index

In [34]:
def find_intersection(query, index):
  keys = index.keys()
  q_clean = preprocess_input_with_spelling(query)
  set_of_indexes = []
  answer_indexes = []
  for token in q_clean:
    if token in keys:
      set_of_indexes.append(set(list(index[token])))

  if len(set_of_indexes) > 0:
    answer_indexes = set_of_indexes[0]
    for ind_set in set_of_indexes:
      answer_indexes = answer_indexes.intersection(ind_set)
  
  answer_indexes = list(answer_indexes)
  return answer_indexes[:10]

### Find song based on title

In [35]:
def find_title(query, title_index=title_index, data=sorted):
  indx = find_intersection(query, title_index)
  answer = pd.DataFrame()
  if indx == []:
    answer = "Not Found"
    print(answer)
    return 0
  answer = sorted.iloc[indx]
  answer = answer.sort_values(by='track_popularity', ascending=False)                 
  return answer

In [36]:
find_title("firefly", title_index, sorted)

Not Found


0

In [37]:
find_title("!!!!!", title_index, sorted)

Not Found


0

In [38]:
len(sorted)

15405

## Input combination

### Arist & Title

In [39]:
def retrieve_artist_title(q_artist, q_title):
  artist_filter = retrieve_result_artist(q_artist, lyrics=True)
  title_filter = find_title(q_title)
  #print(artist_filter)
  #print(title_filter)
  #merge results of the 2 querys 
  full_filter = pd.merge(artist_filter, title_filter, how='right', on=['track_artist', "track_name"])
  full_filter = full_filter.iloc[:, 0:7].dropna()
  full_filter.columns = artist_filter.columns
  return full_filter

In [40]:
retrieve_artist_title("ed sheeran", "shape of you")

Unnamed: 0,track_id,track_name,track_artist,lyrics,playlist_genre,playlist_subgenre,track_popularity
0,7qiZfU4dY1lWllzX7mPBI3,Shape of You,Ed Sheeran,The club isn't the best place to find a lover ...,pop,dance pop,86.0
1,0FE9t6xYkqWXU2ahLh6D8X,Shape of You,Ed Sheeran,The club isn't the best place to find a lover ...,latin,tropical,75.0
2,7qiZfU4dY1lWllzX7mPBI3,Shape of You,Ed Sheeran,The club isn't the best place to find a lover ...,pop,dance pop,86.0
3,0FE9t6xYkqWXU2ahLh6D8X,Shape of You,Ed Sheeran,The club isn't the best place to find a lover ...,latin,tropical,75.0
4,5H7CwzYZ60e7w69tX4ivQN,Shape of You - Galantis Remix,Ed Sheeran,The club isn't the best place to find a lover ...,edm,pop edm,51.0


### Artist & Lyric

In [41]:
# Jika artist and lyrics
def retrieve_artist_lyrics(q_artist, q_lyrics):
  artist_filter = retrieve_result_artist(q_artist, lyrics=True)
  full_filter = find_lyrics(q_lyrics, artist_filter)
  return full_filter

In [42]:
retrieve_artist_lyrics('linkin park', "Breaking the habit tonight")

Unnamed: 0,track_id,track_name,track_artist,lyrics,playlist_genre,playlist_subgenre,track_popularity,embed_val,similarity
0,6n8TMVyFKoUmDc4apxceRD,Breaking the Habit,Linkin Park,"Memories consume, like opening the wound I'm p...",rock,album rock,65,"[0.08552809, -0.018229255, 0.113975, -0.005854...",[[0.7039161]]
1,7oVEtyuv9NBmnytsCIsY5I,BURN IT DOWN,Linkin Park,The cycle repeated As explosions broke in the ...,pop,electropop,73,"[0.056871943, -0.03366629, 0.13719541, 0.04056...",[[0.6442048]]
2,60a0Rd6pjrkxjPbaKzXjfq,In the End,Linkin Park,"It starts with one One thing, I don't know why...",rock,permanent wave,83,"[0.06559538, -0.020554647, 0.15176237, -0.0402...",[[0.6417858]]
3,2nLtzopw4rPReszdYBJU6h,Numb,Linkin Park,I'm tired of being what you want me to be Feel...,rock,permanent wave,81,"[0.055891998, -0.06043786, 0.13149533, 0.04966...",[[0.63986975]]
4,0UFDKFqW2oGspYeYqo9wjA,Bleed It Out,Linkin Park,"Yeah, here we go for the hundredth time Hand g...",rock,album rock,72,"[0.07471897, -0.08247591, 0.12070294, 0.031815...",[[0.6364451]]
5,1fLlRApgzxWweF1JTf8yM5,Given Up,Linkin Park,Waking in sweat again Another day's been laid ...,rock,hard rock,67,"[0.06638073, -0.08678293, 0.07281739, -0.01939...",[[0.63526183]]
6,0WrZxwjIBUL0VNN0AAblBQ,Given Up,Linkin Park,Waking in sweat again Another day's been laid ...,rock,hard rock,37,"[0.06638073, -0.08678293, 0.07281739, -0.01939...",[[0.63526183]]
7,3aYBjxTMvrEOP0A0UXg9ER,Final Masquerade,Linkin Park,Tearing me apart with the words you wouldn't s...,pop,electropop,63,"[0.02077694, -0.050110202, 0.09197483, 0.01649...",[[0.63324594]]
8,3fjmSxt0PskST13CSdBUFx,Somewhere I Belong,Linkin Park,(When this began) I had nothing to say And I'd...,rock,album rock,69,"[0.058633216, -0.06251986, 0.10885001, 0.01212...",[[0.6326205]]
9,1d5UuboIPRMD4HaU3yycKC,Somewhere I Belong,Linkin Park,(When this began) I had nothing to say And I'd...,rock,hard rock,56,"[0.058633216, -0.06251986, 0.10885001, 0.01212...",[[0.6326205]]


### Title Lyric

In [43]:
def find_title_lyrics(q_title, q_lyrics):
  title_idx = indexer(tok_title)
  title_doc = find_title(q_title, title_idx, sorted)
  full_doc = find_lyrics(q_lyrics, title_doc)
  return full_doc

In [44]:
find_title_lyrics('in the endd', "in the end it doesn't even matter")

Unnamed: 0,track_id,track_name,track_artist,lyrics,playlist_genre,playlist_subgenre,track_popularity,embed_val,similarity
0,60a0Rd6pjrkxjPbaKzXjfq,In the End,Linkin Park,"It starts with one One thing, I don't know why...",rock,permanent wave,83,"[0.06559538, -0.020554647, 0.15176237, -0.0402...",[[0.85805565]]
1,2vyACbL11HoiWTcA6Ri7wU,In the End (Lost Tapes),Röyksopp,"""The boy's a genius"" You'll hear them say ""Oh ...",pop,electropop,45,"[0.047967218, -0.077143535, 0.12416375, 0.0041...",[[0.82758236]]
2,5Ztwhn7zR0q7wRe2ZZsrmN,The Loser In The End - Remastered 2011,Queen,Mama's gotta problem She don't know what to sa...,rock,album rock,33,"[0.06960667, -0.08874056, 0.10755766, 0.007156...",[[0.8063226]]
3,1faolBLC36cigtao56kZHM,In the End,The Cranberries,Ain't it strange When everything you wanted Wa...,rock,album rock,43,"[0.0326231, -0.035967536, 0.15322703, 0.032433...",[[0.7773652]]


# **[Evaluation]**

Test case dan perhitungan dilakukan di dokumen dokumentasi program

# Misc

In [45]:
def out_beautifier(doc):
    if isinstance(doc, int):
        return
    out = doc[['track_name', 'track_artist', 'lyrics', 'track_popularity','playlist_genre']]
    display(out)

# **[Main Program]**

In [46]:
def main():
  print('''Hello I am Music Checker(temp)\n 
  ''')
  while True:
    print('What do you want to do?\n1.Search\n2.Quit')
    try:
      opening = int(input())
      if opening == 2:
        print('Goodbye!')
        return 
      elif opening != 1:
        continue
    except:
      continue
      
    print("Please input the title of the song. (You can leave it blank if you don't know)")
    in_title = str(input())
    print("Please input the artist of the song. (You can leave it blank if you don't know)")
    in_artist = str(input())
    print("Please input part of the lyrics of the song. (You can leave it blank if you don't know)")
    in_lyrics = str(input()) 

    if in_title == '' and in_artist == '' and in_lyrics == '': # All blank
      print("You can't have it all blank")
      continue
    if in_title != '' and in_artist != '': #Artist & Title
      # call search in_title & in_artist
      out_beautifier(retrieve_artist_title(in_artist, in_title))
      continue
    if in_title != '' and in_artist == '' and in_lyrics != '': # Title & Lyric
      # call search in_title & in_lyrics
      out_beautifier(find_title_lyrics(in_title, in_lyrics))
      continue
    if in_title == '' and in_artist != '' and in_lyrics != '': # Artist & Lyric
      # call search in_artist & in_lyrics
      out_beautifier(retrieve_artist_lyrics(in_artist, in_lyrics))
      continue
    if in_title != '' and in_artist == '' and in_lyrics == '': # Title
      # Call search in_title
      out_beautifier(find_title(in_title))
      continue
    if in_title == '' and in_artist != '' and in_lyrics == '': # Artist
      out_beautifier(retrieve_result_artist(in_artist))
      # call search in_artist
      continue
    if in_title == '' and in_artist == '' and in_lyrics != '': # Lyric
      # call search in_lyrics
      out_beautifier(find_lyrics(in_lyrics))
      continue

# Test case

In [54]:
main() #Checking title only

Hello I am Music Checker(temp)
 
  
What do you want to do?
1.Search
2.Quit
2
Goodbye!


In [48]:
main() #Artist only

Hello I am Music Checker(temp)
 
  
What do you want to do?
1.Search
2.Quit
1
Please input the title of the song. (You can leave it blank if you don't know)

Please input the artist of the song. (You can leave it blank if you don't know)
taylor
Please input part of the lyrics of the song. (You can leave it blank if you don't know)



Unnamed: 0,track_name,track_artist,lyrics,track_popularity,playlist_genre
15892,You Need To Calm Down,Taylor Swift,You are somebody that I don't know But you're ...,86,pop
8078,Lover (Remix) [feat. Shawn Mendes],Taylor Swift,We could leave the Christmas lights up 'til Ja...,85,pop
9582,I Forgot That You Existed,Taylor Swift,How many days did I spend thinking 'Bout how y...,79,r&b
4337,Blank Space,Taylor Swift,"Nice to meet you, where you been? I could show...",78,pop
14885,I Knew You Were Trouble.,Taylor Swift,"NA Once upon a time, a few mistakes ago I was ...",76,pop
2062,We Are Never Ever Getting Back Together,Taylor Swift,I remember when we broke up the first time Say...,74,pop
3468,Look What You Made Me Do,Taylor Swift,I don't like your little games Don't like your...,74,pop
1672,How You Want It?,Teyana Taylor,Tell me what you want (Oh) Tell me what you wa...,71,r&b
13562,Delicate,Taylor Swift,This ain't for the best My reputation's never ...,71,edm
8503,Gonna Love Me,Teyana Taylor,"And oh, you're gonna love me You're gonna wann...",70,r&b


What do you want to do?
1.Search
2.Quit
2
Goodbye!


In [49]:
main() #Lirik only

Hello I am Music Checker(temp)
 
  
What do you want to do?
1.Search
2.Quit
1
Please input the title of the song. (You can leave it blank if you don't know)

Please input the artist of the song. (You can leave it blank if you don't know)

Please input part of the lyrics of the song. (You can leave it blank if you don't know)
bad blood


Unnamed: 0,track_name,track_artist,lyrics,track_popularity,playlist_genre
0,Bad Boy,Miami Sound Machine,"Bad boy, bad boy Bad boy, bad boy Boys will be...",42,rock
1,Blood,Thepoolboi,Sit here and looking for every second I'm maki...,40,rap
2,Give Blood,Pete Townshend,(Give blood) But you may find that blood is no...,35,rock
3,Bad Medicine,Bon Jovi,Your love is like bad medicine Bad medicine is...,2,rock
4,Blood,In This Moment,I hate you for the sacrifices you made for me ...,1,rock
5,Bomb A Drop,Garmiani,Garmiani! Like a one man army! Run it! Bad man...,1,edm
6,BAMF,Pegboard Nerds,"Bad motherf-, bad motherf- Bad motherf-, bad m...",0,edm
7,Bad Blood,Neil Sedaka,It could've been me But it was you Who went an...,0,rock
8,It's Okay (One Blood),The Game,"NA Dre, I see dead people Modern vampires of t...",0,rap
9,Bad Blood - Alternate Mix,Ministry,Wild skies Full moon and thoughts collide We l...,0,rap


What do you want to do?
1.Search
2.Quit
2
Goodbye!


In [50]:
main() #Lirik & Artist

Hello I am Music Checker(temp)
 
  
What do you want to do?
1.Search
2.Quit
1
Please input the title of the song. (You can leave it blank if you don't know)
blank space
Please input the artist of the song. (You can leave it blank if you don't know)
taylor
Please input part of the lyrics of the song. (You can leave it blank if you don't know)



Unnamed: 0,track_name,track_artist,lyrics,track_popularity,playlist_genre
0,Blank Space,Taylor Swift,"Nice to meet you, where you been? I could show...",78,pop


What do you want to do?
1.Search
2.Quit
2
Goodbye!


In [51]:
main() #Lirik & Title

Hello I am Music Checker(temp)
 
  
What do you want to do?
1.Search
2.Quit
1
Please input the title of the song. (You can leave it blank if you don't know)
in the end
Please input the artist of the song. (You can leave it blank if you don't know)

Please input part of the lyrics of the song. (You can leave it blank if you don't know)
even matter


Unnamed: 0,track_name,track_artist,lyrics,track_popularity,playlist_genre
0,In the End,Linkin Park,"It starts with one One thing, I don't know why...",83,rock
1,The Loser In The End - Remastered 2011,Queen,Mama's gotta problem She don't know what to sa...,33,rock
2,In the End (Lost Tapes),Röyksopp,"""The boy's a genius"" You'll hear them say ""Oh ...",45,pop
3,In the End,The Cranberries,Ain't it strange When everything you wanted Wa...,43,rock


What do you want to do?
1.Search
2.Quit
What do you want to do?
1.Search
2.Quit
2
Goodbye!


In [52]:
main() #Title & Artist

Hello I am Music Checker(temp)
 
  
What do you want to do?
1.Search
2.Quit
2
Goodbye!


In [53]:
main() #Walkthrough

Hello I am Music Checker(temp)
 
  
What do you want to do?
1.Search
2.Quit
2
Goodbye!
