In [1]:
# This is an implementation of TF-IDF based on the following link
# https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76
# If you failed to access the web page, you might access the pdf file in the following link
# https://drive.google.com/file/d/1J7J6p8hZ3lDmncxq1f6HHNstaGbtREHj/view?usp=sharing

import pandas as pd
import numpy as np
import re

In [2]:
# Simple documents
# Change the content, so you can have some intuition about TF-IDF

documentA = 'The man went out for a walk'
documentB = 'the children sat around the fire'

# split each document
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')

print(bagOfWordsA)
print(bagOfWordsB)

['The', 'man', 'went', 'out', 'for', 'a', 'walk']
['the', 'children', 'sat', 'around', 'the', 'fire']


In [3]:
# Find the uniue set of words
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))
print(uniqueWords)

{'out', 'around', 'for', 'a', 'sat', 'fire', 'the', 'walk', 'man', 'children', 'went', 'The'}


In [4]:
# create dictionary for each document and calculate the word frequency in each document

numOfWordsA = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsA:
    numOfWordsA[word] += 1
print(numOfWordsA)

numOfWordsB = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsB:
    numOfWordsB[word] += 1
print(numOfWordsB)

{'out': 1, 'around': 0, 'for': 1, 'a': 1, 'sat': 0, 'fire': 0, 'the': 0, 'walk': 1, 'man': 1, 'children': 0, 'went': 1, 'The': 1}
{'out': 0, 'around': 1, 'for': 0, 'a': 0, 'sat': 1, 'fire': 1, 'the': 2, 'walk': 0, 'man': 0, 'children': 1, 'went': 0, 'The': 0}


In [5]:
#importing stopword
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
#stopwords.words('english')
stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hadiyan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


<WordListCorpusReader in 'C:\\Users\\Hadiyan\\AppData\\Roaming\\nltk_data\\corpora\\stopwords'>

In [6]:
# Compute the Term Frequency
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)

print(tfA)
print(tfB)

{'out': 0.14285714285714285, 'around': 0.0, 'for': 0.14285714285714285, 'a': 0.14285714285714285, 'sat': 0.0, 'fire': 0.0, 'the': 0.0, 'walk': 0.14285714285714285, 'man': 0.14285714285714285, 'children': 0.0, 'went': 0.14285714285714285, 'The': 0.14285714285714285}
{'out': 0.0, 'around': 0.16666666666666666, 'for': 0.0, 'a': 0.0, 'sat': 0.16666666666666666, 'fire': 0.16666666666666666, 'the': 0.3333333333333333, 'walk': 0.0, 'man': 0.0, 'children': 0.16666666666666666, 'went': 0.0, 'The': 0.0}


In [7]:
# Compute the inverse document frequency
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

idfs = computeIDF([numOfWordsA, numOfWordsB])
print(idfs)

{'out': 0.6931471805599453, 'around': 0.6931471805599453, 'for': 0.6931471805599453, 'a': 0.6931471805599453, 'sat': 0.6931471805599453, 'fire': 0.6931471805599453, 'the': 0.6931471805599453, 'walk': 0.6931471805599453, 'man': 0.6931471805599453, 'children': 0.6931471805599453, 'went': 0.6931471805599453, 'The': 0.6931471805599453}


In [8]:
# Compute the TFxIDF
def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
df = pd.DataFrame([tfidfA, tfidfB])

df

Unnamed: 0,out,around,for,a,sat,fire,the,walk,man,children,went,The
0,0.099021,0.0,0.099021,0.099021,0.0,0.0,0.0,0.099021,0.099021,0.0,0.099021,0.099021
1,0.0,0.115525,0.0,0.0,0.115525,0.115525,0.231049,0.0,0.0,0.115525,0.0,0.0


In [9]:
# Obtaining TF-IDF using sklearn library
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
df

Unnamed: 0,around,children,fire,for,man,out,sat,the,walk,went
0,0.0,0.0,0.0,0.42616,0.42616,0.42616,0.0,0.303216,0.42616,0.42616
1,0.407401,0.407401,0.407401,0.0,0.0,0.0,0.407401,0.579739,0.0,0.0


In [40]:
# Tugas Besar 1 Implementasi Content Based Filtering dengan TF-IDF.
# a. Copy lirik "Reff" dari lagu berbahasa Indonesia (masing-masing 3 lagu), 
# b. Paste ke link berikut (Pastikan 1 lagu 1 cell lirik_ref): 
# https://docs.google.com/spreadsheets/d/1j98gjzwL-88GPiTGkRgEWhWWcKryvNAdlaDR_oBWgpY/edit?usp=sharing
# c. Cari informasi mengenai k-NN, dan yang diimplementasikan pada TF-IDF pada dataset di atas
# d. Buat satu prosedur yang menerima ID dari lagu, dan kembalikan 5 ID most-similar items

In [10]:
# Load data from google drive
from google.colab import drive
import os

drive.mount('/content/drive')
os.chdir("/content/drive/My Drive/Colab Notebooks/recsys") # me
fileNames = os.listdir()
song_csv = r'DatasetLaguIF_GAB - IF-GAB-03.csv'
song_df = pd.read_csv(song_csv)
song_df

ModuleNotFoundError: No module named 'google'

In [50]:
# Get Data
songs_reff = song_df
reff_column_name = 'Bait_Reff'
# songs_reff = pd.DataFrame(songs_reff_list)

In [51]:
# Preprocessing reff
def preprocess_reff(reff):
  reff = re.sub(r"\n", " ", reff) # remove newline change it to space
  reff = re.sub(r"\r", " ", reff) # remove newline change it to space
  reff = re.sub(r"\r\n", " ", reff) # remove newline change it to space
  reff = re.sub(r'[^A-Za-z0-9 ]+', '', reff)
  return reff

songs_reff = songs_reff.dropna(axis=0, how="any")
songs_reff[reff_column_name].apply(preprocess_reff)
songs_reff

Unnamed: 0,ID_Lagu,Judul_Lagu,Bait_Reff,NIM,Nama
0,1,biarlah - nidji,Biarlah kurela Melepasmu Meninggalkan aku Beri...,1301152728,Dzaka triadi mahdiyah
1,2,haruskah ku mati - ada band,Haruskah kumati karenamu? Terkubur dalam kesed...,1301152728,Dzaka triadi mahdiyah
2,3,bintang di surga - ungu,Bagai bintang di surga Dan seluruh warna Dan k...,1301152728,Dzaka triadi mahdiyah
3,4,glenn fredly - kasih putih,Kucurahkan isi jiwaku\nHanyutkan daku dalam ai...,1301162750,Al Zira Pramitha
4,5,tangga - hebat,Ooh Kau membuat ku merasa hebat\nKarena ketulu...,1301162750,Al Zira Pramitha
...,...,...,...,...,...
122,123,Desember - Efek Rumah Kaca,Aku selalu suka sehabis hujan dibulan desember...,1301198524,Muhammad Muttabi Hudaya
123,124,Sebelah Mata - Efek Rumah Kaca,Sebelah mataku yang mempelajari Gelombang kan ...,1301198524,Muhammad Muttabi Hudaya
124,125,Selamanya Cinta - D'cinnamons,Andaikan kudapat\r\nMengungkapkan\r\nPerasaank...,1301174051,Aqmal insan cendekia
125,126,Puisi - jikustik,Kapan lagi kutulis untukmu\r\nTulisan-tulisan ...,1301174051,Aqmal insan cendekia


In [52]:
# TF IDF song reff
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(songs_reff[reff_column_name].to_numpy())
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
tfidf_songs_reff = pd.DataFrame(denselist, columns=feature_names)

tfidf_songs_reff

Unnamed: 0,abadi,abaikan,about,ada,adakah,adalah,adanya,adil,adzim,agar,ah,air,ajaibnya,ajarkan,akan,akhir,akhirat,akhirnya,akhlak,akhlakmu,aku,akulah,akut,alami,alasan,all,alunan,always,amanat,ambil,amin,anarki,and,anda,andai,andaikan,andalkan,anggap,antara,anugrah,...,tubuhku,tuhan,tuk,tulisan,tumbuh,tunggu,tunggulah,turun,tutur,uang,ucap,udara,umurku,untuk,untukku,untukmu,untuknya,urus,usahaku,usai,vespa,wahai,waktu,walau,walaupun,warna,warnai,was,when,will,with,wujudkan,ya,yakin,yakinkan,yakinlah,yang,yeah,you,zero
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.295675,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.215823,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.251785,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.31267,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121915,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.189106,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.135869,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
121,0.0,0.0,0.0,0.0,0.0,0.181072,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.247889,0.247889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.247889,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.178072,0.0,0.0,0.0
122,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.161671,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.292724,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
123,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.346289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.426893,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.173145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062190,0.0,0.0,0.0


In [53]:
# Concat song id and tfidf results
tf_idf_with_id =  tfidf_songs_reff.copy()
tf_idf_with_id.insert(loc=0, column='ID_Lagu', value=songs_reff['ID_Lagu'].values)
tf_idf_with_id

Unnamed: 0,ID_Lagu,abadi,abaikan,about,ada,adakah,adalah,adanya,adil,adzim,agar,ah,air,ajaibnya,ajarkan,akan,akhir,akhirat,akhirnya,akhlak,akhlakmu,aku,akulah,akut,alami,alasan,all,alunan,always,amanat,ambil,amin,anarki,and,anda,andai,andaikan,andalkan,anggap,antara,...,tubuhku,tuhan,tuk,tulisan,tumbuh,tunggu,tunggulah,turun,tutur,uang,ucap,udara,umurku,untuk,untukku,untukmu,untuknya,urus,usahaku,usai,vespa,wahai,waktu,walau,walaupun,warna,warnai,was,when,will,with,wujudkan,ya,yakin,yakinkan,yakinlah,yang,yeah,you,zero
0,1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.295675,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.215823,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.251785,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.31267,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121915,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.189106,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,123,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.135869,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
121,124,0.0,0.0,0.0,0.0,0.0,0.181072,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.247889,0.247889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.247889,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.178072,0.0,0.0,0.0
122,125,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.161671,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.292724,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
123,126,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.346289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.426893,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.173145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062190,0.0,0.0,0.0


In [54]:
# Train K-NN using TF-IDF Result
from sklearn.neighbors import NearestNeighbors
k = 5
nn = NearestNeighbors(algorithm='auto', n_neighbors=k)
nn.fit(tfidf_songs_reff)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [55]:
# Procedure to get 5 nearest neigbors or in this case is the similar valud of TFIDF
# Buat satu prosedur yang menerima ID dari lagu, dan kembalikan 5 ID most-similar items
def get_similar_items_from_nearest_neigbors_results(data, id, nearest_neigbors, total_similar_items=5, column_id_name="ID_Lagu"):
  idx_selected_id = data.index[data[column_id_name] == id].tolist()
  if len(idx_selected_id) > 0:
    values = data.drop(column_id_name, 1)
    distances, indexes = nearest_neigbors.kneighbors([values.iloc[idx_selected_id[0]]], total_similar_items + 1, return_distance=True)
    indexes = indexes[0][1:] # Remove itself

    print(distances)
    print(indexes)

    if indexes is not None:
      similar_items = []
      for point_idx in indexes:
        similar_items.append(data.iloc[point_idx][column_id_name])
      return np.array(similar_items).astype(int)
    else:
      print("Can't show any similar items")
      return None
  else: 
    print("Song not found")
    return None

In [56]:
# Show 5 similar items
id = 104
similar_items = get_similar_items_from_nearest_neigbors_results(tf_idf_with_id, id, nn)

print("Similar Items")
songs_reff[songs_reff['ID_Lagu'].isin(similar_items)]

[[0.         0.69867281 1.30952531 1.33687066 1.34018999 1.34133461]]
[  3 119  48 107  32]
Similar Items


Unnamed: 0,ID_Lagu,Judul_Lagu,Bait_Reff,NIM,Nama
3,4,glenn fredly - kasih putih,Kucurahkan isi jiwaku\nHanyutkan daku dalam ai...,1301162750,Al Zira Pramitha
33,34,Still Virgin - Dear ndut,Coba genggamlah tanganku dan biarkanlah diriku...,1301164496,Hafizh Fairussufi
50,51,Ardhito Pramono - Bila,"Bila saja dikau bisa berbicara, oh\nIzinkan da...",1301170353,Attala Rafid Abelard
109,110,Separuh Nafasku,Kau hancurkan diriku... Bila kau tinggalkan ak...,1301198512,Revi Chandra
121,122,Cinta - Vina Panduwinata,"Dalam mimpimu, di langkahmu serta hidupmu Geng...",1301198524,Muhammad Muttabi Hudaya


In [57]:
print("Selected Items")
songs_reff[songs_reff['ID_Lagu'].isin([id])]

Selected Items


Unnamed: 0,ID_Lagu,Judul_Lagu,Bait_Reff,NIM,Nama
103,104,Kasih Putih - Glenn Fredly,Biarkanlah kurasakan\nHangatnya sentuhan kasih...,1301198508,Novia Rinanti Robynson
