In [None]:
import json
import sys

import nltk
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [39]:
def load_data(path="turkish_song_lyrics.csv"):
    """Read the song/lyrics table from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to ``turkish_song_lyrics.csv`` in
        the current working directory, which is what the function always
        read before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        The parsed file, exactly as ``pandas.read_csv`` returns it.
    """
    return pd.read_csv(path)

In [None]:
def preprocess_data(df, stem=True):
    """Normalise the ``lyrics`` column of ``df`` and return the frame.

    The lyrics are lower-cased, punctuation and newlines are replaced by
    spaces, and (optionally) every token is reduced to its Porter stem.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``lyrics`` column. The column is overwritten in place.
    stem : bool, optional
        When true (the default) each lyric is tokenised and stemmed through
        ``tokenization``. Pass ``False`` to keep the cleaned text as is.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so both
        ``df = preprocess_data(df)`` and a bare ``preprocess_data(df)`` work.
    """
    lyrics = (
        df['lyrics']
        # Missing lyrics would otherwise break tokenising and vectorising.
        .fillna('')
        .str.lower()
        # Replace everything that is neither a word character nor whitespace.
        # The previous Series.replace(r'^\w\s', ' ') call had no regex=True,
        # so it only matched cells equal to that literal string (a no-op).
        .str.replace(r'[^\w\s]', ' ', regex=True)
        .str.replace(r'\n', ' ', regex=True)
    )
    if stem:
        # One stemmer instance is shared by every row.
        stemmer = PorterStemmer()
        lyrics = lyrics.apply(lambda txt: tokenization(txt, stemmer))
    df['lyrics'] = lyrics
    return df


def tokenization(txt, stemmer=None):
    """Tokenise ``txt`` with NLTK and join the stemmed tokens with spaces.

    Parameters
    ----------
    txt : str
        Text to tokenise.
    stemmer : object, optional
        Anything with a ``stem(word)`` method. A fresh ``PorterStemmer`` is
        created when omitted, so the function no longer depends on a
        ``stemmer`` variable existing in some outer scope.

    Returns
    -------
    str
        The stemmed tokens separated by single spaces.
    """
    if stemmer is None:
        stemmer = PorterStemmer()
    # NOTE: nltk.word_tokenize needs NLTK's tokenizer data to be installed.
    tokens = nltk.word_tokenize(txt)
    return " ".join(stemmer.stem(w) for w in tokens)

In [None]:
def calculate_similarity(df):
    """Return the pairwise cosine similarity of the TF-IDF vectors of ``df['lyrics']``.

    Each lyric is vectorised word by word with ``TfidfVectorizer`` and every
    row is compared against every other row.

    NOTE(review): the English stop-word list is applied although the data
    file name suggests Turkish lyrics; confirm that this is intended.
    """
    vectorizer = TfidfVectorizer(stop_words='english', analyzer='word')
    return cosine_similarity(vectorizer.fit_transform(df['lyrics']))

In [None]:
def recommendation(selected_song, df, similarity, top_n=20):
    """Return the titles of the songs most similar to ``selected_song``.

    Parameters
    ----------
    selected_song : str
        Title to look up in ``df['song']``. The first matching row is used.
    df : pandas.DataFrame
        Frame with a ``song`` column; row ``i`` must correspond to row ``i``
        of ``similarity``.
    similarity : sequence of sequences of float
        Square similarity matrix, indexed by row position.
    top_n : int, optional
        Maximum number of titles to return (default 20).

    Returns
    -------
    list
        Up to ``top_n`` titles ordered from most to least similar. The
        selected song itself is never included.

    Raises
    ------
    IndexError
        If ``selected_song`` does not occur in ``df['song']``.
    """
    # Work with row positions, not index labels: the similarity matrix is
    # positional, so labels only line up on a default RangeIndex.
    hits = (df['song'] == selected_song).to_numpy().nonzero()[0]
    if len(hits) == 0:
        raise IndexError(
            "song {!r} not found in df['song']".format(selected_song)
        )
    pos = int(hits[0])

    ranked = sorted(
        enumerate(similarity[pos]), key=lambda pair: pair[1], reverse=True
    )
    # Drop the query row explicitly. Skipping the top-ranked entry is not
    # enough when another row ties with it at the maximum similarity.
    neighbours = [i for i, _ in ranked if i != pos][:top_n]
    return df['song'].iloc[neighbours].tolist()

In [None]:
if __name__ == "__main__":
    # Command-line entry point: the first argument is the song title, and the
    # recommended titles are printed to stdout as a JSON array.
    # NOTE(review): inside a Jupyter kernel sys.argv usually holds the
    # kernel's own launch arguments rather than a song title; confirm how
    # this cell is meant to be run.
    if len(sys.argv) < 2:
        sys.exit("usage: {} <song title>".format(sys.argv[0]))
    selected_song = sys.argv[1]

    df = load_data()
    df = preprocess_data(df)
    similarity_matrix = calculate_similarity(df)
    recommendations = recommendation(selected_song, df, similarity_matrix)
    print(json.dumps(recommendations))
