In [1]:
import pandas as pd
import sqlite3 as sql
import numpy as np
from heapq import nlargest
import random
from sklearn.model_selection import train_test_split as tts
import math   

In [2]:
# require data: track_metadata.db, mxm_dataset.db, lastfm_tags.db, songs.csv

conn = sql.connect("dat/track_metadata.db")
c = conn.cursor()
c.execute("attach 'dat/mxm_dataset.db' AS lyr;")
c.execute("attach 'dat/lastfm_tags.db' AS tag;") 


<sqlite3.Cursor at 0x1a1a696340>

In [3]:
for row in c.execute("SELECT name FROM main.sqlite_master WHERE type='table';"):
    print(row)
for row in c.execute("SELECT name FROM lyr.sqlite_master WHERE type='table';"):
    print(row)
for row in c.execute("SELECT name FROM tag.sqlite_master WHERE type='table';"):
    print(row)

('songs',)
('words',)
('lyrics',)
('tags',)
('tids',)
('tid_tag',)


In [4]:
songs = pd.read_csv("dat/songs.csv")[['CT', 'track_id', 'song_name_cleaned', 'duration', 'artist_familiarity', 'year', 'genre_merged']]
songs["genre_merged"] = songs["genre_merged"].str.replace("rap","hiphop")
songs = songs.drop_duplicates(["track_id","genre_merged"])

In [5]:
# songs = songs.merge(songs.groupby("track_id")["genre_merged"].agg(["count"]).reset_index(),on="track_id")

In [6]:
# combo = set()
# for i, id in enumerate(songs["track_id"].drop_duplicates()):
#     temp_tag = tuple(sorted(list(songs[songs["track_id"] == id].genre_merged)))
#     combo.add(temp_tag)
# len(combo) = 676

In [7]:
len(songs.track_id.drop_duplicates())

21167

In [8]:
songcount = songs.groupby("genre_merged")["track_id"].agg(["count"]).reset_index()

In [9]:
print(list(songcount.sort_values("count")['genre_merged']))

['blues', 'jazz', 'country', 'classic rock', 'electronic', 'folk', 'soul', 'punk', 'metal', 'indie', 'hiphop', 'pop', 'rock']


In [10]:
# outer tag 
map1 = {'rock':'rock',
 'indie':'indie',
 'folk':'folk',
 'electronic':'electronic',
 'classic rock':'rock',
 'country':'rock',
 'blues':'blues',
 'metal':'rock',
 'jazz':'blues',
 'pop':'pop',
 'punk':'rock',
 'hiphop':'hiphop',
 'soul':'blues'}

In [11]:
songs["outer_g"] = songs["genre_merged"].map(map1)

In [12]:
songs.head()

Unnamed: 0,CT,track_id,song_name_cleaned,duration,artist_familiarity,year,genre_merged,outer_g
0,174,TRAABIG128F9356C56,walk the walk,290.16771,0.690786,2000,rock,rock
2,174,TRAABIG128F9356C56,walk the walk,290.16771,0.690786,2000,indie,indie
3,174,TRAABIG128F9356C56,walk the walk,290.16771,0.690786,2000,folk,folk
4,174,TRAABIG128F9356C56,walk the walk,290.16771,0.690786,2000,electronic,electronic
5,102,TRAABLR128F423B7E3,floating,491.12771,0.636424,1987,classic rock,rock


In [13]:
def select_songs(n, col="genre_merged"):
    df = pd.DataFrame(columns = ['CT', 'track_id', 'song_name_cleaned', 'duration', 'artist_familiarity', 'year', 'genre_merged', "outer_g"])
    # tags = ['rock', 'pop', 'hiphop', 'indie', 'metal', 'punk', 'soul', 'folk', 'electronic', 'classic rock', 'country', 'jazz', 'blues']
    tags = ['blues', 'jazz', 'country', 'classic rock', 'electronic', 'folk', 'soul', 'punk', 'metal', 'indie', 'hiphop', 'pop', 'rock']
    # tags = ['rock', 'pop', 'hiphop', 'indie', 'blues', 'folk', 'electronic']
    for tag in tags:  
        df2 = songs[songs["track_id"].isin(df.track_id)]
        overlap = len(df2[df2["genre_merged"]==tag])
        # temp = songs[(songs[col]==tag) & (~songs["track_id"].isin(df.track_id))].sample(300, random_state = 30)
        temp = songs[(songs[col]==tag) & (~songs["track_id"].isin(df.track_id))].sample(((300-overlap) if (300-overlap) > 0 else 0), random_state = 30)
        df = pd.concat([df, temp])
    return df

In [14]:
train_songs = (select_songs(300))
test_songs = songs[(~songs["track_id"].isin(train_songs["track_id"]))]

In [15]:
train_songs.head()

Unnamed: 0,CT,track_id,song_name_cleaned,duration,artist_familiarity,year,genre_merged,outer_g
15203,150,TRHEKVS128F149193A,the wreck of the barbie ferrari,276.97587,0.686222,1993,blues,blues
47394,119,TRXJWOG128EF35E2C7,lien on your dreams,275.90485,0.830676,2007,blues,blues
24283,109,TRLSEZR128F4214E02,moon over bourbon street,176.03873,0.783414,1985,blues,blues
32168,101,TRPSUWH128F424492F,candy licker,414.71955,0.522479,2003,blues,blues
24787,101,TRLZDOJ128F148B0FA,goodbye letter,239.12444,0.69897,2003,blues,blues


In [16]:
songs[songs["track_id"].isin(train_songs.track_id)].groupby("genre_merged")["track_id"].agg(["count"]).reset_index()

Unnamed: 0,genre_merged,count
0,blues,364
1,classic rock,363
2,country,323
3,electronic,328
4,folk,317
5,hiphop,300
6,indie,410
7,jazz,330
8,metal,301
9,pop,574


In [17]:
len(songs[["track_id", "genre_merged"]])

39825

In [18]:
sql_train = "SELECT track_id, word, count FROM lyr.lyrics WHERE track_id IN {}".format(str(tuple(train_songs.track_id.drop_duplicates())))
sql_test= "SELECT track_id, word, count FROM lyr.lyrics WHERE track_id IN {}".format(str(tuple(test_songs.track_id.drop_duplicates())))


In [19]:
train_lyc = pd.read_sql(sql_train, conn)
test_lyc = pd.read_sql(sql_test, conn)

In [20]:
import nltk

In [21]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/NyanPassu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
sql_wc = "SELECT word, sum(count) as wc FROM lyr.lyrics \
            WHERE word NOT IN {} \
            GROUP BY word \
            ORDER BY wc DESC \
            LIMIT 100".format(str(tuple(en_stop)))
freq_wd = pd.read_sql(sql_wc, conn)

In [23]:
enhanced_stop = en_stop.union(set(freq_wd.word[0:30]))

In [24]:
train_lyc = train_lyc[~train_lyc["word"].isin(enhanced_stop)]
test_lyc = test_lyc[~test_lyc["word"].isin(enhanced_stop)]

In [25]:
train_lyc.head()

Unnamed: 0,track_id,word,count
35,TRAABLR128F423B7E3,night,2
36,TRAABLR128F423B7E3,need,2
37,TRAABLR128F423B7E3,right,1
38,TRAABLR128F423B7E3,tell,1
39,TRAABLR128F423B7E3,live,1


In [26]:
tags = ['blues', 'jazz', 'country', 'classic rock', 'electronic', 'folk', 'soul', 'punk', 'metal', 'indie', 'hiphop', 'pop', 'rock']
outertags = ['rock', 'pop', 'hiphop', 'indie', 'blues', 'folk', 'electronic']

In [27]:
def train_by(tags, col):   
    temp_train = songs[songs[col].isin(tags)].drop_duplicates(["track_id", col])
    temp_train_lyc = train_lyc[train_lyc["track_id"].isin(temp_train.track_id)]
    lyc_C = temp_train_lyc.merge(temp_train, how = "inner", on="track_id")
    lyc_C_count = lyc_C.groupby(["word", col])["count"].agg("sum").reset_index()
    tag_wcount = lyc_C_count.groupby([col])["count"].agg("sum").reset_index()
    tag_dict = {}
    for index, row in tag_wcount.iterrows():
        tag = row[col]
        prob = math.log(row["count"] / tag_wcount["count"].sum())
        tag_dict.update({tag: prob})
        
    dummy_dict = {}
    for index, row in tag_wcount.iterrows():
        tag = row[col]
        prob = math.log(0.5 / row["count"])
        dummy_dict.update({tag: prob})
    
    word_dict = {} 
    for tag in tags: 
        for index, row in (lyc_C_count[lyc_C_count[col] == tag]).iterrows():
            word_t = row["word"]
            prob = math.log(row["count"] / tag_wcount[tag_wcount[col] == tag]["count"])
            word_dict.update({(word_t, tag): prob})
    return tag_dict, dummy_dict, word_dict

In [28]:
o_td, o_dd, o_wd = train_by(outertags, "outer_g")

In [29]:
# outer tag, log(probability of tag) 
o_td

{'blues': -1.6913559309107609,
 'electronic': -2.4236153775540252,
 'folk': -2.6847034022798906,
 'hiphop': -2.224712181880013,
 'indie': -2.3381271906014804,
 'pop': -1.945091263978441,
 'rock': -1.1669742813041872}

In [42]:
# outer tag, log(probability of dummy) 
o_dd

{'blues': -12.209267536426694,
 'electronic': -11.477008089783428,
 'folk': -11.215920065057563,
 'hiphop': -11.67591128545744,
 'indie': -11.562496276735974,
 'pop': -11.955532203359013,
 'rock': -12.733649186033267}

In [75]:
# outer tag, log(probability of word) 
o_wd[("ghetto","hiphop")]

-7.987031831343504

In [30]:
b_td, b_dd, b_wd = train_by(["blues", "soul", "jazz"], "genre_merged")

In [31]:
b_td

{'blues': -1.0965435804323833,
 'jazz': -1.084893812046225,
 'soul': -1.114623645232074}

In [32]:
r_td, r_dd, r_wd = train_by(["rock", "classic rock", "country", "metal", "punk"], "genre_merged")

In [33]:
def b_predict(tid):
    outertags = ['rock', 'indie', 'folk', 'electronic', 'blues', 'pop', 'hiphop'] 
    tag_dict = {key: 0 for key in outertags}
    lyc = test_lyc[test_lyc["track_id"] == tid]
    for tag in outertags:
        for index, row in lyc.iterrows():
            word = (row["word"], tag)
            if word in o_wd:
                
                tag_dict[tag] += o_wd[word] + math.log(row["count"])
            else:
                tag_dict[tag] += o_dd[tag]
        tag_dict[tag] = tag_dict[tag] + o_td[tag]
    outer_r = (nlargest(3, tag_dict, key=tag_dict.get))

    if "rock" in outer_r:
        rtags = ["rock", "classic rock", "country", "metal", "punk"]
        tag_dict = {key: 0 for key in rtags}
        for tag in rtags:
            for index, row in lyc.iterrows():
                word = (row["word"], tag)
                if word in r_wd:
                    tag_dict[tag] += r_wd[word] + math.log(row["count"])
                else:
                    tag_dict[tag] += r_dd[tag]
            tag_dict[tag] = tag_dict[tag] + r_td[tag]
        r_tag = max(tag_dict, key=tag_dict.get)    
        outer_r = [w.replace('rock', r_tag) for w in outer_r]

    if "blues" in outer_r: 
        btags = ["blues", "soul", "jazz"]
        tag_dict = {key: 0 for key in btags}
        for tag in btags:
            for index, row in lyc.iterrows():
                word = (row["word"], tag)
                if word in b_wd:
                    tag_dict[tag] += b_wd[word] + math.log(row["count"])
                else:
                    tag_dict[tag] += b_dd[tag]
            tag_dict[tag] = tag_dict[tag] + b_td[tag]
        b_tag = max(tag_dict, key=tag_dict.get)
        outer_r = [w.replace('blues', b_tag) for w in outer_r]

    return outer_r


In [34]:
test_songs[test_songs["genre_merged"]=="hiphop"].head()

Unnamed: 0,CT,track_id,song_name_cleaned,duration,artist_familiarity,year,genre_merged,outer_g
21,150,TRAACZN128F93236B1,seaweed,218.95791,0.707387,2004,hiphop,hiphop
48,143,TRAAHEG128E07861C3,wkya (drop),124.18567,0.74313,2001,hiphop,hiphop
64,274,TRAANWA128F426ADF0,assassination day,257.2273,0.794504,1996,hiphop,hiphop
79,188,TRAAQIH128F428BDEA,(bloody paw on the) kill floor,192.1824,0.618003,2007,hiphop,hiphop
88,218,TRAASGM128EF34DBB0,never forget me,286.30159,0.815923,2007,hiphop,hiphop


In [35]:
b_predict("TRAACZN128F93236B1")

['folk', 'rock', 'indie']

In [36]:
b_predict("TRAANWA128F426ADF0")

['jazz', 'punk', 'hiphop']

In [37]:
b_predict("TRAAHEG128E07861C3")

['jazz', 'hiphop', 'rock']

In [38]:
"""
def test_prediction(tids, tags):
    r = []
    song_name = [] #
    predicted = [] #
    actual = [] #
    for tid in tids:
        pred = b_predict(tid, tags)
        predicted.append(pred) #
        act = list(songs[songs["track_id"]== tid]["genre_merged"])
        actual.append(act) #
        song_name.append(list(songs[songs["track_id"]== tid]["song_name"])[0])
        common = len(set(pred)&set(act))
        if (common >=2) or (common/len(act)>=0.5):
            r.append(1)
        else:
            r.append(0)
        # pd.DataFrame({"tid":tids, "result":r})
    return pd.DataFrame({"tid":tids, "song_name": song_name, "pred":predicted, "actual":actual, "result":r})
"""

def test_prediction(tids):
    r = []
    for tid in tids:
        pred = b_predict(tid)
        act = list(songs[songs["track_id"]== tid]["genre_merged"])
        common = len(set(pred)&set(act))
        if (common >=2) or (common/len(act)>=0.5):
            r.append(1)
        else:
            r.append(0)
    return pd.DataFrame({"tid":tids, "result":r})

In [39]:
def random_test_id(n):
    test_id = list(test_songs.track_id.drop_duplicates())
    rad = list(set([random.randint(1,len(test_id)-1) for i in range(0,n)]))
    return [test_id[i] for i in rad]

In [40]:
pred_1 = test_prediction(random_test_id(1000))

In [41]:
pred_1.result.sum()/len(pred_1)

0.6586888657648283

In [None]:
conn.close()
