In [47]:
import pandas as pd
import sqlite3 as sql
import numpy as np
from heapq import nlargest
import random
from sklearn.model_selection import train_test_split as tts
import math   

In [2]:
conn = sql.connect("dat/track_metadata.db")
c = conn.cursor()
c.execute("attach 'dat/mxm_dataset.db' AS lyr;")
c.execute("attach 'dat/lastfm_tags.db' AS tag;") 


<sqlite3.Cursor at 0x1a0f0172d0>

In [3]:
for row in c.execute("SELECT name FROM main.sqlite_master WHERE type='table';"):
    print(row)
for row in c.execute("SELECT name FROM lyr.sqlite_master WHERE type='table';"):
    print(row)
for row in c.execute("SELECT name FROM tag.sqlite_master WHERE type='table';"):
    print(row)

('songs',)
('words',)
('lyrics',)
('tags',)
('tids',)
('tid_tag',)


In [4]:
songs = pd.read_csv("dat/songs.csv")[['CT', 'track_id', 'song_name_cleaned', 'duration', 'artist_familiarity', 'year', 'genre_merged']]
songs["genre_merged"] = songs["genre_merged"].str.replace("rap","hiphop")
songs = songs.drop_duplicates(["track_id","genre_merged"])

In [5]:
songs = songs.merge(songs.groupby("track_id")["genre_merged"].agg(["count"]).reset_index(),on="track_id")

In [6]:
# combo = set()
# for i, id in enumerate(songs["track_id"].drop_duplicates()):
#     temp_tag = tuple(sorted(list(songs[songs["track_id"] == id].genre_merged)))
#     combo.add(temp_tag)
# len(combo) = 676

In [7]:
len(songs.track_id.drop_duplicates())

21167

In [8]:
songs.groupby("genre_merged")["track_id"].agg(["count"]).reset_index()

Unnamed: 0,genre_merged,count
0,blues,762
1,classic rock,1615
2,country,1455
3,electronic,1812
4,folk,1967
5,hiphop,5279
6,indie,4362
7,jazz,837
8,metal,3135
9,pop,5485


In [9]:
# outer tag 
map1 = {'rock':'rock',
 'indie':'indie',
 'folk':'folk',
 'electronic':'electronic',
 'classic rock':'rock',
 'country':'rock',
 'blues':'blues',
 'metal':'rock',
 'jazz':'blues',
 'pop':'pop',
 'punk':'rock',
 'hiphop':'hiphop',
 'soul':'blues'}

In [10]:
songs["outer_g"] = songs["genre_merged"].map(map1)

In [11]:
def select_songs(n):
    df = pd.DataFrame(columns = ['CT', 'track_id', 'song_name_cleaned', 'duration', 'artist_familiarity', 'year', 'genre_merged', 'count', "outer_g"])
    tags = ['rock', 'indie', 'folk', 'electronic', 'classic rock', 'country', 'blues', 'metal', 'jazz', 'pop', 'punk', 'hiphop', 'soul']
    
    for tag in tags: 
        temp = songs[(songs["genre_merged"]==tag) & (~songs["track_id"].isin(df.track_id))].sort_values("count")[0:n]
        df = pd.concat([df, temp])
    return df

In [12]:
train_songs = (select_songs(300)).drop("count", axis = 1)
test_songs = songs[(~songs["track_id"].isin(train_songs["track_id"]))].drop("count", axis = 1)
songs = songs.drop("count", axis = 1)

In [13]:
train_songs.head()

Unnamed: 0,CT,track_id,song_name_cleaned,duration,artist_familiarity,year,genre_merged,outer_g
6245,116,TRDWQIB128E0782EAC,money (2001 digital remaster),405.9424,0.818042,1981,rock,rock
22012,110,TROHBUD128F9320C4C,fixed to ruin,226.58567,0.666536,2008,rock,rock
23522,127,TRPGBOL128F931462C,stander on the mountain,368.69179,0.582204,1990,rock,rock
7120,108,TRELEKE128F426A8D3,pirate radio,267.78077,0.686224,1997,rock,rock
16231,120,TRKJISU128F4214FD6,holy roller,267.96363,0.646628,1990,rock,rock


In [None]:
# songs["ccount"] = songs.groupby(['genre_merged']).cumcount()

In [None]:
# train_songs = songs[songs["ccount"].between(50,350)]
# test_songs = songs[(~songs["track_id"].isin(train_songs["track_id"])) & (songs["ccount"].between(100,700))]

In [14]:
songs[songs["track_id"].isin(train_songs.track_id)].groupby("genre_merged")["track_id"].agg(["count"]).reset_index()

Unnamed: 0,genre_merged,count
0,blues,307
1,classic rock,329
2,country,325
3,electronic,308
4,folk,321
5,hiphop,368
6,indie,317
7,jazz,318
8,metal,334
9,pop,374


In [15]:
len(songs[["track_id", "genre_merged"]])

39825

In [16]:
sql_train = "SELECT track_id, word, count FROM lyr.lyrics WHERE track_id IN {}".format(str(tuple(train_songs.track_id.drop_duplicates())))
sql_test= "SELECT track_id, word, count FROM lyr.lyrics WHERE track_id IN {}".format(str(tuple(test_songs.track_id.drop_duplicates())))


In [17]:
train_lyc = pd.read_sql(sql_train, conn)
test_lyc = pd.read_sql(sql_test, conn)

In [18]:
aggdict = {"duration":["mean"], "sum":["mean"]}
genre_dc = train_lyc.groupby("track_id")["count"].agg(["sum"]).reset_index()\
        .merge(train_songs[["track_id", "duration", "genre_merged"]], how = "inner", on="track_id")\
        .groupby("genre_merged")["sum","duration"].agg(aggdict).reset_index()
genre_dc.columns = ["genre_merged","avg_duration", "avg_wc"]

In [None]:
# genre_dc.to_csv("genre_dc_t.csv", index=False)

In [19]:
genre_dc

Unnamed: 0,genre_merged,avg_duration,avg_wc
0,blues,267.641974,315.523333
1,classic rock,289.347378,316.54
2,country,231.184004,300.173333
3,electronic,303.067759,318.723333
4,folk,268.670763,297.713333
5,hiphop,239.428162,534.326667
6,indie,258.479786,282.173333
7,jazz,268.505146,333.396667
8,metal,332.10408,272.643333
9,pop,248.055361,390.456667


In [20]:
import nltk

In [21]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/NyanPassu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
train_lyc = train_lyc[~train_lyc["word"].isin(en_stop)]
test_lyc = test_lyc[~test_lyc["word"].isin(en_stop)]

In [23]:
train_lyc.head()

Unnamed: 0,track_id,word,count
21,TRAAWOR128F92DF3C8,love,5
23,TRAAWOR128F92DF3C8,know,10
29,TRAAWOR128F92DF3C8,like,1
30,TRAAWOR128F92DF3C8,time,2
32,TRAAWOR128F92DF3C8,go,3


In [24]:
tags = list(train_songs['genre_merged'].drop_duplicates())
tags

['rock',
 'indie',
 'folk',
 'electronic',
 'classic rock',
 'country',
 'blues',
 'metal',
 'jazz',
 'pop',
 'punk',
 'hiphop',
 'soul']

In [25]:
outertags = list(train_songs['outer_g'].drop_duplicates())
outertags

['rock', 'indie', 'folk', 'electronic', 'blues', 'pop', 'hiphop']

In [28]:
def train_by(tags, col):   
    temp_train = songs[songs[col].isin(tags)].drop_duplicates(["track_id", col])
    temp_train_lyc = train_lyc[train_lyc["track_id"].isin(temp_train.track_id)]
    lyc_C = temp_train_lyc.merge(temp_train, how = "inner", on="track_id")
    lyc_C_count = lyc_C.groupby(["word", col])["count"].agg("sum").reset_index()
    tag_wcount = lyc_C_count.groupby([col])["count"].agg("sum").reset_index()
    tag_dict = {}
    for index, row in tag_wcount.iterrows():
        tag = row[col]
        prob = math.log(row["count"] / tag_wcount["count"].sum())
        tag_dict.update({tag: prob})
        
    dummy_dict = {}
    for index, row in tag_wcount.iterrows():
        tag = row[col]
        prob = math.log(0.5 / row["count"])
        dummy_dict.update({tag: prob})
    
    word_dict = {} 
    for tag in tags: 
        for index, row in (lyc_C_count[lyc_C_count[col] == tag]).iterrows():
            word_t = row["word"]
            prob = math.log(row["count"] / tag_wcount[tag_wcount[col] == tag]["count"])
            word_dict.update({(word_t, tag): prob})
    return tag_dict, dummy_dict, word_dict

In [29]:
o_td, o_dd, o_wd = train_by(outertags, "outer_g")

In [30]:
o_td

{'blues': -1.501846922335147,
 'electronic': -2.6792052838514078,
 'folk': -2.744481414599998,
 'hiphop': -1.9262946354745338,
 'indie': -2.819370361754169,
 'pop': -2.3390844209695922,
 'rock': -1.0710869416843403}

In [32]:
b_td, b_dd, b_wd = train_by(["blues", "soul", "jazz"], "genre_merged")

In [33]:
b_td

{'blues': -1.2871505553487335,
 'jazz': -1.1637213466291716,
 'soul': -0.8876498960158438}

In [34]:
r_td, r_dd, r_wd = train_by(["rock", "classic rock", "country", "metal", "punk"], "genre_merged")

In [35]:
def b_predict(tid):
    outertags = ['rock', 'indie', 'folk', 'electronic', 'blues', 'pop', 'hiphop'] 
    tag_dict = {key: 0 for key in outertags}
    lyc = test_lyc[test_lyc["track_id"] == tid]
    for tag in outertags:
        for index, row in lyc.iterrows():
            word = (row["word"], tag)
            if word in o_wd:
                
                tag_dict[tag] += o_wd[word] + math.log(row["count"])
            else:
                tag_dict[tag] += o_dd[tag]
    outer_r = (nlargest(3, tag_dict, key=tag_dict.get))

    if "rock" in outer_r:
        rtags = ["rock", "classic rock", "country", "metal", "punk"]
        tag_dict = {key: 0 for key in rtags}
        for tag in rtags:
            for index, row in lyc.iterrows():
                word = (row["word"], tag)
                if word in r_wd:
                    tag_dict[tag] += r_wd[word] + math.log(row["count"])
                else:
                    tag_dict[tag] += r_dd[tag]
            r_tag = max(tag_dict, key=tag_dict.get)
        outer_r = [w.replace('rock', r_tag) for w in outer_r]

    if "blues" in outer_r: 
        btags = ["blues", "soul", "jazz"]
        tag_dict = {key: 0 for key in btags}
        for tag in btags:
            for index, row in lyc.iterrows():
                word = (row["word"], tag)
                if word in b_wd:
                    tag_dict[tag] += b_wd[word] + math.log(row["count"])
                else:
                    tag_dict[tag] += b_dd[tag]
            b_tag = max(tag_dict, key=tag_dict.get)
    
        outer_r = [w.replace('blues', b_tag) for w in outer_r]

    return outer_r


In [None]:
outertags

In [36]:
test_songs[test_songs["genre_merged"]=="hiphop"].head()

Unnamed: 0,CT,track_id,song_name_cleaned,duration,artist_familiarity,year,genre_merged,outer_g
18,150,TRAACZN128F93236B1,seaweed,218.95791,0.707387,2004,hiphop,hiphop
40,143,TRAAHEG128E07861C3,wkya (drop),124.18567,0.74313,2001,hiphop,hiphop
53,274,TRAANWA128F426ADF0,assassination day,257.2273,0.794504,1996,hiphop,hiphop
65,188,TRAAQIH128F428BDEA,(bloody paw on the) kill floor,192.1824,0.618003,2007,hiphop,hiphop
71,218,TRAASGM128EF34DBB0,never forget me,286.30159,0.815923,2007,hiphop,hiphop


In [37]:
b_predict("TRAACZN128F93236B")

['rock', 'indie', 'folk']

In [38]:
b_predict("TRAAHEG128E07861C3")

['hiphop', 'punk', 'jazz']

In [39]:
b_predict("TRAAQIH128F428BDEA")

['electronic', 'punk', 'indie']

In [None]:
list(songs[songs["track_id"]=="TRYUOBY128F14B0F10"]["genre_merged"])

In [None]:
list(songs[songs["track_id"]=="TRAVVMB128F148D1C1"]["genre_merged"])

In [None]:
songs.groupby("track_id")["genre_merged"].agg(["count"]).reset_index().sort_values("count", ascending=False).head()

In [None]:
len(set(al)&set(bl))/len(bl)

In [None]:
list(songs[songs["track_id"]== "TRYUOBY128F14B0F10"]["song_name"])[0]

In [40]:
"""
def test_prediction(tids, tags):
    r = []
    song_name = [] #
    predicted = [] #
    actual = [] #
    for tid in tids:
        pred = b_predict(tid, tags)
        predicted.append(pred) #
        act = list(songs[songs["track_id"]== tid]["genre_merged"])
        actual.append(act) #
        song_name.append(list(songs[songs["track_id"]== tid]["song_name"])[0])
        common = len(set(pred)&set(act))
        if (common >=2) or (common/len(act)>=0.5):
            r.append(1)
        else:
            r.append(0)
        # pd.DataFrame({"tid":tids, "result":r})
    return pd.DataFrame({"tid":tids, "song_name": song_name, "pred":predicted, "actual":actual, "result":r})
"""

def test_prediction(tids):
    r = []
    for tid in tids:
        pred = b_predict(tid)
        act = list(songs[songs["track_id"]== tid]["genre_merged"])
        common = len(set(pred)&set(act))
        if (common >=2) or (common/len(act)>=0.5):
            r.append(1)
        else:
            r.append(0)
    return pd.DataFrame({"tid":tids, "result":r})

In [73]:
def random_test_id(n):
    test_id = list(test_songs.track_id.drop_duplicates())
    rad = list(set([random.randint(1,len(test_id)-1) for i in range(0,n)]))
    return [test_id[i] for i in rad]

In [82]:
pred_1 = test_prediction(random_test_id(1000))

In [83]:
pred_1.result.sum()/len(pred_1)

0.6308169596690796

In [None]:
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
import lyricwikia

In [None]:
def l_predict(artist, songname):
    outertags = ['rock', 'indie', 'folk', 'electronic', 'blues', 'pop', 'hiphop'] 
    tag_dict = {key: 0 for key in outertags}
    testsong = lyricwikia.get_lyrics(artist, songname)
    edit_string_as_list = testsong.split()
    final_list = [word for word in edit_string_as_list if word not in en_stop]
    final_string = [' '.join(final_list)]
    vectorizer = CountVectorizer() 
    cv_fit = vectorizer.fit_transform(final_string).todense() 
    count_w = np.asarray(cv_fit.sum(axis=0))[0]
    l_word = vectorizer.vocabulary_
    for tag in outertags:
        for key in l_word:
            word = (key, tag)
            if word in o_wd:
                tag_dict[tag] += o_wd[word] + math.log(count_w[l_word[key]])
            else:
                tag_dict[tag] += o_dd[tag]
    outer_r = (nlargest(3, tag_dict, key=tag_dict.get))

    if "rock" in outer_r:
        rtags = ["rock", "classic rock", "country", "metal", "punk"]
        tag_dict = {key: 0 for key in rtags}
        for tag in rtags:
            for key in l_word:
                word = (key, tag)
                if word in r_wd:
                    tag_dict[tag] += r_wd[word] + math.log(count_w[l_word[key]])
                else:
                    tag_dict[tag] += r_dd[tag]
            r_tag = max(tag_dict, key=tag_dict.get)
        outer_r = [w.replace('rock', r_tag) for w in outer_r]

    if "blues" in outer_r: 
        btags = ["blues", "soul", "jazz"]
        tag_dict = {key: 0 for key in btags}
        for tag in btags:
            for key in l_word:
                word = (key, tag)
                if word in b_wd:
                    tag_dict[tag] += b_wd[word] + math.log(count_w[l_word[key]])
                else:
                    tag_dict[tag] += b_dd[tag]
            b_tag = max(tag_dict, key=tag_dict.get)
    
        outer_r = [w.replace('blues', b_tag) for w in outer_r]
        # max(tag_dict, key=tag_dict.get)  
    return outer_r


In [None]:
print(l_predict("Muddy Waters", "Hoochie Coochie Man"))
print(l_predict("John Lee Hooker", "Boogie Chillun"))
print(l_predict("Mamie Smith", "Crazy Blues"))
print(l_predict("pink floyd", "money"))
print(l_predict("red hot chili peppers", "snow"))
print(l_predict("Bob Seger", "old time rock n roll"))
print(l_predict("linkin park", "numb"))
print(l_predict("lil wayne", "i'm me"))
print(l_predict("akon", "i wanna love you"))


In [None]:
conn.close()
