In [50]:
import pandas as pd
import re
from collections import Counter, defaultdict
import numpy as np
df = pd.read_csv("..//..//data//Weekly_data_tokenized.csv")

In this notebook we use the classification idea on five different classes.

In [51]:
def get_word_distribution(d, type_ = "train"):
    """
    if train: return word distribution
    if test: return each testing example's word distribution.
    """
    
    from collections import Counter
    from itertools import chain
    
    if type_ == "train":
        ans = Counter(list(d))
        norm = sum(ans.values())
        for element in ans:
            ans[element] /= norm
            
        impute = defaultdict(float)
        for i in ans:
            impute[i] = ans[i]
        return impute

    elif type_ == "test":
        
        ans = []
        for song in d.ID.unique():
            ans1 = Counter(list(d[d.ID == song].word))
            norm = sum(ans1.values())
            for element in ans1:
                ans1[element] /= norm
                
            impute = defaultdict(float)
            for i in ans1:
                impute[i] = ans1[i]
            ans.append(ans1)
        return ans
    
def classify(data, dists):#, popprop, rapprop):
    #p_genre = {"pop": popprop, "rap": rapprop}
    results = []
    for song in data:
        distance = {}
        for dist in dists:
            distance.update({dist: hellinger_distance(song, dists[dist])})
        results.append(max(distance.items(), key = lambda x: x[1]))
    return results

def hellinger_distance(dist1, dist2):
    
    num = 0
    
    for element in dist1.keys():
        
        num += (np.sqrt(dist1[element]) - np.sqrt(dist2[element])) ** 2
        
    num = (1 / np.sqrt(2)) * np.sqrt(num)
    
    return 1 - num

id's: 0: pop, 1: rap, 2: rock

In [52]:
genre_type = []
for word in range(len(df)):
    if re.search("pop", df.loc[word, "Genre"], flags = re.I) != None:
        genre_type.append(0)
    elif re.search("rap", df.loc[word, "Genre"], flags = re.I) != None:
        genre_type.append(1)
    elif re.search("rock", df.loc[word, "Genre"], flags = re.I) != None:
        genre_type.append(2)      
    elif re.search("country", df.loc[word, "Genre"], flags = re.I) != None:
        genre_type.append(3) 
    else:
        genre_type.append(4)
        
df["genre_type"] = genre_type

In [53]:
Counter(genre_type)

Counter({0: 235838, 1: 258621, 2: 42204, 3: 34629, 4: 12366})

We use a 65-35 split since we have 5 different classes now.

In [54]:
IDs = df.ID.unique()
np.random.shuffle(IDs)
train = df[df.ID.isin(IDs[:int(.65 * len(IDs))])]
test = df[df.ID.isin(IDs[int(.65 * len(IDs)):])]

poptrain = train[train["genre_type"] == 0].word
raptrain = train[train["genre_type"] == 1].word
rocktrain = train[train["genre_type"] == 2].word
countrytrain = train[train["genre_type"] == 3].word
othertrain = train[train["genre_type"] == 4].word

poptest = test[test["genre_type"] == 0]
raptest = test[test["genre_type"] == 1]
rocktest = test[test["genre_type"] == 2]
countrytest = test[test["genre_type"] == 3]
othertest = test[test["genre_type"] == 4]

In [55]:
poptraindist = get_word_distribution(poptrain)
raptraindist = get_word_distribution(raptrain)
rocktraindist = get_word_distribution(rocktrain)
countrytraindist = get_word_distribution(countrytrain)
othertraindist = get_word_distribution(othertrain)


dists = {"pop": poptraindist, 
         "rap": raptraindist,
         "rock": rocktraindist,
         "country": countrytraindist,
         "other": othertraindist}

poptestdist = get_word_distribution(poptest, "test")
raptestdist = get_word_distribution(raptest, "test")
rocktestdist = get_word_distribution(rocktest, "test")
countrytestdist = get_word_distribution(countrytest, "test")
othertestdist = get_word_distribution(othertest, "test")

In [56]:
pops = [x[0] for x in classify(poptestdist, dists)]
raps = [x[0] for x in classify(raptestdist, dists)]
rocks = [x[0] for x in classify(rocktestdist, dists)]
countrys = [x[0] for x in classify(countrytestdist, dists)]
others = [x[0] for x in classify(othertestdist, dists)]

predicted = pops + raps + rocks + countrys + others

true = ["pop" for _ in range(len(pops))] + \
       ["rap" for _ in range(len(raps))] + \
       ["rock" for _ in range(len(rocks))] + \
       ["country" for _ in range(len(countrys))] + \
       ["other" for _ in range(len(others))]

In [57]:
from sklearn.metrics import confusion_matrix

In [58]:
ans = confusion_matrix(predicted, true)

In [59]:
sum(np.diag(ans)) / np.sum(ans)

0.5139500734214391

Horrible accuracy.

## Diagnostics

In [64]:
Counter(pops)

Counter({'pop': 255, 'other': 77, 'rock': 153, 'country': 127, 'rap': 60})

Pop seems to be semi-easily confused w/ rock and country. Moreover, there is a non-trivial classification into rap. Definitely look into this.

In [65]:
Counter(raps)

Counter({'rap': 298, 'pop': 44, 'other': 12, 'rock': 10, 'country': 15})

In contrast, rap is easily classified from the other genres. Consistent with our findings.

In [66]:
Counter(rocks)

Counter({'pop': 23, 'other': 9, 'country': 75, 'rock': 59, 'rap': 3})

Rock music is easly misclassified as country. Definitely look into this.

In [67]:
Counter(countrys)

Counter({'country': 75, 'rock': 27, 'pop': 4, 'other': 3})

Country music is easily classified as country; however, some are misclassified as rock.

In [68]:
Counter(others)

Counter({'other': 13, 'country': 6, 'pop': 11, 'rock': 1, 'rap': 2})

The remaining genres are semi-easily confused with pop.