In [287]:
from collections import defaultdict

In [291]:
x = defaultdict(int)

In [292]:
for word in vocab:
    x[word] = 0

In [14]:
import string
import numpy as np

In [18]:
vocab = list(string.ascii_lowercase)

In [221]:
def make_lyrics(vocab, low, high, num_songs, genre, prob):
    return [np.random.choice(vocab, 
                             np.random.randint(low, high), 
                             replace = True,
                             p = prob) for _ in range(num_songs)]

In [222]:
popprob = np.array(list(range(1, 27))) / sum(list(range(1, 27)))
rapprob = popprob[::-1]
pop = make_lyrics(vocab, 100, 400, 100, "pop", popprob)
rap = make_lyrics(vocab, 100, 400, 100, "rap", rapprob)

In [223]:
def hellinger_distance(dist1, dist2):
    
    num = 0
    
    for element in dist1.keys():
        
        num += (np.sqrt(dist1[element]) - np.sqrt(dist2[element])) ** 2
        
    num = (1 / np.sqrt(2)) * np.sqrt(num)
    
    return 1 - num

In [389]:
def kl_divergence(dist1, dist2):
    
    num = 0
    
    for element in set(dist1.keys()).union(dist2.keys()):
        if (dist2[element] == 0) and (dist1[element] == 0):
            continue
        num -= dist1[element] * np.log(dist2[element] / dist1[element])
        
    return num

In [196]:
def get_p_genre(x):
    return {genre:len(x[genre]) for genre in x}

In [296]:
def get_word_distribution(d, type_ = "train"):
    """
    if train: return word distribution
    if test: return each testing example's word distribution.
    """
    
    from collections import Counter
    from itertools import chain
    
    if type_ == "train":
        ans = Counter(list(chain.from_iterable(d)))
        norm = sum(ans.values())
        for element in ans:
            ans[element] /= norm
            
        impute = defaultdict(float)
        for i in ans:
            impute[i] = ans[i]

    elif type_ == "test":
        
        ans = []
        for song in d:
            ans1 = Counter(list(chain.from_iterable(song)))
            norm = sum(ans1.values())
            for element in ans1:
                ans1[element] /= norm 
            impute = defaultdict(float)
            for i in ans1:
                impute[i] = ans1[i]
            ans.append(ans1)
            
            
    return impute

In [297]:
popdist = get_word_distribution(pop)
rapdist = get_word_distribution(rap)

In [264]:
dists = {"pop": popdist, "rap": rapdist}

### Make testing data

In [259]:
poptest = make_lyrics(vocab, 100, 400, 100, "pop", popprob)
raptest = make_lyrics(vocab, 100, 400, 100, "rap", rapprob)

In [262]:
poptest2 = get_word_distribution(poptest, "test")
raptest2 = get_word_distribution(raptest, "test")

In [284]:
def classify(data, dists, popprop, rapprop):
    p_genre = {"pop": popprop, "rap": rapprop}
    results = []
    for song in data:
        distance = {}
        for dist in dists:
            distance.update({dist: hellinger_distance(song, dists[dist]) * p_genre[dist]})
        #print(distance)
        results.append(max(distance.items(), key = lambda x: x[1]))
    return results

In [301]:
#classify(raptest2, dists, .5, .5)

In [302]:
#classify(poptest2, dists, .5, .5)

## Test on real data

In [311]:
import pandas as pd
import re
df = pd.read_csv("..//..//data//Weekly_data_tokenized.csv")

In [370]:
pop = []
for word in range(len(df)):
    if re.search("pop", df.loc[word, "Genre"], flags = re.I) != None:
        pop.append(True)
    else:
        pop.append(False)

In [371]:
IDs = df.ID.unique()
np.random.shuffle(IDs)

In [372]:
train = df[df.ID.isin(IDs[:int(.8 * len(IDs))])]
test = df[df.ID.isin(IDs[int(.8 * len(IDs)):])]

In [409]:
poptrain = train[train["Pop"] == True].word
raptrain = train[train["Pop"] == False].word

poptest = test[test["Pop"] == True]
raptest = test[test["Pop"] == False]

In [455]:
def get_word_distribution(d, type_ = "train"):
    """
    if train: return word distribution
    if test: return each testing example's word distribution.
    """
    
    from collections import Counter
    from itertools import chain
    
    if type_ == "train":
        ans = Counter(list(d))
        norm = sum(ans.values())
        for element in ans:
            ans[element] /= norm
            
        impute = defaultdict(float)
        for i in ans:
            impute[i] = ans[i]
        return impute

    elif type_ == "test":
        
        ans = []
        for song in d.ID.unique():
            ans1 = Counter(list(d[d.ID == song].word))
            norm = sum(ans1.values())
            for element in ans1:
                ans1[element] /= norm
                
            impute = defaultdict(float)
            for i in ans1:
                impute[i] = ans1[i]
            ans.append(ans1)
        return ans

In [456]:
def classify(data, dists, popprop, rapprop):
    p_genre = {"pop": popprop, "rap": rapprop}
    results = []
    for song in data:
        distance = {}
        for dist in dists:
            distance.update({dist: hellinger_distance(song, dists[dist])})# * p_genre[dist]})
        #print(distance)
        results.append(max(distance.items(), key = lambda x: x[1]))
    return results

In [457]:
poptraindist = get_word_distribution(poptrain)
raptraindist = get_word_distribution(raptrain)
dists = {"pop": poptraindist, "rap": raptraindist}

In [458]:
poptestdist = get_word_distribution(poptest, "test")
raptestdist = get_word_distribution(raptest, "test")

In [460]:
numpop = len(train[train.Pop == True].ID.unique())
numrap = len(train[train.Pop == False].ID.unique())
popprop = numpop / (numpop + numrap)
rapprop = 1 - popprop

In [461]:
pops = [x[0] for x in classify(poptestdist, dists, popprop, rapprop)]
raps = [x[0] for x in classify(raptestdist, dists, popprop, rapprop)]
predicted = pops + raps
true = ["pop" for _ in range(len(pops))] + ["rap" for _ in range(len(raps))]

In [463]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(predicted, true).ravel()

In [464]:
tn, fp, fn, tp

(319, 164, 45, 250)

## Class percentages

In [471]:
len(pops) / (len(pops) + len(raps))

0.46786632390745503

## Accuracy

In [466]:
(tn + tp) / (tn + fp + fn + tp)

0.7313624678663239

Higher than naive classifier!!!!

## Precision 

In [465]:
tp / (tp + fp)

0.6038647342995169

## Recall

In [470]:
tp / (tp + fn)

0.847457627118644