# Prediction: Multigenre

This notebook explores various algorithms' ability to classify songs as pop, rap, rock or country. This notebook compares the same algorithms as the ones in Pop vs. Rap except we increase the number of genres to four.

In [1]:
import sys
sys.path.insert(0, "..//..//..//scripts")

import xgboost as xgb
import seaborn as sns
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from itertools import chain
from NonParametricClassifier import *
from CDFClassifier import *
from HelperFunctions import *
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

To decide which genres to add, we found the top four most popular genres. They are, in order, pop, rap, rock and country.

In [2]:
df = pd.read_csv("..//..//..//..//data//Weekly_data_tokenized.csv")
genre = []

for unique in df.ID.unique():
    genre.append(df[df.ID == unique].iloc[0].Genre)
    
genre = [x.split(",") for x in genre]
genre = Counter(list(chain.from_iterable(genre)))
genre = sorted(genre.items(), key = lambda x: x[1], reverse = True)

genre[:8]

[('Pop', 1783),
 ('Rap', 1427),
 ('Rock', 721),
 ('Country', 692),
 ('R&;B', 661),
 ('Trap', 359),
 ('Canada', 266),
 ('Pop-Rock', 207)]

This time, the minimum Gini index is .125.

In [3]:
df["Pop"] = df.apply(lambda row: create_genre(row, "pop"), axis = 1)
df["Rap"] = df.apply(lambda row: create_genre(row, "rap"), axis = 1)
df["Rock"] = df.apply(lambda row: create_genre(row, "rock"), axis = 1)
df["Country"] = df.apply(lambda row: create_genre(row, "country"), axis = 1)

df = df[["word", "ID", "Pop", "Rap", "Rock", "Country"]]

tmp = df.groupby(["word", "Pop", "Rap", "Rock", "Country"]).count().unstack().unstack().unstack().unstack().fillna(0)

gini = calculate_gini_index(tmp)
useless_words = [x for x in gini if gini[x] <= .236]

df = df[~df.word.isin(useless_words)]

We remove words with the bottom 3.2% of Gini indexes.

In [4]:
len(useless_words) / len(df.word.unique())

0.033193979933110365

Again, we opted for a 80-20 split between the training and validation set.

In [None]:
np.random.seed(1)

IDs = df.ID.unique()
np.random.shuffle(IDs)

train = df[df.ID.isin(IDs[:int(.8 * len(IDs))])]
test = df[df.ID.isin(IDs[int(.8 * len(IDs)):])]

# Classification by distribution comparison

An explanation of this algorithm is available in the notebook `01 - Pop vs. Rap Prediction`.

### KL Divergence

In [None]:
klgrid = grid_search_nonparametric(0.00000001, 0.0001, 200, NonParametricClassifier, train, test, ["Pop", "Rap", "Rock", "Country"], "KL", False)

### Hellinger

In [None]:
hellingergrid = grid_search_nonparametric(0, 5, 200, NonParametricClassifier, train, test, ["Pop", "Rap", "Rock", "Country"], "hellinger", False)

### Naive Bayes - Bernoulli

In [None]:
X_train, y_train, X_test, y_test = prepare_multigenre_data(train, test)

# Rank-based classification

### Mann-Whitney

In [None]:
mwgrid = grid_search_cdf(0.01, 2, 200, CDFClassifier, train, test, ["Pop", "Rap", "Rock", "Country"], "Mann-Whitney");

# Comparison to standard algorithms

### Naive Bayes - Bernoulli

In [None]:
bernoulligrid = {}
grid2 = {}

for n in np.linspace(0, 1, 200)[1:]:
    clf = BernoulliNB(alpha = n)
    clf.fit(X_train, y_train)
    bernoulligrid.update({n: confusion_matrix(clf.predict(X_test), y_test)})
    grid2.update({n: np.diag(bernoulligrid[n]).sum() / bernoulligrid[n].sum()})
    
best = sorted(grid2.items(), key = lambda x: x[1], reverse = True)[0]
print("Best accuracy:", best[1])
print("Parameter", best[0])
    
plt.plot([i for i in grid2], [grid2[i] for i in grid2]);

### Naive Bayes - Multinomial

In [None]:
multigrid = {}
grid2 = {}

for n in np.linspace(0, 1, 200)[1:]:
    clf = MultinomialNB(alpha = n)
    clf.fit(X_train, y_train)
    multigrid.update({n: confusion_matrix(clf.predict(X_test), y_test)})
    grid2.update({n: np.diag(multigrid[n]).sum() / multigrid[n].sum()})
    
best = sorted(grid2.items(), key = lambda x: x[1], reverse = True)[0]
print("Best accuracy:", best[1])
print("Parameter", best[0])
    
plt.plot([i for i in grid2], [grid2[i] for i in grid2]);

### xgboost

In [None]:
y_train_binary = convert_genre(y_train)
y_test_binary = convert_genre(y_test)

In [None]:
dtrain = xgb.DMatrix(X_train, label = y_train_binary)
dtest = xgb.DMatrix(X_test, label = y_test_binary)
evallist = [(dtrain, 'train'), (dtest, 'eval')]

grid = {}
dims = 10

for l1 in np.linspace(0, 1, dims):
    for l2 in np.linspace(0, 1, dims):   
        param = {'max_depth': 500, 'eta': 0.2, 'silent': 1, 'objective': 'multi:softprob', "alpha": l1,
                 "lambda": l2, "subsample": 0.9, "num_class": 4, "eval_metric": "mlogloss", "scale_pos_weight": 1}
        bst = xgb.train(params = param, dtrain = dtrain, num_boost_round = 200, evals = evallist, early_stopping_rounds = 20)
        cfmat = confusion_matrix(np.argmax(bst.predict(dtest), 1), y_test_binary)
        grid.update({(l1, l2): np.diag(cfmat).sum() / cfmat.sum()})

In [None]:
mat = np.zeros((dims, dims))
row = 0
col = 0
for (r, c) in grid:
    mat[row, col] = grid[(r, c)]
    col += 1
    if (col) % dims == 0:
        if (row, col) == (0, 1):
            continue
        col = 0
        row += 1

In [None]:
fig = plt.figure(figsize = (10, 8))
sns.heatmap(mat, annot = True, fmt = ".3f");

### Feedforward Neural Network

In [None]:
from keras import Sequential
from keras.models import load_model
from keras.layers import Dense, BatchNormalization
from keras.regularizers import l1, l2
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint

from tensorflow import Session, ConfigProto
sess = Session(config=ConfigProto(log_device_placement=True))
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

from keras import backend as K
K.tensorflow_backend._get_available_gpus()

In [None]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(categories = "auto")
enc.fit(y_train_binary.reshape((len(y_train_binary), 1)))
y_train_onehot = enc.transform(y_train_binary.reshape((len(y_train_binary), 1))).toarray()
y_test_onehot = enc.transform(y_test_binary.reshape((len(y_test_binary), 1))).toarray()

In [None]:
arch = [
    Dense(512, input_dim = 23920, activation = "sigmoid"),
    Dense(128, activation = "sigmoid"),
    Dense(32, activation = "sigmoid"),
    Dense(8, activation = "sigmoid"),
    Dense(4, activation = "softmax")
]

model = Sequential(arch)

model.compile(
    optimizer = SGD(lr = 0.01),
    loss = "categorical_crossentropy",
    metrics = ["categorical_accuracy"]
)

filepath = "..//..//..//..//data//NN weights//weights-improvement-multigenre-{epoch:02d}-{val_categorical_accuracy:.4f}.h5"
checkpoint = ModelCheckpoint(filepath, monitor='categorical_accuracy',
                             verbose=1, save_best_only=True,
                             mode='max')
callbacks_list = [checkpoint]

history = model.fit(
    np.array(X_train), 
    np.array(y_train_onehot),
    callbacks = callbacks_list,
    verbose = 1, 
    epochs = 20,
    batch_size = 2,
    validation_data = [np.array(X_test), np.array(y_test_onehot)]
)

In [None]:
def plot_embedding(encoder, X, Y):
    fig = plt.figure(figsize = (10, 6))
    h = encoder.predict(np.array(X))
    y_tester = np.array(Y)
    for i in range(4):
        sel = y_tester == i
        plt.plot(h[sel, 0], h[sel, 1], '.', label='Group %d' % i, markersize = 3, alpha = 0.8)
    plt.title('MLP embedding - test data')
    plt.legend()
    plt.show()

As an aside, below is the embedding of our test data within our neural network. We can see that the neural network has learned a linearly separable representation of our frequency vectors. We also see that group 2 and 3 (Rock and Country) are very similar to group 0, or Pop. This implies that the text in pop, country, and rock are all very similar.

In [None]:
nn = load_model("..//..//..//..//data//NN weights//weights-improvement-multigenre-10-0.5501.h5")
model_tmp = Sequential(nn.layers[:-1])
plot_embedding(model_tmp, X_test, y_test_binary)

# Results

In [None]:
Below is a table summarizing the performance of each algorithm on the validation set.


| KL    | Hellinger | Mann-Whitney | NB-Bernoulli | NB-Multinomial | xgboost | Neural network |
|:-----:|:---------:|:------------:|---|---|---|:---:|
|0.8262|  0.84   |    0.769    |<b>0.85538|0.82769|0.846|0.8277|