__Conduct machine learning.__

@Andreas Lüschow

12.11.2020

### Imports

In [None]:
from IPython.display import display

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split

# algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

pd.options.display.max_columns = None

### Constants

In [None]:
# Input files, relative to the notebook's working directory.
INPUT_CSV = "./data/all_data.csv"  # read below as a tab-separated table
GENRE_FILE = "./data/input/genres.txt"  # read below line by line, one genre per line
FIELD_FILE = "./data/input/fields.txt"

# Column of the input table that is used as the classification target.
TRAIN_FIELD_LABEL = 'genre'  # genre | genre_main

# Only rows whose label is among the MAX_GENRES most frequent labels are kept.
MAX_GENRES = 25
# Which classifier to instantiate.
ML = "d_tree"  # logreg | random_forest | d_tree

# Passed to train_test_split as test_size.
TEST_SIZE = 0.1

# reduce data size if necessary
SAMPLING = False
SAMPLING_FRACTION = 1  # only relevant if SAMPLING == True

# If enabled, a column is kept only when it has at least
# (number of rows / DROP_NAN_THRESHOLD) non-NaN values.
DROP_NAN = True
DROP_NAN_THRESHOLD = 1000  # only relevant if DROP_NAN == True

### Load data

In [None]:
# Read the genre list, one name per line. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open(GENRE_FILE) as genre_file:
    genres = [line.strip() for line in genre_file]
# Sanity check: the genre file is expected to hold exactly 1319 entries.
assert len(genres) == 1319
print(genres[:20])

In [None]:
# Load the tab-separated source table, using its 'Unnamed: 0' column as index.
# low_memory=False makes pandas parse the file in one pass rather than in chunks.
src_df = pd.read_csv(
    INPUT_CSV,
    sep="\t",
    index_col='Unnamed: 0',
    low_memory=False,
)
src_df.shape

In [None]:
# Work on an independent (deep) copy so the frame as loaded stays untouched.
orig_df = src_df.copy(deep=True)
orig_df.shape

In [None]:
# Replace spaces in the label columns with underscores so every label is a
# single whitespace-free token (the classification-report parser further down
# splits report rows on whitespace).
# regex=False forces a literal match, so the result does not depend on the
# pandas-version-specific default of the `regex` argument.
for label_column in ("genre", "genre_main"):
    orig_df[label_column] = orig_df[label_column].str.replace(" ", "_", regex=False)

In [None]:
# Sanity check: both the loaded frame and the working copy must hold the
# expected 746786 rows.
assert len(src_df.index) == 746786
assert len(orig_df.index) == 746786

### Sampling

In [None]:
if SAMPLING:
    # Subsample WITHOUT replacement: the goal is to reduce the data size, and
    # drawing with replacement would duplicate rows, which can then end up in
    # both the training and the test split.
    orig_df = orig_df.sample(frac=SAMPLING_FRACTION, replace=False, random_state=1)
    # A bare expression inside an `if` block is not displayed by the notebook,
    # so print the resulting shape explicitly.
    print(orig_df.shape)

### Dropping columns/fields where NaN is predominant

In [None]:
if DROP_NAN:
    # A column survives only if it has at least this many non-NaN values.
    min_non_nan = len(orig_df.index) / DROP_NAN_THRESHOLD
    orig_df.dropna(axis=1, thresh=min_non_nan, inplace=True)

In [None]:
# Show the frame's dimensions after the optional sampling and column filtering.
orig_df.shape

### Filter genres

In [None]:
# filter by genre count: keep only rows labelled with one of the
# MAX_GENRES most frequent labels
label_counts = orig_df[TRAIN_FIELD_LABEL].value_counts()
genres_top = label_counts[:MAX_GENRES].index.tolist()
is_top_genre = orig_df[TRAIN_FIELD_LABEL].isin(genres_top)
df_tmp = orig_df[is_top_genre]

# order rows by ascending label frequency; the stable sort keeps the original
# row order among rows whose labels are equally frequent
codes, _ = pd.factorize(df_tmp[TRAIN_FIELD_LABEL])
rows_per_code = np.bincount(codes)
row_order = np.argsort(rows_per_code[codes], kind='mergesort')
df_tmp = df_tmp.iloc[row_order]

### Train and Test

In [None]:
# create final data
# Target vector: the selected label column.
y = df_tmp.loc[:, TRAIN_FIELD_LABEL].values

# Features are 0/1 indicators of whether a field is filled in for a record.
# The label columns are removed first so that no target-derived column ends up
# in the feature matrix. errors="ignore" tolerates a label column that has
# already been removed by the NaN filtering.
label_columns = sorted({TRAIN_FIELD_LABEL, "genre", "genre_main"})
feature_df = df_tmp.drop(columns=label_columns, errors="ignore")
df = feature_df.notnull().astype('int')

In [None]:
# Hold out TEST_SIZE of the rows for evaluation. The split is seeded (same seed
# as the sampling step) so that reported metrics are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=TEST_SIZE, random_state=1
)

### Select algorithm

In [None]:
# Instantiate the classifier selected via ML, using the estimator defaults.
if ML == "logreg":
    model = LogisticRegression()
elif ML == "random_forest":
    model = RandomForestClassifier()
elif ML == "d_tree":
    model = DecisionTreeClassifier()
else:
    # Fail here with a clear message instead of a NameError on `model`
    # (or a silently reused stale model) in the training cell.
    raise ValueError(
        f"Unknown ML setting {ML!r}; expected 'logreg', 'random_forest' or 'd_tree'"
    )

### Start training

In [None]:
# Fit the selected classifier on the training split.
model.fit(X_train, y_train)

In [None]:
# Score of the fitted model on the held-out test split.
test_score = model.score(X_test, y_test)
print(f"Accuracy Score: {test_score}")

### Start evaluation

In [None]:
# Predicted labels for the test rows; reused by the evaluation cells below.
predictions = model.predict(X_test)

In [None]:
# Text report of the per-class scores; plotted as a heatmap further down.
c_report = classification_report(y_true=y_test, y_pred=predictions)
# print(c_report)

In [None]:
# Fix the row/column order to genres_top. Without `labels`, confusion_matrix
# orders classes by sorted label value, while the heatmap below annotates the
# axes with genres_top (frequency order), so tick labels and cells would not
# line up. Passing `labels` also guarantees one row/column per selected genre
# even if a genre happens to be absent from the test split.
cm = confusion_matrix(y_test, predictions, labels=genres_top)
# cm

In [None]:
# Summary metrics on the test split; precision, recall and F1 are averaged
# over classes with average='weighted'.
weighted_precision = precision_score(y_test, predictions, average='weighted')
print(f"Precision: {weighted_precision}")
weighted_recall = recall_score(y_test, predictions, average='weighted')
print(f"Recall: {weighted_recall}")
weighted_f1 = f1_score(y_test, predictions, average='weighted')
print(f"F1-score: {weighted_f1}")
overall_accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {overall_accuracy}")

In [None]:
def plot_classification_report(cr, title="", with_avg_total=False, cmap=plt.cm.Blues):
    """Plot the per-class scores of a text classification report as a heatmap.

    Adapted from https://stackoverflow.com/a/31689645

    Parameters
    ----------
    cr : str
        Text report laid out as: a header line, a blank line, one row per
        class (``name precision recall f1-score support``), a blank line and
        then the summary rows. Class names must not contain whitespace
        because rows are split on whitespace.
    title : str
        Title shown above the plot.
    with_avg_total : bool
        If True, the last summary row of the report is appended to the plot
        as an extra row labelled ``avg/total``.
    cmap : matplotlib colormap
        Colormap used for the heatmap.
    """
    lines = cr.split('\n')

    classes = []
    plot_mat = []
    # Class rows start after the header and the blank line below it and run up
    # to the next blank line. Stopping at that blank line (rather than cutting
    # a fixed number of trailing lines) keeps the parser independent of how
    # many summary rows follow.
    for line in lines[2:]:
        tokens = line.split()
        if not tokens:
            break
        classes.append(tokens[0])
        # Drop the class name (first token) and the support count (last token).
        plot_mat.append([float(value) for value in tokens[1:-1]])

    if with_avg_total:
        # The report ends with a newline, so the last *non-blank* line is the
        # final summary row. Its label may consist of several tokens (e.g.
        # "weighted avg"), hence the three scores are taken relative to the
        # end of the row: precision, recall, f1-score, then support.
        summary_tokens = [line for line in lines if line.strip()][-1].split()
        classes.append('avg/total')
        plot_mat.append([float(value) for value in summary_tokens[-4:-1]])

    plt.figure(figsize=(10, 6))
    plt.imshow(plot_mat, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.xticks(np.arange(3), ['Precision', 'Recall', 'F1-score'], rotation=90)
    plt.yticks(np.arange(len(classes)), classes)
    plt.tight_layout()
    plt.ylabel('Genres')

# Render the classification report computed above as a heatmap.
plot_classification_report(c_report)

In [None]:
# consider relative values in confusion matrix: divide every row by its row
# total, so each cell is the share of that row's samples assigned to the column
cm_df = pd.DataFrame(cm)
row_totals = cm_df.sum(axis='columns')
cm_relative = cm_df.div(row_totals, axis="index")
cm_relative.to_csv('./data/confusion_matrix_accuracy.csv', index=False)

In [None]:
plt.figure(figsize=(24,15))
ax = sns.heatmap(cm_relative, annot=True, xticklabels=genres_top, yticklabels=genres_top, fmt=".2f", cmap="Greens")
# Pin the y-limits to the full cell range (inverted axis: first row on top).
# matplotlib 3.1.1 clipped the first and last heatmap row by half a cell; the
# relative +/-0.5 correction for that adds empty half-cell margins on versions
# without the bug, whereas explicit limits are correct on all versions.
ax.set_ylim(cm_relative.shape[0], 0)