In [6]:
import pandas as pd
import autograd.numpy as np
from autograd import grad 
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from common import gradient_descent, plot_cost_histories

In [257]:
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier

In [258]:
# Read data
DATA_BASE_URL = "https://raw.githubusercontent.com/sql-injection/spotify_data/master/"
datasets = {
    "train": DATA_BASE_URL + "train.csv",
    "test": DATA_BASE_URL + "test.csv",
    "all": DATA_BASE_URL + "spotify.csv"
}

total_df = pd.read_csv(datasets["all"])
attribute_names = list(total_df)[:-1]
x = total_df[attribute_names].values
y = total_df["Class"].values[:, np.newaxis]


print("Attributes we are considering:", attribute_names[:2] + attribute_names[3:4] + attribute_names[5:-2])
x = np.concatenate((x[:, :2], x[:, 3:4], x[:, 5:-2]), axis=1)
original_x = np.copy(x)
original_y = np.copy(y)

Attributes we are considering: ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']


In [259]:
class ValidationResult(object):
    def __init__(self):
        self.accuracy = 0.0
        self.results = list()
        
    def __str__(self):
        errors = [(actual, predicted) for actual, predicted in self.results if actual != predicted]
        error_length = len(errors)
        total_length = len(self.results)
        return "<accuracy={accuracy}, num_incorrect={incorrect}, num_correct={correct}, errors={errors}>\n".format(
            accuracy=self.accuracy,
            incorrect=error_length,
            correct=total_length - error_length,
            errors=errors
        )


def accuracy(y_pred, y_test):
    correct = 0
    total = len(y_test)
    v = ValidationResult()
    for i in range(total):
        v.results.append((y_test[i][0], y_pred[i]))
        if y_test[i][0] == y_pred[i]:
            correct += 1
    v.accuracy = correct / total 
    return v

In [260]:
num_splits = 10
num_neighbors = 3
kf = KFold(n_splits=num_splits)
cross_validations = list()

for train_index, test_index in kf.split(x):
    # Split into testing and training
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Fit
    classifier = KNeighborsClassifier(n_neighbors=num_neighbors)
    classifier.fit(x_train, y_train.ravel())
    
    # Evaluate
    y_pred = classifier.predict(x_test)
    cross_validations.append(accuracy(y_pred, y_test))

for i, v in enumerate(cross_validations):
    print("Split", i + 1)
    print(v)
    
print("Mean accuracy:", np.mean([v.accuracy for v in cross_validations]))

Split 1
<accuracy=0.24390243902439024, num_incorrect=31, num_correct=10, errors=[('edm', 'country'), ('edm', 'pop'), ('edm', 'country'), ('edm', 'pop'), ('edm', 'country'), ('edm', 'country'), ('edm', 'country'), ('edm', 'country'), ('edm', 'country'), ('edm', 'country'), ('edm', 'country'), ('edm', 'country'), ('edm', 'country'), ('edm', 'country'), ('edm', 'country'), ('edm', 'jazz'), ('edm', 'country'), ('edm', 'country'), ('edm', 'country'), ('edm', 'country'), ('edm', 'rock'), ('edm', 'country'), ('edm', 'country'), ('edm', 'pop'), ('edm', 'country'), ('edm', 'country'), ('edm', 'country'), ('edm', 'pop'), ('edm', 'rock'), ('edm', 'country'), ('edm', 'country')]>

Split 2
<accuracy=0.14634146341463414, num_incorrect=35, num_correct=6, errors=[('edm', 'country'), ('edm', 'rock'), ('edm', 'pop'), ('edm', 'country'), ('edm', 'country'), ('edm', 'country'), ('hiphop', 'country'), ('hiphop', 'country'), ('hiphop', 'country'), ('hiphop', 'pop'), ('hiphop', 'country'), ('hiphop', 'countr