**Table of contents**<a id='toc0_'></a>    
- [Imports](#toc1_)    
- [Read Data](#toc2_)    
- [Data Selection](#toc3_)    
  - [Drop trash columns](#toc3_1_)    
  - [Drop rows with missing values](#toc3_2_)    
  - [Selecting 50% of the data](#toc3_3_)    
- [Removing Categorical Columns](#toc4_)    
- [Split Train and Test Data](#toc5_)    
- [Data Cleaning](#toc6_)    
  - [Impute missing numeric data](#toc6_1_)    
- [Data Normalization](#toc7_)    
- [Model training](#toc8_)    
  - [KNN](#toc8_1_)    
    - [Best model](#toc8_1_1_)    
  - [LVQ](#toc8_2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

----

# <a id='toc1_'></a>[Imports](#toc0_)

In [8]:
from datasets import load_dataset
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, classification_report
import numpy as np
import matplotlib.pyplot as plt
import requests
import io
    
plt.rcParams['figure.figsize'] = [16, 10]


pd.options.display.max_colwidth = 1000
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 200

# <a id='toc2_'></a>[Read Data](#toc0_)

In [9]:
dataset = load_dataset("maharshipandya/spotify-tracks-dataset")
#df = pd.read_csv("C:/Users/bito-/OneDrive - UFPE/Área de Trabalho/Bito/Exatas/AM/AM 2024.1/Atividades Leandro/Emerson/atividade_01_10042024/dados/dataset.csv")

df = dataset['train'].to_pandas()

Found cached dataset csv (file://C:/Users/pichau/.cache/huggingface/datasets/maharshipandya___csv/maharshipandya--spotify-tracks-dataset-ff79c8444e5ec4c3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


NotImplementedError: Loading a dataset cached in a LocalFileSystem is not supported.

In [3]:
df.shape
print(df.shape)
print('\n')
df.info()
print('\n')
df.head()

(114000, 21)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liven

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Soundtrack),Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [4]:
df.track_genre.value_counts(normalize=True) * 100

track_genre
acoustic             0.877193
punk-rock            0.877193
progressive-house    0.877193
power-pop            0.877193
pop                  0.877193
pop-film             0.877193
piano                0.877193
party                0.877193
pagode               0.877193
opera                0.877193
new-age              0.877193
mpb                  0.877193
minimal-techno       0.877193
metalcore            0.877193
metal                0.877193
mandopop             0.877193
malay                0.877193
latino               0.877193
latin                0.877193
kids                 0.877193
k-pop                0.877193
jazz                 0.877193
j-rock               0.877193
j-pop                0.877193
j-idol               0.877193
j-dance              0.877193
iranian              0.877193
psych-rock           0.877193
punk                 0.877193
afrobeat             0.877193
r-n-b                0.877193
turkish              0.877193
trip-hop             0.87719

# <a id='toc3_'></a>[Data Selection](#toc0_)

## <a id='toc3_1_'></a>[Drop trash columns](#toc0_)

In [5]:
df = df.drop(df.columns[0], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   track_id          114000 non-null  object 
 1   artists           113999 non-null  object 
 2   album_name        113999 non-null  object 
 3   track_name        113999 non-null  object 
 4   popularity        114000 non-null  int64  
 5   duration_ms       114000 non-null  int64  
 6   explicit          114000 non-null  bool   
 7   danceability      114000 non-null  float64
 8   energy            114000 non-null  float64
 9   key               114000 non-null  int64  
 10  loudness          114000 non-null  float64
 11  mode              114000 non-null  int64  
 12  speechiness       114000 non-null  float64
 13  acousticness      114000 non-null  float64
 14  instrumentalness  114000 non-null  float64
 15  liveness          114000 non-null  float64
 16  valence           11

## <a id='toc3_2_'></a>[Drop rows with missing values](#toc0_)

In [6]:
df.dropna(inplace=True, axis=0, how='any')

## <a id='toc3_3_'></a>[Selecting 50% of the data](#toc0_)

In [7]:
df = df.sample(frac=0.5, replace=False)
df.shape

(57000, 20)

# <a id='toc4_'></a>[Removing Categorical Columns](#toc0_)

In [8]:
categorical_columns = ['track_id', 'artists', 'album_name', 'track_name','key', 'mode','danceability']
df = df.drop(categorical_columns, axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 57000 entries, 78611 to 13751
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   popularity        57000 non-null  int64  
 1   duration_ms       57000 non-null  int64  
 2   explicit          57000 non-null  bool   
 3   energy            57000 non-null  float64
 4   loudness          57000 non-null  float64
 5   speechiness       57000 non-null  float64
 6   acousticness      57000 non-null  float64
 7   instrumentalness  57000 non-null  float64
 8   liveness          57000 non-null  float64
 9   valence           57000 non-null  float64
 10  tempo             57000 non-null  float64
 11  time_signature    57000 non-null  int64  
 12  track_genre       57000 non-null  object 
dtypes: bool(1), float64(8), int64(3), object(1)
memory usage: 5.7+ MB


# <a id='toc5_'></a>[Split Train and Test Data](#toc0_)

In [10]:
def train_validation_test_split(df, target_column, validation_size=0.1, test_size=0.1, random_state=42):
    df_train, df_test = train_test_split(df, test_size=test_size, random_state=random_state, stratify=df[target_column])
    
    df_train, df_validation = train_test_split(df_train,
                                               test_size=validation_size/(1 - test_size),
                                               random_state=random_state,
                                               stratify=df_train[target_column])
    return df_train, df_validation, df_test  

In [11]:
df_train, df_validation, df_test = train_validation_test_split(df, "track_genre",0.2, 0.2)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 57000 entries, 78611 to 13751
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   popularity        57000 non-null  int64  
 1   duration_ms       57000 non-null  int64  
 2   explicit          57000 non-null  bool   
 3   energy            57000 non-null  float64
 4   loudness          57000 non-null  float64
 5   speechiness       57000 non-null  float64
 6   acousticness      57000 non-null  float64
 7   instrumentalness  57000 non-null  float64
 8   liveness          57000 non-null  float64
 9   valence           57000 non-null  float64
 10  tempo             57000 non-null  float64
 11  time_signature    57000 non-null  int64  
 12  track_genre       57000 non-null  object 
dtypes: bool(1), float64(8), int64(3), object(1)
memory usage: 5.7+ MB


# <a id='toc6_'></a>[Data Cleaning](#toc0_)

## <a id='toc6_1_'></a>[Impute missing numeric data](#toc0_)

In [20]:
numeric_columns = df_train.select_dtypes(include=['number']).columns

numeric_imputer = SimpleImputer(strategy='median')
numeric_imputer.fit(df_train[numeric_columns])

df_train[numeric_columns] = numeric_imputer.transform(df_train[numeric_columns])
df_validation[numeric_columns] = numeric_imputer.transform(df_validation[numeric_columns])
df_test[numeric_columns] = numeric_imputer.transform(df_test[numeric_columns])

# <a id='toc7_'></a>[Data Normalization](#toc0_)

In [21]:
normalizer = MinMaxScaler()

normalizer.fit(df_train[numeric_columns])

df_train[numeric_columns] = normalizer.transform(df_train[numeric_columns])
df_validation[numeric_columns] = normalizer.transform(df_validation[numeric_columns])
df_test[numeric_columns] = normalizer.transform(df_test[numeric_columns])

# <a id='toc8_'></a>[Model training](#toc0_)

## <a id='toc8_1_'></a>[KNN](#toc0_)

In [41]:
knn = KNeighborsClassifier()

parameters = {
    'n_neighbors': [13,17,21,25,29,33,37,41,45],#np.arange(10,50,1),
    'metric': ['euclidean', 'manhattan']#, 'chebyshev' 'minkowski']
}

evaluation = {}

eval_metrics = {
    'precision':'precision_macro', 
    'recall':'recall_macro', 
    'f1':'f1_macro', 
    'accuracy':'accuracy'
}

for k in parameters['n_neighbors']:
  for m in parameters['metric']:
    
    knn = KNeighborsClassifier(n_neighbors=k, metric=m, n_jobs=-1)
    scores = cross_validate(knn, df_train[numeric_columns], df_train['track_genre'], cv=5, scoring=eval_metrics, n_jobs=-1)
    
    knn.fit(df_train[numeric_columns], df_train['track_genre'])
    y_pred_train = knn.predict(df_train[numeric_columns])
    y_pred_valid = knn.predict(df_validation[numeric_columns])
    
    evaluation[(k, m)] = {
        'cv_precision': scores['test_precision'].mean(),
        'cv_recall': scores['test_recall'].mean(),
        'cv_f1': scores['test_f1'].mean(),
        'cv_accuracy': scores['test_accuracy'].mean(),
        'train_f1': f1_score(df_train['track_genre'], y_pred_train, average='macro'),
        'train_recall': recall_score(df_train['track_genre'], y_pred_train, average='macro'),
        'train_precision': precision_score(df_train['track_genre'], y_pred_train, average='macro'),
        'train_accuracy': accuracy_score(df_train['track_genre'], y_pred_train),
        'validation_f1': f1_score(df_validation['track_genre'], y_pred_valid, average='macro'),
        'validation_recall': recall_score(df_validation['track_genre'], y_pred_valid, average='macro'),
        'validation_precision': precision_score(df_validation['track_genre'], y_pred_valid, average='macro'),
        'validation_accuracy': accuracy_score(df_validation['track_genre'], y_pred_valid)
    }
    
    print(k, m,accuracy_score(df_validation['track_genre'], y_pred_valid))

13 euclidean 0.1792982456140351
13 manhattan 0.2013157894736842
17 euclidean 0.1782456140350877
17 manhattan 0.20350877192982456
21 euclidean 0.1812280701754386
21 manhattan 0.20350877192982456
25 euclidean 0.18078947368421053
25 manhattan 0.20192982456140351
29 euclidean 0.18043859649122806
29 manhattan 0.19921052631578948
33 euclidean 0.17842105263157895
33 manhattan 0.20271929824561402
37 euclidean 0.17868421052631578
37 manhattan 0.20280701754385966
41 euclidean 0.17982456140350878
41 manhattan 0.20359649122807016
45 euclidean 0.17850877192982456
45 manhattan 0.2036842105263158


### <a id='toc8_1_1_'></a>[Best model](#toc0_)

In [43]:
print(k, m,evaluation[(k,m)])

best_knn = KNeighborsClassifier(n_neighbors=45, metric='manhattan', n_jobs=-1)

best_knn.fit(df_train[numeric_columns], df_train['track_genre'])

print(classification_report(df_test['track_genre'], best_knn.predict(df_test[numeric_columns])))

45 manhattan {'cv_precision': 0.19161525965470105, 'cv_recall': 0.20199923711840534, 'cv_f1': 0.18488288371108258, 'cv_accuracy': 0.20190058479532164, 'train_f1': 0.24698349069443096, 'train_recall': 0.2637649986801499, 'train_precision': 0.2615227656614058, 'train_accuracy': 0.26359649122807016, 'validation_f1': 0.18803048676768094, 'validation_recall': 0.20412055878098662, 'validation_precision': 0.1928877606201788, 'validation_accuracy': 0.2036842105263158}
                   precision    recall  f1-score   support

         acoustic       0.15      0.25      0.19       105
         afrobeat       0.22      0.16      0.19       105
         alt-rock       0.06      0.05      0.05       101
      alternative       0.12      0.11      0.11       102
          ambient       0.20      0.21      0.20        99
            anime       0.08      0.07      0.07       100
      black-metal       0.29      0.33      0.31       105
        bluegrass       0.23      0.29      0.26        95
   

In [38]:
best_knn = KNeighborsClassifier(n_neighbors=45, metric='manhattan', n_jobs=-1)

best_knn.fit(df_train[numeric_columns], df_train['track_genre'])

print(classification_report(df_test['track_genre'], best_knn.predict(df_test[numeric_columns])))

                   precision    recall  f1-score   support

         acoustic       0.15      0.25      0.19       105
         afrobeat       0.22      0.16      0.19       105
         alt-rock       0.06      0.05      0.05       101
      alternative       0.12      0.11      0.11       102
          ambient       0.20      0.21      0.20        99
            anime       0.08      0.07      0.07       100
      black-metal       0.29      0.33      0.31       105
        bluegrass       0.23      0.29      0.26        95
            blues       0.07      0.04      0.05       104
           brazil       0.05      0.05      0.05       104
        breakbeat       0.28      0.21      0.24        99
          british       0.02      0.01      0.01       100
         cantopop       0.10      0.20      0.13        90
    chicago-house       0.28      0.42      0.34       101
         children       0.25      0.18      0.21        95
            chill       0.14      0.15      0.14       

## <a id='toc8_2_'></a>[LVQ](#toc0_)

In [56]:
from random import randrange
from math import sqrt

# Convert string column to float
def str_column_to_float(dataset, column):
	for row in dataset:
		row[column] = float(row[column].strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
	class_values = [row[column] for row in dataset]
	unique = set(class_values)
	lookup = dict()
	for i, value in enumerate(unique):
		lookup[value] = i
	for row in dataset:
		row[column] = lookup[row[column]]
	return lookup

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	dataset_split = list()
	dataset_copy = list(dataset)
	fold_size = int(len(dataset) / n_folds)
	for i in range(n_folds):
		fold = list()
		while len(fold) < fold_size:
			index = randrange(len(dataset_copy))
			fold.append(dataset_copy.pop(index))
		dataset_split.append(fold)
	return dataset_split

def accuracy_metric(actual, predicted):
	correct = 0
	for i in range(len(actual)):
		if actual[i] == predicted[i]:
			correct += 1
	return correct / float(len(actual)) * 100.0

def calculate_metrics_per_class(actual, predicted, class_label):
    TP = 0  # Verdadeiros Positivos
    FP = 0  # Falsos Positivos
    FN = 0  # Falsos Negativos
    for i in range(len(actual)):
        if actual[i] == predicted[i] == class_label:
            TP += 1
        elif predicted[i] == class_label and actual[i] != class_label:
            FP += 1
        elif actual[i] == class_label and predicted[i] != class_label:
            FN += 1
    return TP, FP, FN

def macro_recall(actual, predicted):
    unique_classes = set(actual)
    recalls = []
    for class_label in unique_classes:
        TP, _, FN = calculate_metrics_per_class(actual, predicted, class_label)
        recall = TP / (TP + FN) if (TP + FN) else 0
        recalls.append(recall)
    return sum(recalls) / len(recalls) * 100.0

def macro_precision(actual, predicted):
    unique_classes = set(actual)
    precisions = []
    for class_label in unique_classes:
        TP, FP, _ = calculate_metrics_per_class(actual, predicted, class_label)
        precision = TP / (TP + FP) if (TP + FP) else 0
        precisions.append(precision)
    return sum(precisions) / len(precisions) * 100.0

def macro_f1_score(actual, predicted):
    precision = macro_precision(actual, predicted) / 100.0
    recall = macro_recall(actual, predicted) / 100.0
    return 2 * (precision * recall) / (precision + recall) * 100.0 if (precision + recall) else 0


# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, dataset_validation, algorithm, n_folds, *args):
	folds = cross_validation_split(dataset, n_folds)
	scores_cv = list()
	scores_val = list()
	
	scores_cv_recall = list()
	scores_cv_precision = list()
	scores_cv_f1 = list()

	scores_val_recall = list()
	scores_val_precision = list()
	scores_val_f1 = list()


	actual_validation = [row[-1] for row in dataset_validation]
	for fold in folds:
		train_set = list(folds)
		train_set.remove(fold)
		train_set = sum(train_set, [])
		test_set = list()
		for row in fold:
			row_copy = list(row)
			test_set.append(row_copy)
			row_copy[-1] = None
		predicted_test, predicted_validation = algorithm(train_set, test_set, dataset_validation, *args)
		actual = [row[-1] for row in fold]
  
		accuracy_test = accuracy_metric(actual, predicted_test)
		scores_cv.append(accuracy_test)
  
		scores_cv_recall.append(macro_recall(actual, predicted_test))
		scores_cv_precision.append(macro_precision(actual, predicted_test))
		scores_cv_f1.append(macro_f1_score(actual, predicted_test))
  
		accuracy_val = accuracy_metric(actual_validation, predicted_validation)
		scores_val.append(accuracy_val)
		
		scores_val_recall.append(macro_recall(actual, predicted_validation))
		scores_val_precision.append(macro_precision(actual, predicted_validation))
		scores_val_f1.append(macro_f1_score(actual, predicted_validation))

	return scores_cv, scores_val, scores_cv_recall, scores_val_recall, scores_cv_precision, scores_val_precision, scores_cv_f1, scores_val_f1 

# calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
	distance = 0.0
	for i in range(len(row1)-1):
		distance += (row1[i] - row2[i])**2
	return sqrt(distance)

# Locate the best matching unit
def get_best_matching_unit(codebooks, test_row):
	distances = list()
	for codebook in codebooks:
		dist = euclidean_distance(codebook, test_row)
		distances.append((codebook, dist))
	distances.sort(key=lambda tup: tup[1])
	return distances[0][0]

# Make a prediction with codebook vectors
def predict(codebooks, test_row):
	bmu = get_best_matching_unit(codebooks, test_row)
	return bmu[-1]

# Create a random codebook vector
def random_codebook(train):
	n_records = len(train)
	n_features = len(train[0])
	codebook = [train[randrange(n_records)][i] for i in range(n_features)]
	return codebook

# Train a set of codebook vectors
def train_codebooks(train, n_codebooks, lrate, epochs):
	codebooks = [random_codebook(train) for i in range(n_codebooks)]
	for epoch in range(epochs):
		rate = lrate * (1.0-(epoch/float(epochs)))
		for row in train:
			bmu = get_best_matching_unit(codebooks, row)
			for i in range(len(row)-1):
				error = row[i] - bmu[i]
				if bmu[-1] == row[-1]:
					bmu[i] += rate * error
				else:
					bmu[i] -= rate * error
	return codebooks

# LVQ Algorithm
def learning_vector_quantization(train, test, validation, n_codebooks, lrate, epochs):
	codebooks = train_codebooks(train, n_codebooks, lrate, epochs)
	predictions_test = list()
	predictions_validation = list()

	for row in test:
		output = predict(codebooks, row)
		predictions_test.append(output)
	for row in validation:
		output = predict(codebooks, row)
		predictions_validation.append(output)
  
	return predictions_test, predictions_validation



In [45]:
le = LabelEncoder()
df_train_2 = df_train.copy()
df_train_2['track_genre'] = le.fit_transform(df_train_2['track_genre'])
df_train_2['explicit'] = df_train_2['explicit'].astype("int")

df_validation_2 = df_validation.copy()
df_validation_2['track_genre'] = le.transform(df_validation_2['track_genre'])
df_validation_2['explicit'] = df_validation_2['explicit'].astype("int")

df_test_2 = df_test.copy()
df_test_2['track_genre'] = le.transform(df_test_2['track_genre'])
df_test_2['explicit'] = df_test_2['explicit'].astype("int")


In [64]:
# Test LVQ on Spotify Dataset
# load and prepare data
n_folds = 5
n_epochs = 20

dataset_train = df_train_2.values.tolist()
dataset_validation = df_validation_2.values.tolist()


parameters = {
    'learn_rate': [0.01,0.2,0.5],
    'n_codebooks': [40,50,60]
}

evaluation = {}
for lr in parameters['learn_rate']:
  for n in parameters['n_codebooks']:

    scores_cv, scores_vali, scores_cv_recall, scores_val_recall, scores_cv_precision, scores_val_precision, scores_cv_f1, scores_val_f1 = evaluate_algorithm(
      dataset_train, dataset_validation, learning_vector_quantization, n_folds, n, lr, n_epochs)

    evaluation[(lr, n)] = {
        'cv_accuracy': sum(scores_cv)/float(len(scores_cv)),
        'validation_accuracy': sum(scores_vali)/float(len(scores_vali)),
        'cv_recall': sum(scores_cv_recall)/float(len(scores_cv_recall)),
        'validation_recall': sum(scores_val_recall)/float(len(scores_val_recall)),
        'cv_precision': sum(scores_cv_precision)/float(len(scores_cv_precision)),
        'validation_precision': sum(scores_val_precision)/float(len(scores_val_precision)),
        'cv_f1': sum(scores_cv_f1)/float(len(scores_cv_f1)),
        'validation_f1': sum(scores_val_f1)/float(len(scores_val_f1))
    }
    
    print(lr, n)
  print(evaluation)


0.01 40
0.01 50
0.01 60
{(0.01, 40): {'cv_accuracy': 0.9005847953216375, 'validation_accuracy': 0.8666666666666666, 'cv_recall': 0.8771929824561402, 'validation_recall': 0.8771929824561402, 'cv_precision': 0.007899866625628398, 'validation_precision': 0.007899866625628398, 'cv_f1': 0.01565657078121029, 'validation_f1': 0.01565657078121029}, (0.01, 50): {'cv_accuracy': 0.9005847953216375, 'validation_accuracy': 0.8736842105263157, 'cv_recall': 0.8771929824561402, 'validation_recall': 0.8771929824561402, 'cv_precision': 0.007899866625628398, 'validation_precision': 0.007899866625628398, 'cv_f1': 0.01565451444004986, 'validation_f1': 0.01565451444004986}, (0.01, 60): {'cv_accuracy': 0.9122807017543859, 'validation_accuracy': 0.8754385964912281, 'cv_recall': 0.8771929824561402, 'validation_recall': 0.8771929824561402, 'cv_precision': 0.008002462296091105, 'validation_precision': 0.008002462296091105, 'cv_f1': 0.015858577097302463, 'validation_f1': 0.015858577097302463}}


OverflowError: (34, 'Result too large')

In [55]:
dataset_train = df_train_2.values.tolist()
dataset_test = df_test_2.values.tolist()

n_folds = 5
learn_rate = 0.1
n_epochs = 20
n_codebooks = 30

scores_cv, scores_vali, scores_cv_recall, scores_val_recall, scores_cv_precision, scores_val_precision, scores_cv_f1, scores_val_f1 = evaluate_algorithm(
    dataset_train, dataset_test, learning_vector_quantization,n_folds, n_codebooks, learn_rate, n_epochs) 

OverflowError: (34, 'Result too large')