In [29]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor, DistanceMetric
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV

In [111]:
df=pd.read_csv("./csv/fullwine.csv")

In [112]:
((df.isna().sum()*100)/df.shape[0]).to_frame()

Unnamed: 0,0
fixed acidity,0.0
volatile acidity,0.0
citric acid,0.0
residual sugar,0.0
chlorides,0.0
free sulfur dioxide,0.0
total sulfur dioxide,0.0
density,0.0
pH,0.0
sulphates,0.0


In [113]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine
0,6.8,0.29,0.22,3.4,0.035,40.0,122.0,0.99024,3.09,0.47,12.3,6,white
1,6.7,0.24,0.41,9.4,0.04,49.0,166.0,0.9954,3.12,0.61,9.9,6,white
2,6.6,0.88,0.04,2.2,0.066,12.0,20.0,0.99636,3.53,0.56,9.9,5,red
3,7.1,0.67,0.0,2.3,0.083,18.0,27.0,0.99768,3.44,0.54,9.4,5,red
4,6.0,0.28,0.27,4.1,0.046,50.0,147.0,0.99126,3.27,0.56,11.6,6,white


## K-Vecinos (clasificación)

In [114]:
clas=df.copy().drop(columns="quality")
tgt = "wine"
ls_pred = [x for x in clas.columns if x not in [tgt]]

In [115]:
X=clas[ls_pred]
y=clas[tgt]

In [116]:
knn = KNeighborsClassifier()

In [117]:
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [118]:
ls_res = cross_val_score(estimator = knn, X=X, y=y, cv=4, n_jobs=-1, scoring="roc_auc")

In [119]:
np.mean(ls_res), np.std(ls_res)

(0.9642970604104213, 0.0013833204332616607)

In [120]:
param_grid = {"n_neighbors": range(100),
              "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
              "metric": ["euclidean", "manhattan", "chebyshev", "minkowski", "seuclidean", "mahalanobis", ]}

In [121]:
rs = RandomizedSearchCV(cv=4, error_score=-1000, estimator=knn, n_jobs=-1, scoring="roc_auc", param_distributions=param_grid, verbose=True, n_iter=20)

In [122]:
%%time
rs.fit(X, y)

Fitting 4 folds for each of 20 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.3s


CPU times: user 535 ms, sys: 66.1 ms, total: 601 ms
Wall time: 8.39 s


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    8.0s finished


RandomizedSearchCV(cv=4, error_score=-1000,
                   estimator=KNeighborsClassifier(algorithm='auto',
                                                  leaf_size=30,
                                                  metric='minkowski',
                                                  metric_params=None,
                                                  n_jobs=None, n_neighbors=5,
                                                  p=2, weights='uniform'),
                   iid='warn', n_iter=20, n_jobs=-1,
                   param_distributions={'algorithm': ['auto', 'ball_tree',
                                                      'kd_tree', 'brute'],
                                        'metric': ['euclidean', 'manhattan',
                                                   'chebyshev', 'minkowski',
                                                   'seuclidean',
                                                   'mahalanobis'],
                                        'n

In [123]:
rs.best_score_

0.9973517198370159

In [124]:
rs.best_estimator_

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='mahalanobis',
                     metric_params=None, n_jobs=None, n_neighbors=73, p=2,
                     weights='uniform')

## K-Vecinos (Regresor)

In [125]:
df2=df.copy().drop(columns=["wine"])
tgt = "quality"
ls_pred = [x for x in df2.columns if x not in [tgt]]

In [126]:
X=df2[ls_pred]
y=df2[tgt]

In [127]:
knn = KNeighborsRegressor()

In [142]:
KNeighborsRegressor?

In [128]:
knn.fit(X, y)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [129]:
ls_res = cross_val_score(estimator = knn, X=X, y=y, cv=4, n_jobs=-1, scoring="r2")

In [130]:
np.mean(ls_res), np.std(ls_res)

(0.15391777792856526, 0.016830750212744914)

In [136]:
param_grid = {"n_neighbors": range(100),
              "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
              "weights":["uniform","distance"],
              "metric": ["euclidean", "manhattan", "chebyshev", "minkowski", "seuclidean", "mahalanobis", ]}

In [137]:
rs = RandomizedSearchCV(cv=4, error_score=-1000, estimator=knn, n_jobs=-1, scoring="r2", param_distributions=param_grid, verbose=True, n_iter=100)

In [138]:
%%time
rs.fit(X, y)

Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 308 tasks      | elapsed:   20.6s


CPU times: user 1.61 s, sys: 154 ms, total: 1.77 s
Wall time: 26.4 s


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:   26.4s finished


RandomizedSearchCV(cv=4, error_score=-1000,
                   estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                                 metric='minkowski',
                                                 metric_params=None,
                                                 n_jobs=None, n_neighbors=5,
                                                 p=2, weights='uniform'),
                   iid='warn', n_iter=100, n_jobs=-1,
                   param_distributions={'algorithm': ['auto', 'ball_tree',
                                                      'kd_tree', 'brute'],
                                        'metric': ['euclidean', 'manhattan',
                                                   'chebyshev', 'minkowski',
                                                   'seuclidean',
                                                   'mahalanobis'],
                                        'n_neighbors': range(0, 100),
                          

In [139]:
rs.best_score_

0.5029231430758611

In [140]:
rs.best_estimator_

KNeighborsRegressor(algorithm='brute', leaf_size=30, metric='seuclidean',
                    metric_params=None, n_jobs=None, n_neighbors=17, p=2,
                    weights='distance')