### Import Libraries

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from imblearn.over_sampling import SMOTE


### Load the data and create train and test sets

In [20]:
# Read trans_cookies.csv into a DataFrame; the path is relative, so it is
# resolved against the notebook's current working directory.
data = pd.read_csv('trans_cookies.csv')

In [21]:
# Preview the first rows to sanity-check that the file loaded as expected.
data.head()

Unnamed: 0,sugar to flour ratio,sugar index,bake temp,chill time,calories,density,pH,grams baking soda,bake time,weight,chocolate,nuts,butter type_melted,quality
0,-0.464475,0.875168,-0.941445,-0.894671,0.367978,-0.333415,-0.74221,-0.61246,1.337694,0.310813,-1.44426,-0.558017,0.566112,8
1,-0.604563,-0.451709,-0.083989,0.20644,-0.042954,-0.122242,-0.370369,-0.334466,-1.76002,-0.772054,-1.44426,-0.558017,0.566112,7
2,-0.954783,-0.751326,-0.707593,0.148487,-0.16802,-2.448553,-0.060502,2.097979,2.928411,-1.93227,0.692396,1.792059,0.566112,9
3,-0.954783,1.089181,-0.200915,0.612112,0.153579,0.562368,-0.494316,-1.237946,-0.001858,-0.849402,0.692396,-0.558017,0.566112,7
4,-0.534519,-0.64432,0.890393,-1.41625,-1.472281,0.93703,-0.804184,0.29102,-0.9228,2.08981,0.692396,1.792059,-1.766434,5


In [22]:
# List the column labels; 'quality' is the one used as the target further down.
data.columns

Index(['sugar to flour ratio', 'sugar index', 'bake temp', 'chill time',
       'calories', 'density', 'pH', 'grams baking soda', 'bake time', 'weight',
       'chocolate', 'nuts', 'butter type_melted', 'quality'],
      dtype='object')

In [23]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(5138, 14)

In [26]:
# Separate the feature columns from the target label.
X = data.drop(columns='quality')
y = data['quality']

# Hold out 20% of the rows for testing. stratify=y keeps the proportion of each
# quality level the same in both splits; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Oversample with SMOTE on the TRAINING split only, so the test split keeps its
# original class distribution. k_neighbors=3 is lower than the library default.
# NOTE(review): presumably lowered because the rarest quality levels have only a
# handful of rows - confirm against the class counts.
sm = SMOTE(k_neighbors=3, random_state=2)

# fit_resample() is the supported API: fit_sample() was deprecated and has been
# removed from current imbalanced-learn releases. y_train is passed as-is because
# Series.ravel() is itself deprecated in recent pandas and the resampler accepts
# a 1-D Series directly.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

## We try KNeighborsClassifier

In [41]:
# Hyperparameters handed to KNeighborsClassifier in the next cell.
# Each note summarises the scikit-learn documentation for that argument.

# Number of neighbours consulted for each query point (library default: 5).
n_neighbors = 1

# How neighbours vote:
#   'uniform'  - every neighbour counts equally (library default)
#   'distance' - votes weighted by inverse distance, so closer neighbours matter more
#   callable   - receives an array of distances, returns same-shaped weights
weights = 'uniform'

# Neighbour-search structure: 'ball_tree', 'kd_tree', 'brute', or 'auto'
# ('auto' picks one based on the data passed to fit; sparse input always
# falls back to brute force regardless of this setting).
algorithm = 'auto'

# Leaf size forwarded to BallTree / KDTree (library default: 30). Affects
# build/query speed and memory use; the best value is problem-dependent.
leaf_size = 30

# Power of the Minkowski metric: 1 -> Manhattan (l1), 2 -> Euclidean (l2),
# any other p -> general l_p distance (library default: 2).
p = 2

# Distance metric; 'minkowski' combined with p=2 is plain Euclidean distance.
metric = 'minkowski'

# Extra keyword arguments for the metric function (none needed here).
metric_params = None

# Parallel jobs for the neighbour search: None means 1 (unless inside a
# joblib.parallel_backend context), -1 means all processors. Does not affect fit.
n_jobs = None


In [42]:
# Build the classifier from the hyperparameter variables set in the cell above.
KN = KNeighborsClassifier(
    n_neighbors=n_neighbors,
    weights=weights,
    algorithm=algorithm,
    leaf_size=leaf_size,
    p=p,
    metric=metric,
    metric_params=metric_params,
    n_jobs=n_jobs,
)

# Train on the SMOTE-resampled training split. fit() returns the estimator,
# so leaving it as the last expression displays the fitted configuration.
KN.fit(X_train_res, y_train_res)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')

In [43]:
# Predict quality labels for the held-out test rows (X_test was not resampled).
y_pred = KN.predict(X_test)

### Evaluate using metrics

In [45]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix

In [46]:
# Emit the per-class report, the overall accuracy and the confusion matrix
# (rows = true label, columns = predicted label), each separated by a '---' line.
print(
    classification_report(y_test, y_pred),
    "---",
    accuracy_score(y_test, y_pred),
    '---',
    confusion_matrix(y_test, y_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.17      0.22      0.19         9
           5       0.72      0.65      0.68       112
           6       0.50      0.57      0.53       121
           7       0.63      0.58      0.61       262
           8       0.69      0.66      0.67       354
           9       0.50      0.54      0.52       141
          10       0.26      0.41      0.31        27
          11       0.00      0.00      0.00         1

   micro avg       0.60      0.60      0.60      1028
   macro avg       0.39      0.40      0.39      1028
weighted avg       0.61      0.60      0.60      1028

---
0.5992217898832685
---
[[  0   0   1   0   0   0   0   0   0]
 [  1   2   4   2   0   0   0   0   0]
 [  0   5  73  31   3   0   0   0   0]
 [  0   5  20  69  19   4   4   0   0]
 [  1   0   4  28 153  58  14   4   0]
 [  0   0   0   5  55 232  49  12   1]
 [  0   0   0   2   9  38  76  15 

## Tuning the hyperparameters

In [35]:
# Exhaustive grid search for the best n_neighbors of a KNN classifier

from sklearn.model_selection import GridSearchCV, StratifiedKFold

KN_best = KNeighborsClassifier()

# Candidate neighbourhood sizes: the odd values 1, 3, ..., 19
parameter_grid = {'n_neighbors': list(range(1, 20, 2))}

# 10-fold cross-validation that preserves class proportions in every fold
cross_validation = StratifiedKFold(n_splits=10)

# Grid search over the KNN estimator, scored with the stratified folds above
grid_search = GridSearchCV(
    KN_best,
    param_grid=parameter_grid,
    cv=cross_validation,
)

# NOTE(review): the search runs on the already SMOTE-resampled training data, so
# validation folds contain synthetic rows interpolated from rows that may sit in
# the training folds. The reported CV score is therefore likely optimistic
# compared with the untouched test split - confirm before trusting it.
grid_search.fit(X_train_res, y_train_res)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

Best score: 0.9205677095585353
Best parameters: {'n_neighbors': 1}


In [47]:
# Randomized search for the best KNN hyperparameter combination

from sklearn.model_selection import RandomizedSearchCV

KN_best_2 = KNeighborsClassifier()

# Search space: odd n_neighbors 1, 3, ..., 39 crossed with both weighting
# schemes, i.e. 20 x 2 = 40 possible combinations.
parameter_grid = {'n_neighbors': list(range(1, 40, 2)),
                 'weights': ['uniform', 'distance']}

# Sample 20 of the 40 combinations and score each with 5-fold cross-validation.
# random_state pins which combinations are drawn; without it every run of this
# cell evaluates a different subset and can report a different "best" setting.
n_iter_search = 20
random_search = RandomizedSearchCV(KN_best_2,
                           param_distributions=parameter_grid,
                           n_iter=n_iter_search, cv=5,
                           random_state=42)

# NOTE(review): the search runs on the already SMOTE-resampled training data, so
# validation folds contain synthetic rows interpolated from rows that may sit in
# the training folds. The reported CV score is therefore likely optimistic
# compared with the untouched test split - confirm before trusting it.
random_search.fit(X_train_res, y_train_res)
print('Best score: {}'.format(random_search.best_score_))
print('Best parameters: {}'.format(random_search.best_params_))

Best score: 0.8920254057868737
Best parameters: {'weights': 'distance', 'n_neighbors': 5}
