# Wine Quality Classification

### 1. Importing the necessary libraries:

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import model_selection ,datasets, svm, metrics

### 2. Loading the wine dataset:

In [3]:
x, y = datasets.load_wine(return_X_y=True) #splitting the data into x (features) and y (targets).

### 3. Performing K-fold cross-validation:

In [7]:
#Making a function that splits the data into train and test splits using indices.
def kfold_train_test_split(x, y, train_indices, test_indices):
  return x[train_indices], x[test_indices], y[train_indices], y[test_indices]
  
scores = [] #Defining an empty array to store the model scores.

In [11]:
s_s_k_fold = model_selection.StratifiedShuffleSplit(n_splits=10, random_state=42) #spliting the data into 10 splits, with 9 of them being for training and 1 for testing for each fold.

In [12]:
#looping through the splitted data that was splitted using the StratisfiedShuffleSplit().

for train_indices, test_indices in s_s_k_fold.split(x, y): # y is used  for stratification. 
  
  #Spliting the data using the defined function (kfold_train_test_split).
  x_train, x_test, y_train, y_test = kfold_train_test_split(x, y, train_indices, test_indices)

### 4. Defining the classification model:

In [48]:
svc = svm.SVC(random_state=42) #Using support vector machines classifier (SVC), as it is a classification problem.
svc.fit(x_train, y_train) #Training the model

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=42,
    shrinking=True, tol=0.001, verbose=False)

### 5. Training the model and making predictions without tuning the hyperparameters:

In [49]:
y_pred = svc.predict(x_test) #Making predictions on the test set

In [50]:
accuracy = metrics.accuracy_score(y_test, y_pred) #Calculating the accuracy score, (recall and precision metrics will not be calculated as this a multiclass problem).

scores_dict = {"accuracy": accuracy} #Making a dictionary and fit the accuracy scores.
scores.append(scores_dict) #Adding the scores dictionary into the previously defined empty array.

In [51]:
scores_df = pd.DataFrame(scores) #Converting the array into a dataframe for an organized display.
scores_df

Unnamed: 0,accuracy,Accuracy
0,0.555556,
1,0.555556,
2,,0.555556
3,,0.555556
4,0.555556,
5,0.555556,
6,0.944444,
7,0.555556,


In [52]:
print("Model's mean accuracy value:", scores_df.mean()) #Printing the mean value for the accuracies of the 10 splits.

Model's mean accuracy value: accuracy    0.620370
Accuracy    0.555556
dtype: float64


### 6. Performing hyperparameters tuning:

In [127]:
#Defining the hyperparameters to be tuned and define a range for the values.
h_params = {'kernel': ['sigmoid','linear', 'rbf'], 'C': [0.1,10,100], 'gamma': [0.001, 0.01, 0.1]}

In [128]:
#Using the automatic GridSearch() function to test every parameter and give us the best ones.
auto_gs = model_selection.GridSearchCV(svc, h_params, cv=s_s_k_fold, refit="accuracy", verbose=2) 
#verbose is set to 2 to display the training logs, we can also set it to 1 for just a describtion of the training and 0 for just the results.

In [129]:
#Fitting the grid search operation onto our data
auto_gs.fit(x, y)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=42, test_size=None,
            train_size=None),
             error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=42, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 10, 100], 'gamma': [0.001, 0.01, 0.1],
                         'kernel': ['sigmoid', 'linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit='accuracy',
             return_train_score=False, scoring=None, verbose=0)

In [130]:
print("Best Hyperparameters:", auto_gs.best_params_) #To display the best values of the hyperparameters.
print("Best Score (Accuracy): ", auto_gs.best_score_) #To display the accuracy associated with the best parameters.

Best Hyperparameters: {'C': 10, 'gamma': 0.001, 'kernel': 'linear'}
Best Score (Accuracy):  0.95
