In [None]:
# Import the basic libraries
# Reference (sklearn Wine dataset): https://www.kaggle.com/code/cristianlapenta/wine-dataset-sklearn-machine-learning-project/notebook
import numpy as np
import pandas as pd
# Load the Wine dataset that ships with scikit-learn and print the description bundled with it
from sklearn import datasets

wine = datasets.load_wine()
print(wine["DESCR"])

In [None]:
# One column per feature, with the class target appended as the last column ('label')
df = pd.DataFrame(wine.data, columns=wine.feature_names).assign(label=wine.target)
df.head()

In [None]:
# Absolute number of samples per class: position i holds the count of label i
np.bincount(df.label.to_numpy())

In [None]:
# Relative frequency of each class, rounded to three decimals
class_share = df["label"].value_counts(normalize=True)
class_share.round(3)

In [None]:
# Check for null/missing values in two ways:
#  - df.info() prints the non-null count and dtype of every column
#  - missingno draws a bar chart for the same DataFrame
import missingno as msno

df.info()
msno.bar(df)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Large default figure so the annotated correlation matrix stays readable
sns.set(rc={"figure.figsize": (15, 10)})

# Correlation between the features only: 'label' (the last column) is left out
feature_corr = df.drop(columns="label").corr()
sns.heatmap(feature_corr, annot=True, cmap="YlGnBu")
plt.show()

In [None]:
# Pairwise plots of every column, coloured by class.
# corner=True keeps only the lower triangle of the grid.
pairplot_options = dict(hue="label", palette="tab10", corner=True)
sns.pairplot(df, **pairplot_options)

plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Feature matrix X and label vector y as plain NumPy arrays
y = df.label.values
X = df.drop(columns="label").values

# Min-max scale the ENTIRE dataset; this is only for the exploratory PCA plot below
minmax = MinMaxScaler()
X_sc = minmax.fit_transform(X)

# Fit a PCA that keeps every component to see how much variance each one explains
pca = PCA(n_components=None)
pca.fit(X_sc)
pc_range = np.arange(1, X_sc.shape[1] + 1)
variance_ratio = pca.explained_variance_ratio_

# Bars: variance ratio of each component; step line: cumulative variance ratio
plt.figure(figsize=(10, 7))
plt.bar(pc_range, variance_ratio)
plt.step(pc_range, np.cumsum(variance_ratio))
plt.xticks(pc_range)
plt.xlabel("Principal Components")
plt.ylabel("Variance")
_ = plt.title("Number of Principal Components vs Variance")

# The same ratios as a small text table (components numbered from 1)
print("PC\tvariance")
print("---\t--------")
for component_no, ratio in enumerate(variance_ratio, start=1):
    print("PC_{}:\t{}".format(component_no, round(ratio, 3)))

In [None]:
# Minimum and maximum of every column, one row per column
df.describe().T[["min", "max"]]

In [11]:
from sklearn.model_selection import train_test_split as split
from sklearn.preprocessing import MinMaxScaler

# Features are every column except the last one; the 'label' column is the target
X = df.iloc[:, :-1]
y = df["label"]

# Hold out 30% of the samples for testing. Passing the labels to `stratify`
# keeps the class distribution ratio the same in both subsets.
X_train, X_test, y_train, y_test = split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=0,
    stratify=y,
)

# Min-max scaling equalises the min and max of each feature.
# The scaler is fitted on the training set ONLY, so that no information from the
# test set biases the model: the test set is meant to emulate data the model has
# never seen. Both subsets (and any new data fed to the model later) are then
# transformed with what was learned from the training set.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.svm import SVC #model I will train
from sklearn.model_selection import StratifiedKFold # for splitting the training-validation data
from sklearn.model_selection import GridSearchCV #for validating hyperparameters

# Stratified 5-fold splitter (shuffled, fixed seed) used to validate each hyperparameter combination
crossval = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Hyperparameters explored by the grid search
#
# KERNEL: function used to map the data into a higher-dimensional space.
#   With SVM + kernel the idea is to move from the original feature space to a more
#   complex one in which a hyperplane (i.e. something linear) separates the data.
#
# C: controls how much training error is tolerated.
#   High C -> errors on the training data weigh more in the SVM objective, so the model
#   is asked to fail less (fit the training data more closely) at the price of a more
#   complex model, with the risk of overfitting if pushed too far.
#   Low C  -> training errors matter less and the model is simpler, with the risk of
#   underfitting if pushed too far (in the limit C -> 0 the training error is ignored).
#
# GAMMA (rbf kernel): limits the influence of each individual sample and so controls
#   the curvature of the decision boundary. For a Gaussian of width sigma,
#   gamma = 1 / (2 * sigma**2): a larger gamma means a narrower Gaussian around each
#   support vector.
#
# DEGREE (poly kernel): degree of the polynomial function.
c_values = [0.01, 0.1, 1, 10, 100]
parameters = [
    dict(kernel=["linear"], C=c_values),
    dict(kernel=["rbf"], C=c_values, gamma=[0.01, 0.1, 1, 10, 100]),
    dict(kernel=["poly"], C=c_values, degree=np.arange(1, 5, 1)),
]

model = SVC()
clf = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    cv=crossval,
    verbose=1,
    n_jobs=-1,
)

clf.fit(X_train, y_train)

In [None]:
# Table with the result of every evaluated combination, best rank first
scores = (
    pd.DataFrame(clf.cv_results_)
    .sort_values(by="rank_test_score")
    .set_index("rank_test_score")
)
int_cols = ["param_C", "param_kernel", "param_degree", "param_gamma", "mean_test_score"]
scores.loc[:, int_cols].head()  # only the five best-ranked combinations

In [None]:
from sklearn.metrics import accuracy_score as accuracy  # accuracy metric

# Take the estimator built from the best hyperparameters found by the grid search
# and measure its accuracy on the test set, whose samples were never used in the search.
fitted_model = clf.best_estimator_
predictions = fitted_model.predict(X_test)

test_accuracy = accuracy(y_test, predictions)
round(test_accuracy, 3)  # evaluation

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split as split
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score as accuracy

# Load the dataset
wine = datasets.load_wine()
X, y = wine.data, wine.target

# Stratified 70/30 split into training and test data
X_train, X_test, y_train, y_test = split(
    X, y, test_size=0.3, shuffle=True, random_state=0, stratify=y
)

# Pipeline with two steps: the min-max scaler followed by the SVC model.
# Because the scaler is inside the pipeline it is re-fitted on the training part of
# every cross-validation fold.
pipe = Pipeline(steps=[("scaler", MinMaxScaler()), ("svc", SVC())])

# Hyperparameter values to validate; the "svc__" prefix routes them to the "svc" step
c_values = [0.01, 0.1, 1, 10, 100]
parameters = [
    {"svc__kernel": ["linear"], "svc__C": c_values},
    {"svc__kernel": ["rbf"], "svc__C": c_values, "svc__gamma": [0.01, 0.1, 1, 10, 100]},
    {"svc__kernel": ["poly"], "svc__C": c_values, "svc__degree": np.arange(1, 5, 1)},
]

# Number of subsets to create for validation
crossval = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Search for the best hyperparameter values; fitting also leaves the pipeline with the
# best combination available as clf.best_estimator_
clf = GridSearchCV(pipe, param_grid=parameters, cv=crossval, n_jobs=-1)
clf.fit(X_train, y_train)

# Evaluate the best pipeline on both the training and the test set
best_pipeline = clf.best_estimator_
pred_train = best_pipeline.predict(X_train)
pred_test = best_pipeline.predict(X_test)

print(f"Best parameters are: {clf.best_params_}, with a score of {round(clf.best_score_,3)}")
print(f"Accuracy on training set is: {round(accuracy(y_train, pred_train), 3)}")
print(f"Accuracy on test set is : {round(accuracy(y_test, pred_test), 3)}")

In [None]:
from sklearn.metrics import classification_report

# Text report of the per-class metrics for the test-set predictions
report = classification_report(y_test, pred_test, labels=[0, 1, 2])
print(report)

In [None]:
# Confusion matrix (raw counts, not normalised) of the test-set predictions.
# It is drawn with scikit-learn's own ConfusionMatrixDisplay, which the notebook already
# depends on, so this cell runs on a fresh kernel without installing any extra package.
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

class_labels = [0, 1, 2]  # same label set as passed to classification_report
cm = confusion_matrix(y_test, pred_test, labels=class_labels)  # rows: true class, columns: predicted class

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap="Blues", values_format="d")  # "d": show the counts as integers
disp.ax_.set_title("Confusion Matrix")
plt.show()

In [None]:
pip install scikit-plot