## Preparing the dataset
- load the data
- rename the columns
- split the dataset into train and test (test will be used only at the very final step)
- separate the independent variables from the target variables

In [1]:
import pandas as pd
import numpy as np

# load the data
leaf = pd.read_csv("leaf.csv" , header = None)

# rename the columns
names = np.array(["Class", "Specimen", "Eccentricity", "Aspect Ratio", "Elongation",
                  "Solidity", "Stochastic Convexity", "Isoperimetric Factor",
                  "Maximal Indentation Depth ","Lobedness","Average Intensity",
                  "Average Contrast","Smoothness","Third moment","Uniformity","Entropy"])
leaf.columns = names

data = leaf.loc[:, leaf.columns != 'Specimen']

"""
# split the dataset into train and test
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, random_state = 5, test_size = 0.2, stratify = data['Class'])
train.index = np.linspace(0, len(train)-1, len(train), dtype = 'int')
test.index = np.linspace(0, len(test)-1, len(test), dtype = 'int')


# separate the independent variables from the target variables
X = train.loc[:, train.columns != 'Class']
y = train['Class']
"""

X = data.loc[:, data.columns != 'Class']
y = data['Class']

## Building some useful tools
- declaring two cross-validation loop: 
    - outer stratified cv-loop
    - inner non-stratified cv-loop
- set the scoring we'd like to compute (this tuple will be used inside 'do_cross_validation')
- definition of do_cross_validation: it evaluate metric(s) by cross-validation and also record fit/score times, then print result(s).

In [3]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, KFold, LeaveOneOut, cross_validate

# declaring two cross-validation loop
inner_cv = StratifiedKFold(n_splits = 5) # random_state = 10)
outer_cv = StratifiedKFold(n_splits = 5, shuffle = True)

# set the scoring
scoring = ('f1_weighted', 'accuracy', 'f1_macro', 'f1_micro', 'roc_auc_ovo_weighted', 'roc_auc_ovr_weighted')

# definition of do_cross_validation
def do_cross_validation(clf,X=X, y=y, print_model=True, scoring = scoring):
    cv = cross_validate(clf, X, y, scoring=scoring, cv= outer_cv, return_train_score=False)

    for i in range(len(scoring)):
        scores = ' + '.join(f'{s:.2f}' for s in cv['test_' + scoring[i]])
        mean_ = cv['test_' + scoring[i]].mean()
        msg = f'Cross-validated {scoring[i]}: ({scores}) / {outer_cv.n_splits} = {mean_:.2f}'
        if print_model:
            msg = f'{clf}:\n\t{msg}\n'
        print(msg)

In [4]:
from sklearn.model_selection import GridSearchCV

## Train RandomForest
- train a RandomForest by a two nested CV loop. 
    - in the outer loop we estimate the score indexes using 'do_cross_validation'
    - in the inner loop we to GridSearch

In [5]:
from sklearn.ensemble import RandomForestClassifier
# random forest inner loop
param_grid={'n_estimators': [50, 100, 500], 'criterion': ["gini", "entropy"]}
clf_grid = GridSearchCV(RandomForestClassifier(random_state=0), param_grid=param_grid, cv = inner_cv, n_jobs=4)
# random forest outer loop
do_cross_validation(clf_grid)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(random_state=0), n_jobs=4,
             param_grid={'criterion': ['gini', 'entropy'],
                         'n_estimators': [50, 100, 500]}):
	Cross-validated f1_weighted: (0.77 + 0.79 + 0.73 + 0.64 + 0.71) / 5 = 0.73

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(random_state=0), n_jobs=4,
             param_grid={'criterion': ['gini', 'entropy'],
                         'n_estimators': [50, 100, 500]}):
	Cross-validated accuracy: (0.78 + 0.81 + 0.75 + 0.68 + 0.74) / 5 = 0.75

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(random_state=0), n_jobs=4,
             param_grid={'criterion': ['gini', 'entropy'],
                         'n_estimators': [50, 100, 500]}):
	Cross-validated f1_macro: (0.77 + 0.77

## Standardization

before moving to SVM and kNN, we need to standardize our independent variable.

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)

In [6]:
X_scale = pd.DataFrame(X_scale)
X_scale.columns = X.columns
X_scale.head()

Unnamed: 0,Eccentricity,Aspect Ratio,Elongation,Solidity,Stochastic Convexity,Isoperimetric Factor,Maximal Indentation Depth,Lobedness,Average Intensity,Average Contrast,Smoothness,Third moment,Uniformity,Entropy
0,0.034066,-0.372227,-0.971865,0.709286,0.489279,1.402718,-0.848639,-0.500813,-0.09903,0.065951,-0.113724,-0.13155,-0.26103,0.022209
1,0.10517,-0.352383,-0.781384,0.675827,0.474045,1.231226,-0.833433,-0.499796,-0.757027,-0.657718,-0.695349,-0.609065,-0.7251,-0.798023
2,0.227715,-0.33435,-0.633812,0.641146,0.489279,1.274732,-0.775928,-0.494865,-1.098501,-1.295588,-1.047039,-0.947167,-0.810893,-1.231436
3,0.087093,-0.377814,-0.819275,0.624635,0.489279,1.315476,-0.790999,-0.496324,-0.985642,-1.14021,-0.975571,-0.902955,-0.745003,-0.984224
4,0.495931,-0.257978,-0.35403,0.636167,0.489279,1.029856,-0.776688,-0.494941,-1.208746,-1.52937,-1.137153,-1.015422,-0.844277,-1.404965


## Train SVM
- train a SVM by a two nested CV loop. 
    - in the outer loop we estimate the score indexes using 'do_cross_validation'
    - in the inner loop we to GridSearch

In [7]:
from sklearn.svm import SVC
# svc inner loop
param_grid = {'C': np.logspace(-2, 10, 13), 'gamma' : ['scale', 'auto'], 'kernel': ['linear', 'rbf']}
svc_grid = GridSearchCV(SVC(probability=True), param_grid=param_grid, cv = inner_cv, n_jobs=4)
# svc outer loop
do_cross_validation(svc_grid, X=X_scale)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=SVC(probability=True), n_jobs=4,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05,
       1.e+06, 1.e+07, 1.e+08, 1.e+09, 1.e+10]),
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf']}):
	Cross-validated f1_weighted: (0.76 + 0.76 + 0.81 + 0.81 + 0.70) / 5 = 0.77

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=SVC(probability=True), n_jobs=4,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05,
       1.e+06, 1.e+07, 1.e+08, 1.e+09, 1.e+10]),
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf']}):
	Cross-validated accuracy: (0.78 + 0.76 + 0.81 + 0.82 + 0.71) / 5 = 0.78

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None

## Train kNN
- train a kNN by a two nested CV loop. 
    - in the outer loop we estimate the score indexes using 'do_cross_validation'
    - in the inner loop we to GridSearch

In [8]:
from sklearn.neighbors import KNeighborsClassifier
# kNN inner loop
param_grid={'n_neighbors': [2,5,10,15], 'p':[1, 2]}
neigh_grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv = inner_cv)
# kNN outer loop
do_cross_validation(neigh_grid, X=X_scale)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [2, 5, 10, 15], 'p': [1, 2]}):
	Cross-validated f1_weighted: (0.68 + 0.65 + 0.50 + 0.75 + 0.67) / 5 = 0.65

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [2, 5, 10, 15], 'p': [1, 2]}):
	Cross-validated accuracy: (0.71 + 0.68 + 0.54 + 0.76 + 0.68) / 5 = 0.67

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [2, 5, 10, 15], 'p': [1, 2]}):
	Cross-validated f1_macro: (0.67 + 0.66 + 0.50 + 0.74 + 0.68) / 5 = 0.65

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [2, 5, 10, 15], 'p

## First result

- From the previous computation, we see that independently from the score index used, the RandomForest is the best. So we train a RF

In [None]:
# retraining a RF
# probabilmente da togliere

parameters = {'n_estimators':[50,100,500], 'criterion': ['gini', 'entropy', 'log_loss'] }
clf = GridSearchCV(RandomForestClassifier(), parameters, cv = inner_cv, n_jobs=4)
clf.fit(X=X, y=y)
tree_model = clf.best_estimator_
print("Best score reached is:", clf.best_score_)
print ("Best parameters are:", clf.best_params_)


crit= list(clf.best_params_.values())[0]
n_est = list(clf.best_params_.values())[1]

tree = RandomForestClassifier(criterion=crit, n_estimators= n_est)
tree.fit(X,y)

tree.score(X,y)


tree.predict(test.loc[:, test.columns != 'Class'])


test['Class']

## Next step (asap)

- MICHELE: effectiveness indexes (scegliere quale usare) 
- ELENA: cross validation: inner outer (probabilmente sara: stratified e il numero dei fold è tale che non abbiamo fold vuoti rispetto a una classe)
- SAMUELE: vedere di sistemare la stadardizzazione 

- scegliere gli iperparametri di:
        - ELENA: albero (aggiungere albero singolo all'inizio) + RF (max_features + quanti alberi?) + kNN
        - MICHELE: SVM (linear)
        - SAMUELE: SVM (gauss)


- analisi variabili ???