In [1]:
# Data management
import pandas as pd

# Math and Stat modules
import numpy as np
from scipy.stats import sem
from random import choice

#Data preprocessing and trasformation (ETL)
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, FunctionTransformer, Binarizer, OneHotEncoder, OrdinalEncoder
from sklearn.compose import  ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

#Supervised Learning
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold, StratifiedKFold, RepeatedKFold, ShuffleSplit, StratifiedShuffleSplit, learning_curve, validation_curve
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
#Visualization
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
from matplotlib.pyplot import figure
from sklearn.metrics import accuracy_score

La sfida chiave contro la sua individuazione è come classificare i tumori in maligni (cancerosi) o benigni (non cancerosi). 
Ti chiediamo di completare l'analisi della classificazione di questi tumori utilizzando l'apprendimento automatico (con SVM) e il set di dati (diagnostico) del cancro al seno del Wisconsin.

## 1.1 STEP 1: ETL PROCESSING

In [5]:
breast_dataset = pd.read_csv('OneDrive\Documenti\GitHub\MDS-python\Marcello Brambilla\breast-cancer.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'OneDrive\\Documenti\\GitHub\\MDS-python\\Marcello Brambilla\x08reast-cancer.csv'

In [None]:
breast_dataset

In [None]:
breast_dataset.head()

In [None]:
breast_label = breast_dataset['diagnosis'].map(
    {'B':0,
     'M':1
    }
).values
breast_dataset.drop(columns=['id','diagnosis'], inplace = True)

In [None]:
breast_dataset.info()

In [None]:
data_preprocessing = ColumnTransformer([
    ('scaler',StandardScaler(), ['radius_mean','texture_mean','perimeter_mean','area_mean','smoothness_mean','compactness_mean','concavity_mean','concave points_mean','symmetry_mean','fractal_dimension_mean','radius_se','texture_se','perimeter_se','area_se','smoothness_se','compactness_se','concavity_se','concave points_se','symmetry_se','fractal_dimension_se','radius_worst','texture_worst','perimeter_worst','area_worst','smoothness_worst','compactness_worst','concavity_worst','concave points_worst','symmetry_worst','fractal_dimension_worst'])
    ],
    remainder = 'passthrough'
)

In [None]:
feature_matrix = data_preprocessing.fit_transform(breast_dataset)

In [None]:
fm = pd.DataFrame(feature_matrix)
fm

## 1.2 STEP 2: TRAINING E TEST SETS
dividiamo il dataset in training e test sets in modo tale che il test set contenga il 20% dei record.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(feature_matrix, breast_label, test_size = 0.2, random_state = 42)

## 1.3 STEP 3: LA SCELTA DEGLI ALGORITMI/MODELLI DA UTILIZZARE
-Perceptron
-LogisticRegression
-Support Vector Machine

## 1.3.1 PERCEPTRON

In [None]:
for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(feature_matrix, breast_label, test_size = 0.2)
    perceptron = Perceptron()
    perceptron.fit(X_train, y_train) # apprendo su training
    predicted_test = perceptron.predict(X_test) # predico sul test
    print(np.sum(predicted_test == y_test)/len(y_test))

## STEP 4: cross-validation

Utilizzanod 5-fold cross-validation devo valutare le performance dei diversi modelli (la scelta degli iperparametri per ora non e' vincolata). Nel dettaglio si devono utilizzare come misure di performance:
- accuracy
- precision
- recall 
- f1-score

Per ogni modello si deve costruire la distribuzione della misura di performance (un box plot e' sufficiente), oppure calcolare media e deviazione standard.

 1.4.1 A.PERCETTRONE

In [None]:
p = Perceptron()
cvs = cross_val_score(p, X_train, y_train, cv = 5)

In [None]:
cvs

In [None]:
np.mean(cvs)

In [None]:
np.std(cvs)

In [None]:
plt.boxplot(cvs)

In [None]:
y_train_predicted = cross_val_predict(p, X_train, y_train, cv = 5)

In [None]:
as1 = accuracy_score(y_train, y_train_predicted, normalize=True)
as1

In [None]:
confusion_matrix(y_train, y_train_predicted)

In [None]:
precision_score(y_train, y_train_predicted)

In [None]:
recall_score(y_train, y_train_predicted)

In [None]:
f1_score(y_train, y_train_predicted)

## Logistic Regression

In [None]:
logit_cls = LogisticRegression(max_iter = 1000)
y_scores = cross_val_predict(logit_cls, X_train, y_train, cv = 5)

In [None]:
cvs_log = cross_val_score(logit_cls, X_train, y_train, cv = 5)

In [None]:
cvs_log

In [None]:
prec, recall, soglia = precision_recall_curve(y_train, y_scores)

In [None]:
precision_score(y_train, y_scores), recall_score(y_train, y_scores), f1_score(y_train, y_scores)

In [None]:
# Metodo alternativo per il calcolo degli score
logit_cls.fit(X_train, y_train)
y_scores_alternative = logit_cls.decision_function(X_train)

In [None]:
fig_prc = plt.figure(figsize=(16,9))
ax = fig_prc.add_subplot()
ax.plot(soglia, prec[:-1], 'r', label = 'precision')
ax.plot(soglia, recall[:-1], 'b', label = 'recall')
ax.legend(fontsize=20)

Oppure posso visualizzare la precision in funzione della recall

In [None]:
fig_prf = plt.figure(figsize=(16,12))
ax = fig_prf.add_subplot()
ax.plot(recall[:-1], prec[:-1], 'r', label = 'precision', lw = 7)
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")

#### 6.1 Learning curve

In [None]:
train_sizes, train_scores, test_scores = learning_curve(logit_cls,
                                                       X=feature_matrix,
                                                       y=breast_label,
                                                       train_sizes= [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                                                       cv = 5,
                                                       n_jobs = -1,
                                                       shuffle = True)

In [None]:
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

fig=plt.figure(figsize=(12,7))
ax = fig.add_subplot()
ax.plot(train_sizes, train_scores[:,0],
         color='blue', marker='o',
         markersize=5, label='Training accuracy - fold 1')

ax.plot(train_sizes, train_mean,
         color='blue', marker='+',
         markersize=5, label='Training accuracy')

ax.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

ax.plot(train_sizes, test_scores[:,0],
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy - fold 1')

ax.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='d', markersize=5,
         label='Validation accuracy')

ax.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

ax.grid()
ax.set_xlabel('Dimensione del training set')
ax.set_ylabel('Accuracy')
ax.legend(loc='lower right')
ax.set_ylim([0.6, 1.03])

## SUPPORT VECTOR MACHINE (SVM)

In [None]:
np.linspace(0,1,10)

In [None]:
Cs = [0.01, 0.1, 1, 10, 100]# definire un insieme di valori di C tenendo in considerazione le precedenti osservazioni sul suo effetto 
fig = plt.figure(figsize=(18,3.2))
for i, c in enumerate(Cs):
    print('Training SVM per C =', c, i)
    svm_cls = LinearSVC(C = c, max_iter=50000)
    train_sizes, train_scores, test_scores = learning_curve(svm_cls,
                                                       X=feature_matrix,
                                                       y=breast_label,
                                                       train_sizes=np.linspace(0.1,1,10),
                                                       cv = 5, n_jobs=-1,
                                                       shuffle = True)
    

    print('Training per {} finito'.format(c))
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    ax = fig.add_subplot(150+(i+1))
    ax.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='Training accuracy')
    ax.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')
    ax.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')
    ax.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')
    ax.grid()
    ax.set_ylim((0.8,1))
    ax.set_xlabel('Dimensione del training set')
    ax.set_ylabel('Accuracy')
    ax.legend(loc='lower right')

## DECISION TREE

In [None]:
min_leaf = [5, 10, 100, 200, 350]

train_sizes, train_means, test_means, test_stds, train_stds = [],[],[],[],[]
for mlf in min_leaf:
    dt_mlf = DecisionTreeClassifier(min_samples_leaf=mlf, random_state=42, max_depth=15)
    train_size, train_scores, test_scores = learning_curve(dt_mlf,
                                                       X=feature_matrix,
                                                       y=breast_label,
                                                       train_sizes=np.linspace(0.1,1.0,10),
                                                       cv=5,
                                                       n_jobs=-1)
    print('fatto {}'.format(mlf))
    train_means.append(np.mean(train_scores, axis=1))
    train_stds.append(np.std(train_scores, axis=1))
    test_means.append(np.mean(test_scores, axis=1))
    test_stds.append(np.std(test_scores, axis=1))
    train_sizes.append(train_size)

In [None]:
fig= plt.figure(figsize=(12, 8))
for i in range(5):
    ax = fig.add_subplot(231+i)
    ax.plot(train_sizes[i], train_means[i],
         color='blue', marker='o',
         markersize=5, label='Training accuracy')
    ax.fill_between(train_sizes[i],
                 train_means[i] + train_stds[i],
                 train_means[i] - train_stds[i],
                 alpha=0.15, color='blue')
    ax.plot(train_sizes[i], test_means[i],
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')
    ax.fill_between(train_sizes[i],
                 test_means[i] + test_stds[i],
                 test_means[i] - test_stds[i],
                 alpha=0.15, color='green')
    ax.grid()
    ax.set_ylim((0.8,1))
    ax.set_ylabel('Accuracy')
    ax.legend(loc='lower right')
    ax.set_title(r"min_sam_leaf:{}".format(min_leaf[i]), fontsize=18)

## Random Forest

In [None]:
X_train, X_test, y_train, y_test = train_test_split(feature_matrix, breast_label, test_size=0.2, stratify=breast_label)

In [None]:
len(y_train[y_train == 1])/len(y_train), len(y_test[y_test == 1])/len(y_test)

In [None]:
rnf_clf = RandomForestClassifier(n_estimators=250, max_leaf_nodes=64, n_jobs=-1, max_features=10)
et_clf = ExtraTreesClassifier(n_estimators=250, max_leaf_nodes=64, n_jobs=-1, max_features=10)
scores_rnf = cross_val_score(rnf_clf, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
scores_et = cross_val_score(et_clf, X_train, y_train, cv=5, scoring='f1',n_jobs=-1)

In [None]:
rnf_clf.fit(feature_matrix,breast_label)

In [None]:
fig= plt.figure(figsize=(12, 8))
for i in range(2):
    ax = fig.add_subplot(121+i)
    ax.plot(train_sizes[i], train_means[i],
         color='blue', marker='o',
         markersize=5, label='Training accuracy')
    ax.fill_between(train_sizes[i],
                 train_means[i] + train_stds[i],
                 train_means[i] - train_stds[i],
                 alpha=0.15, color='blue')
    ax.plot(train_sizes[i], test_means[i],
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='Validation accuracy')
    ax.fill_between(train_sizes[i],
                 test_means[i] + test_stds[i],
                 test_means[i] - test_stds[i],
                 alpha=0.15, color='green')
    ax.grid()
    ax.set_ylim((0.8,1))
    ax.set_ylabel('Accuracy')
    ax.legend(loc='lower right')