In [278]:
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import accuracy_score
from statistics import mean
from scikitplot.metrics import plot_confusion_matrix
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.manifold import TSNE
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import logging
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
import pandas as pd

In [328]:
def addestraSVC(X, y, c, gamma, kernel, dim):

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=30, stratify=y)
    
    sc = StandardScaler()
    
    X_train = sc.fit_transform(X_train)
    X_test = sc.fit_transform(X_test)
    
    pca = PCA(n_components=dim)
    
    X_train = pca.fit_transform(X_train)
    X_test = pca.fit_transform(X_test)
    
    model = SVC(gamma = gamma, C=c, kernel = kernel)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    return accuracy_score(y_test, y_pred)

def addrestraDecisionTreeClassifier(X, y, criterion, dim):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=30, stratify=y)
    
    sc = StandardScaler()
    
    X_train = sc.fit_transform(X_train)
    X_test = sc.fit_transform(X_test)
    
    pca = PCA(n_components=dim)
    
    X_train = pca.fit_transform(X_train)
    X_test = pca.fit_transform(X_test)
    
    model = DecisionTreeClassifier(criterion=criterion)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    return accuracy_score(y_test, y_pred)

def addestraRandomForestClassifier(X, y, n_estimators, dim):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=30, stratify=y)
    
    sc = StandardScaler()
    
    X_train = sc.fit_transform(X_train)
    X_test = sc.fit_transform(X_test)
    
    pca = PCA(n_components=dim)
    
    X_train = pca.fit_transform(X_train)
    X_test = pca.fit_transform(X_test)
    
    model = RandomForestClassifier(n_estimators=n_estimators)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    return accuracy_score(y_test, y_pred)

def addestraLinearDiscriminantAnalysis(X, y, solver, dim):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=30, stratify=y)
    
    sc = StandardScaler()
    
    X_train = sc.fit_transform(X_train)
    X_test = sc.fit_transform(X_test)
    
    pca = PCA(n_components=dim)
    
    X_train = pca.fit_transform(X_train)
    X_test = pca.fit_transform(X_test)
    
    model = LinearDiscriminantAnalysis(solver=solver)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    return accuracy_score(y_test, y_pred)

def addestraNaiveBayes(X, y, dim):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=30, stratify=y)
    
    sc = StandardScaler()
    
    X_train = sc.fit_transform(X_train)
    X_test = sc.fit_transform(X_test)
    
    pca = PCA(n_components=dim)
    
    X_train = pca.fit_transform(X_train)
    X_test = pca.fit_transform(X_test)
    
    model = GaussianNB()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    return accuracy_score(y_test, y_pred)

#funzione che calcola il valore degli iperparametri per il modello dato in input
def trovaIperparametri(X, y, model, numero_dimensioni):
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=30, stratify=y)

    if model == SVC:
        steps = [
            ('scaler', StandardScaler()),
            ('reduce_dim', PCA()),
            ('SVM', SVC()), 
        ]

        pipeline = Pipeline(steps)

        valori_C = np.arange(0.1, 1.0, 0.1)
        valori_gamma = [0.1, 0.01]
        valori_kernel = ['linear', 'poly', 'rbf', 'sigmoid']

        params = { 'SVM__C': valori_C,
                  'SVM__gamma': valori_gamma,
                  'SVM__kernel': valori_kernel,
                  'reduce_dim__n_components': np.arange(1, numero_dimensioni, 1),
                }
    elif model == DecisionTreeClassifier:
        
        steps = [
            ('scaler', StandardScaler()),
            ('reduce_dim', PCA()),
            ('tree', DecisionTreeClassifier()),
        ]
        
        pipeline= Pipeline(steps)
        
        params = {'reduce_dim__n_components': np.arange(1, numero_dimensioni, 1),
                  'tree__criterion': ['gini', 'entropy'],
                 }
    
    elif model == RandomForestClassifier:
        
        steps = [
            ('scaler', StandardScaler()),
            ('reduce_dim', PCA()),
            ('random_forest', RandomForestClassifier()),
        ]
        
        pipeline = Pipeline(steps)
        
        params = {'reduce_dim__n_components': np.arange(1, numero_dimensioni, 1),
                  'random_forest__n_estimators': np.arange(10, 100, 10),
                 }
    
    elif model == LinearDiscriminantAnalysis:
        
        steps = [
            ('scaler', StandardScaler()),
            ('reduce_dim', PCA()),
            ('linear_discriminant_analysis', LinearDiscriminantAnalysis()),
        ]
        
        pipeline = Pipeline(steps)
        
        params = {'reduce_dim__n_components': np.arange(1, numero_dimensioni, 1),
                  'linear_discriminant_analysis__solver': ['svd', 'lsqr', 'eigen'],
                  
                 }
    elif model == GaussianNB:
        steps = [
            ('scaler', StandardScaler()),
            ('reduce_dim', PCA()),
            ('linear_discriminant_analysis', LinearDiscriminantAnalysis()),
        ]
        
        pipeline = Pipeline(steps)
        
        params = {'reduce_dim__n_components': np.arange(1, numero_dimensioni, 1),

                 }
    grid = GridSearchCV(pipeline, param_grid=params, cv=3)
    
    grid.fit(X_train, y_train)
    
    
    return grid.best_params_

In [329]:
dataset = load_breast_cancer()

trovaIperparametri(dataset=dataset, model=SVC, numero_dimensioni=30)

TypeError: trovaIperparametri() got an unexpected keyword argument 'dataset'

In [199]:
addestraSVC(dataset, 0.8, 0.1, 'linear', 13)

0.9824561403508771

In [221]:
trovaIperparametri(dataset, DecisionTreeClassifier, 30)

{'reduce_dim__n_components': 3,
 'tree__criterion': 'gini',
 'tree__max_depth': 1000}

In [233]:
addrestraDecisionTreeClassifier(dataset, 'gini', 30)

0.9122807017543859

In [239]:
trovaIperparametri(dataset, RandomForestClassifier, 30)

{'random_forest__n_estimators': 80, 'reduce_dim__n_components': 16}

In [251]:
addestraRandomForestClassifier(dataset, n_estimators=80, dim=16)

0.8859649122807017

In [245]:
trovaIperparametri(dataset, LinearDiscriminantAnalysis, 30)

{'linear_discriminant_analysis__solver': 'svd', 'reduce_dim__n_components': 12}

In [250]:
addestraLinearDiscriminantAnalysis(dataset, solver='svd', dim=12)

0.9473684210526315

In [273]:
trovaIperparametri(dataset=dataset, model=GaussianNB, numero_dimensioni=30)

{'reduce_dim__n_components': 12}

In [277]:
addestraNaiveBayes(dataset=dataset, dim=12)

0.8771929824561403

In [349]:
df=pd.read_csv('african_crises.csv',parse_dates=True)
df

Unnamed: 0,case,cc3,country,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises,banking_crisis
0,1,DZA,Algeria,1870,1,0.052264,0,0,0.0,3.441456,0,0,0,crisis
1,1,DZA,Algeria,1871,0,0.052798,0,0,0.0,14.149140,0,0,0,no_crisis
2,1,DZA,Algeria,1872,0,0.052274,0,0,0.0,-3.718593,0,0,0,no_crisis
3,1,DZA,Algeria,1873,0,0.051680,0,0,0.0,11.203897,0,0,0,no_crisis
4,1,DZA,Algeria,1874,0,0.051308,0,0,0.0,-3.848561,0,0,0,no_crisis
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1054,70,ZWE,Zimbabwe,2009,1,354.800000,1,1,0.0,-7.670000,1,1,0,crisis
1055,70,ZWE,Zimbabwe,2010,0,378.200000,1,1,0.0,3.217000,1,0,0,no_crisis
1056,70,ZWE,Zimbabwe,2011,0,361.900000,1,1,0.0,4.920000,1,0,0,no_crisis
1057,70,ZWE,Zimbabwe,2012,0,361.900000,1,1,0.0,3.720000,1,0,0,no_crisis


In [360]:
df2 = df.drop('systemic_crisis', 1)
df2 = df2.drop('cc3', 1)
df2 = df2.drop('banking_crisis', 1)
X = df2.drop('country', 1)

y = df['systemic_crisis']

X

Unnamed: 0,case,year,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises
0,1,1870,0.052264,0,0,0.0,3.441456,0,0,0
1,1,1871,0.052798,0,0,0.0,14.149140,0,0,0
2,1,1872,0.052274,0,0,0.0,-3.718593,0,0,0
3,1,1873,0.051680,0,0,0.0,11.203897,0,0,0
4,1,1874,0.051308,0,0,0.0,-3.848561,0,0,0
...,...,...,...,...,...,...,...,...,...,...
1054,70,2009,354.800000,1,1,0.0,-7.670000,1,1,0
1055,70,2010,378.200000,1,1,0.0,3.217000,1,0,0
1056,70,2011,361.900000,1,1,0.0,4.920000,1,0,0
1057,70,2012,361.900000,1,1,0.0,3.720000,1,0,0


In [361]:
trovaIperparametri(X=X, y=y, model=SVC, numero_dimensioni=9)

{'SVM__C': 0.1,
 'SVM__gamma': 0.1,
 'SVM__kernel': 'poly',
 'reduce_dim__n_components': 6}

In [362]:
addestraSVC(X=X, y=y, c=0.1, gamma=0.1, kernel='poly', dim=6)

0.9339622641509434

In [363]:
trovaIperparametri(X=X, y=y, model=RandomForestClassifier, numero_dimensioni=9)

{'random_forest__n_estimators': 30, 'reduce_dim__n_components': 6}

In [364]:
addestraRandomForestClassifier(X=X, y=y, n_estimators=30, dim=6)

0.9245283018867925