## Tutorial Python: Modelos Avanzados de Clasificacion

Carguemos los libraries utility

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.model_selection import KFold, StratifiedKFold,cross_val_score, GridSearchCV

Y ahora los clasificadores que usaremos en este tutorial

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

Set up Cross-Validation (k=10). Dado que es un problema de clasificacion usemos la version estratificada.

In [3]:
k_fold = StratifiedKFold(n_splits=10)

Importemos los datos

In [4]:
datos = pd.read_csv('/Users/carloseqa/Dropbox/RStudioProjects/PredictiveAnalytics/bwt.csv')
datos.head()

Unnamed: 0,id,age,race,smoke,ptd,ht,ui,ftv,lwt,lowweight
0,1,19,black,no,no,no,yes,0,182,no
1,2,33,other,no,no,no,no,3,155,no
2,3,20,white,yes,no,no,no,1,105,no
3,4,21,white,yes,no,no,yes,2,108,no
4,5,18,white,yes,no,no,yes,0,107,no


Sklearn solo acepta variables numericas (aun para clasificacion). Es necesario definir dummies para todas las variables categoricas, incluyendo la variable dependiente. Python creara dummies para todas las categorias; una de ellas siempre es redundante. El setting 'drop_first=True' elimina la primera categoria.

In [5]:
datos=pd.get_dummies(datos,drop_first=True)
datos.head()

Unnamed: 0,id,age,ftv,lwt,race_other,race_white,smoke_yes,ptd_yes,ht_yes,ui_yes,lowweight_yes
0,1,19,0,182,0,0,0,0,0,1,0
1,2,33,3,155,1,0,0,0,0,0,0
2,3,20,1,105,0,1,1,0,0,0,0
3,4,21,2,108,0,1,1,0,0,1,0
4,5,18,0,107,0,1,1,0,0,1,0


Sklearn pide variable dependiente e independientes por separado. Normalicemos desde ahora los datos para uso con SVM

In [6]:
y = datos.lowweight_yes
X = datos.drop(datos.columns[[0,10]],axis=1)
X_scaled = scale(X)
X.head()

Unnamed: 0,age,ftv,lwt,race_other,race_white,smoke_yes,ptd_yes,ht_yes,ui_yes
0,19,0,182,0,0,0,0,0,1
1,33,3,155,1,0,0,0,0,0
2,20,1,105,0,1,1,0,0,0
3,21,2,108,0,1,1,0,0,1
4,18,0,107,0,1,1,0,0,1


## Random Forest

Ahora instanciemos y estimemos el modelo de Random Forest

In [7]:
modelo4 = RandomForestClassifier(random_state=25,n_estimators=500,max_features=2)
modelo4.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=25, verbose=0, warm_start=False)

In [8]:
scores4= cross_val_score(modelo4, X, y, cv=k_fold, n_jobs=-1)
np.mean(scores4)

0.5985380116959064

In [9]:
print(modelo4.feature_importances_)

[0.27608096 0.10303602 0.33786538 0.03082063 0.03991744 0.05302338
 0.07078219 0.0345712  0.0539028 ]


In [10]:
X.columns

Index(['age', 'ftv', 'lwt', 'race_other', 'race_white', 'smoke_yes', 'ptd_yes',
       'ht_yes', 'ui_yes'],
      dtype='object')

## Gradient Boosting Machine

In [11]:
modelo5 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,max_depth=1, subsample=0.75)
modelo5.fit(X,y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=1,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=0.75, verbose=0,
              warm_start=False)

In [12]:
scores5= cross_val_score(modelo5, X, y, cv=k_fold, n_jobs=-1)
np.mean(scores5)

0.6409356725146199

## Support Vector Machines

In [13]:
modelo6 = SVC()
modelo6.fit(X_scaled,y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [14]:
scores6= cross_val_score(modelo5, X_scaled, y, cv=k_fold, n_jobs=-1)
np.mean(scores6)

0.6517543859649122

## Support Vector Machines (con Grid Search)

Ahora hagamos un poco de parameter search. Ustedes pueden ajustar el programa para que considere mas valores y tambien lo pueden aplicar a Random Forest y Gradient Boosting Machine.

In [20]:
parameters = {'C':2.**np.arange(-5,17,2),'gamma':2.**np.arange(-15,5,2)}
svc = SVC()
modelo7=GridSearchCV(svc, parameters,cv=5)
modelo7.fit(X_scaled,y)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([3.1250e-02, 1.2500e-01, 5.0000e-01, 2.0000e+00, 8.0000e+00,
       3.2000e+01, 1.2800e+02, 5.1200e+02, 2.0480e+03, 8.1920e+03,
       3.2768e+04]), 'gamma': array([3.05176e-05, 1.22070e-04, 4.88281e-04, 1.95312e-03, 7.81250e-03,
       3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [18]:
scores7= cross_val_score(modelo7, X_scaled, y, cv=k_fold, n_jobs=-1)
np.mean(scores7)

0.6771929824561403

In [39]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV

In [35]:
rfe = RFE(LogisticRegression())
param_grid = {'n_features_to_select': np.arange(1,10,1)}
grid = GridSearchCV(rfe,param_grid)
grid.fit(X,y)

GridSearchCV(cv=None, error_score='raise',
       estimator=RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
  n_features_to_select=None, step=1, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_features_to_select': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [36]:
scores8= cross_val_score(grid, X, y, cv=k_fold, n_jobs=-1)
np.mean(scores8)

0.6824561403508772

In [40]:
rfecv = RFECV(LogisticRegression())
rfecv.fit(X,y)

RFECV(cv=None,
   estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
   n_jobs=1, scoring=None, step=1, verbose=0)

In [None]:
scores= cross_val_score(rfecv, X, y, cv=k_fold, n_jobs=-1)
np.mean(scores8)