![SolidQ](https://antoniosql.github.io/images/SolidQ_Verne.png) 

# Validación Cruzada

In [1]:
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt


In [2]:
conn = sqlite3.connect('Tiendas24H.sqlite')
consulta="select c.CodCliente, FechaNacimiento, EstadoCivil, Sexo, IngresosAnuales, N.NivelAcademico, O.Ocupacion, case     when v.CodCliente is null then 0   else 1  end as 'EsCliente' from clientes c inner join NivelAcademico N    on c.NivelAcademico = N.Codigo     inner join Ocupacion O on c.Ocupacion = o.Codigo     left outer join VentasCAB v          on v.codcliente = C.codcliente where c.Codcliente <>'CONTADO'"

df = pd.read_sql_query(consulta,conn)

df.head()

Unnamed: 0,CodCliente,FechaNacimiento,EstadoCivil,Sexo,IngresosAnuales,NivelAcademico,Ocupacion,EsCliente
0,20,1976-04-08,M,M,90000.0,Licenciatura,Profesional especializado,1
1,26,1975-05-14,S,M,60000.0,Licenciatura,Profesional especializado,1
2,26,1975-05-14,S,M,60000.0,Licenciatura,Profesional especializado,1
3,26,1975-05-14,S,M,60000.0,Licenciatura,Profesional especializado,1
4,26,1975-05-14,S,M,60000.0,Licenciatura,Profesional especializado,1


In [3]:
df.dropna(axis=0,inplace=True)

In [4]:
caracteristicas = ["EstadoCivil", "Sexo", "IngresosAnuales","NivelAcademico", "Ocupacion"]
etiqueta="EsCliente"

In [5]:
from sklearn.preprocessing import  LabelEncoder

le=LabelEncoder()
df['EstadoCivil'] = le.fit_transform(df['EstadoCivil'])
df['Sexo'] = le.fit_transform(df['Sexo'])

In [6]:
dum_df = pd.get_dummies(df[caracteristicas], columns=["NivelAcademico","Ocupacion"] )

In [12]:

X, y = dum_df, df[etiqueta]

# Escalado

In [14]:

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X= scaler.fit_transform(X)


In [21]:
from imblearn.over_sampling import SMOTE 
from collections import Counter


sm = SMOTE(random_state=42)
X, y = sm.fit_resample(X, y)


Distribución Original Counter({1: 10340, 0: 10340})
Distribución balanceada Counter({1: 10340, 0: 10340})


# Validación Cruzada

In [15]:
from sklearn.model_selection import KFold

def run_cv(X,y,clf_class,**kwargs):
    # Construir el objeto Kfolds
    kf = KFold(5)
    y_pred = y.copy()

    # Iterar por los Folds
    for k, (train, test) in enumerate(kf.split(X, y)):
        X_train, X_test = X[train], X[test]
        y_train = y[train]
        # Inicializar un clasificador con los argumentos clave
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test] = clf.predict(X_test)
    return y_pred

In [16]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
import numpy as np

def accuracy(y_true,y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)

print ("Support vector machines:")
print ("%.3f" % accuracy(y, run_cv(X,y,SVC)))
print ("Random forest:")
print ("%.3f" % accuracy(y, run_cv(X,y,RF)))
print ("K-nearest-neighbors:")
print ("%.3f" % accuracy(y, run_cv(X,y,KNN)))

Support vector machines:
0.994
Random forest:
0.993
K-nearest-neighbors:
0.993


In [17]:
from sklearn.metrics import confusion_matrix

y = np.array(y)


confusion_matrix(y,run_cv(X,y,SVC))

array([[    0,    58],
       [    0, 10340]], dtype=int64)

In [18]:
confusion_matrix(y,run_cv(X,y,RF)) 


array([[    0,    58],
       [   16, 10324]], dtype=int64)

In [19]:
confusion_matrix(y,run_cv(X,y,KNN))

array([[    0,    58],
       [   11, 10329]], dtype=int64)