### Carlos Julian Barreto Mora
#### Universidad de los Andes
#### Ciencia de datos aplicada

In [248]:
import pandas as pd
import json
import numpy as np
from sklearn import pipeline
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler,OrdinalEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import joblib

In [249]:
#Leer dataframe
df=pd.read_json('DataSet_Entrenamiento_v1.json')
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,5386-THSLQ,Female,1,Yes,No,66,No,No phone service,DSL,No,...,Yes,No,Yes,No,One year,No,Bank transfer (automatic),45.55,3027.25,No
4,3192-NQECA,Male,0,Yes,No,68,Yes,Yes,Fiber optic,No,...,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),110.00,7611.85,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,4933-BSAIP,Female,0,Yes,No,40,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.10,780.1,No
5996,2030-BTZRO,Male,0,Yes,Yes,6,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Credit card (automatic),20.40,107.6,No
5997,1116-DXXDF,Male,0,No,No,39,Yes,No,Fiber optic,Yes,...,Yes,No,Yes,Yes,Two year,Yes,Electronic check,100.45,3801.7,No
5998,9274-CNFMO,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,No,Electronic check,74.95,308.7,Yes


Se observa que hay una serie de valores con comillas, por ello es necesario realizar esta corrección antes de clasificar las variables entre númericas y categoricas.

In [250]:
#Ajustar valores irreglulares y definir tipos de datos para cada una de las features
df.loc[df['TotalCharges'] == '', 'TotalCharges'] = 'NaN'
df["TotalCharges"] = df["TotalCharges"].astype("float64")
df["SeniorCitizen"] = df["SeniorCitizen"].astype("object")
ID=df['customerID']
df=df.drop('customerID', axis=1)

In [251]:
df.describe()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
count,6000.0,6000.0,5990.0
mean,32.320667,64.607233,2281.988155
std,24.637296,30.195382,2274.401428
min,0.0,18.25,18.8
25%,9.0,35.0,389.3375
50%,29.0,70.3,1391.95
75%,56.0,89.85,3803.725
max,72.0,118.75,8684.8


In [252]:
#Se adicionan valores nulos y no ceros, para poder realizar la imputación mas adelante
df.isnull().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        10
Churn                0
dtype: int64

In [253]:
column_names={'gender':0, 'SeniorCitizen':1, 'Partner':2, 'Dependents':3, 'tenure':4,
       'PhoneService':5, 'MultipleLines':6, 'InternetService':7, 'OnlineSecurity':8,
       'OnlineBackup':9, 'DeviceProtection':10, 'TechSupport':11, 'StreamingTV':12,
       'StreamingMovies':13, 'Contract':14, 'PaperlessBilling':15, 'PaymentMethod':16,
       'MonthlyCharges':17, 'TotalCharges':18, 'Churn':19}
df=df.rename(columns=column_names)

In [254]:
numeric_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='mean'))
      ,('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='constant'))
      ,('encoder', OrdinalEncoder())
])
print(numeric_transformer)

Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())])


In [255]:
# get the categorical and numeric column names
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df.select_dtypes(include=['object','int32']).columns

In [256]:
preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, numeric_features)
   ,('categorical', categorical_transformer, categorical_features)
]) 

In [257]:
df2=pd.DataFrame(preprocessor.fit_transform(df))
X_train, X_test, y_train, y_test = train_test_split(df2.iloc[:,:-1].values,df2[19],
                                                   test_size = 0.4,
                                                   random_state = 10)

In [125]:
pipelineLR = make_pipeline(LogisticRegression())
pipelineKN = make_pipeline(KNeighborsClassifier())
pipelineDT = make_pipeline(DecisionTreeClassifier())

In [126]:
pipelineLR.fit(X_train, y_train)
y_predict=pipelineLR.predict(X_test)
score = pipelineLR.score(X_test, y_test)
print(score)
print(roc_auc_score(y_test,y_predict))

0.7970833333333334
0.7156439624485881


In [127]:
pipelineKN.fit(X_train, y_train)
y_predict=pipelineKN.predict(X_test)
scoreKN = pipelineKN.score(X_test, y_test)
print(scoreKN)
print(roc_auc_score(y_test,y_predict))

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.7570833333333333
0.6732523917551307


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [128]:
pipelineDT = make_pipeline(DecisionTreeClassifier())
pipelineDT.fit(X_train, y_train)
y_redict=pipelineDT.predict(X_test)
scoreDT = pipelineDT.score(X_test, y_test)
print(scoreDT)
print(roc_auc_score(y_test,y_predict))

0.72
0.6732523917551307


In [129]:
x_train=X_train
x_test=X_test

In [130]:
pipe_lr = Pipeline([('clf', LogisticRegression(random_state=42))])

pipe_dt = Pipeline([('model', DecisionTreeClassifier(random_state=42))])

pipe_rf = Pipeline([('clf', RandomForestClassifier(random_state=42))])

pipe_svm = Pipeline([('clf', svm.SVC(random_state=42))])

In [131]:
jobs = -1
param_range = [9, 10]
param_range_fl = [1.0, 0.5]

grid_params_lr = [{'clf__penalty': ['l1', 'l2'],
        'clf__C': param_range_fl,
        'clf__solver': ['liblinear']}] 


grid_params_rf = [{'clf__criterion': ['gini', 'entropy'],
        'clf__max_depth': param_range,
        'clf__min_samples_split': param_range[1:]}]

grid_params_svm = [{'clf__kernel': ['linear', 'rbf'], 
        'clf__C': param_range}]


In [132]:
LR = GridSearchCV(estimator=pipe_lr,
            param_grid=grid_params_lr,
            scoring='roc_auc',
            cv=10)
RF = GridSearchCV(estimator=pipe_rf,
            param_grid=grid_params_rf,
            scoring='roc_auc',
            cv=10, 
            n_jobs=jobs)
SVM = GridSearchCV(estimator=pipe_svm,
            param_grid=grid_params_svm,
            scoring='roc_auc',
            cv=10,
            n_jobs=jobs)
# List of pipelines for iterating through each of them
grids = [LR,RF,SVM]

In [134]:
grid_dict = {0: 'Logistic Regression', 
        1: 'Random Forest',
        2: 'Support Vector Machine'}
#Fit the grid search objects
print('Performing model optimizations...')
best_auc = 0.0
best_clf = 0
best_gs = ''
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    gs.fit(x_train, y_train)
    print('Best params are : %s' % gs.best_params_)
    # Best training data accuracy
    print('Best training accuracy: %.3f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(X_test)
    # Test data accuracy of model with best params
    print('Test set accuracy score for best params: %.3f ' % roc_auc_score(y_test, y_pred))
    # Track best (highest test accuracy) model
    if roc_auc_score(y_test, y_pred) > best_auc:
        best_auc = roc_auc_score(y_test, y_pred)
        best_gs = gs
        best_clf = idx
print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])
dump_file = 'best_grid_search_pipeline.pkl'
best_gs.fit(X_train,y_train)
joblib.dump(best_gs, dump_file, compress=1)
print('\nSaved %s grid search pipeline to file: %s' % (grid_dict[best_clf], dump_file))

Performing model optimizations...

Estimator: Logistic Regression
Best params are : {'clf__C': 0.5, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}
Best training accuracy: 0.838
Test set accuracy score for best params: 0.716 

Estimator: Random Forest
Best params are : {'clf__criterion': 'entropy', 'clf__max_depth': 9, 'clf__min_samples_split': 10}
Best training accuracy: 0.837
Test set accuracy score for best params: 0.718 

Estimator: Support Vector Machine
Best params are : {'clf__C': 10, 'clf__kernel': 'linear'}
Best training accuracy: 0.825
Test set accuracy score for best params: 0.706 

Classifier with best test set accuracy: Random Forest

Saved Random Forest grid search pipeline to file: best_grid_search_pipeline.pkl
