In [92]:
# data manipulation and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer
from sklearn.metrics import *


# from feature-engine
from feature_engine.imputation import (
     MeanMedianImputer,
     CategoricalImputer
)


from feature_engine.encoding import (
     RareLabelEncoder,
     OneHotEncoder,
)



from feature_engine.selection import DropFeatures

from imblearn.under_sampling import RandomUnderSampler

# XGboost classifier
from xgboost import XGBClassifier


import warnings
warnings.filterwarnings('ignore')

from helpers import MongoDAO
from datetime import datetime

#Shap-values
# from shap import TreeExplainer


In [93]:
#Load data
mongo = MongoDAO('admin', 'admin', 'localhost', 27017)
mongo.set_db('bra_paralelo')
mongo.set_collection('feature_store')
account_store_collection = mongo.get_collection()

In [94]:
# load dataset
df = pd.DataFrame(list(account_store_collection.find()))

# rows and columns of the data
print(df.shape)

# visualise the dataset
df.head()

(14641, 17)


Unnamed: 0,_id,account_id,churn,ltv,months,product,channel,date_of_birth,country,post,like,adview,message,reply,newfriend,dislike,unfriend
0,62f98647a101fb33dedd2c23,1,0,49.95,5,standard,appstore2,1948-09-10,CN,329.0,323.0,184.0,123.0,59.0,57.0,25.0,1.0
1,62f98647a101fb33dedd2c24,2,0,49.95,5,standard,appstore1,1952-11-06,AU,340.0,207.0,117.0,33.0,8.0,30.0,388.0,1.0
2,62f98647a101fb33dedd2c25,3,0,49.95,5,standard,appstore1,2002-11-10,GB,303.0,793.0,1043.0,95.0,26.0,38.0,88.0,3.0
3,62f98647a101fb33dedd2c26,4,0,49.95,5,standard,appstore1,1976-06-18,US,140.0,434.0,135.0,201.0,39.0,55.0,160.0,1.0
4,62f98647a101fb33dedd2c27,5,0,49.95,5,standard,appstore1,1975-11-18,KR,21.0,46.0,66.0,54.0,8.0,3.0,16.0,2.0


In [95]:
X_train,X_test,y_train,y_test = train_test_split(df.drop(['churn'],axis=1),
                                                 df['churn'],
                                                 test_size=0.2,
                                                 random_state=0)


X_train.shape, X_test.shape

((11712, 16), (2929, 16))

In [96]:
#CONFIGURATION
NUM_FREQUENT_INPUTATIONS = [ 'post',
                             'like',
                             'adview',
                             'message',
                             'reply',
                             'newfriend',
                             'dislike',
                             'unfriend']


CATEGORICAL_INPTUTATIONS = ['country']

NUM_VARS_TO_NORMALIZE = ['ltv',
                         'post',
                         'like',
                         'adview',
                         'message',
                         'reply',
                         'newfriend',
                         'dislike',
                         'unfriend',
                         'idade']

CAT_VARS_RARE_LABELS = ['channel',
                        'country']

CAT_VARS_ONE_HOT_ENCODER = ['channel',
                        'country']


DROP_FEATURES = ['_id',
                 'account_id',
                 'months',
                 'product',
                 'date_of_birth']



In [97]:
under = RandomUnderSampler(sampling_strategy='majority')
X_train, y_train = under.fit_resample(X_train, y_train)
X_test, y_test = under.fit_resample(X_test, y_test)

In [98]:
class_pipe = Pipeline([

    #======== drop features ===========
    ('drop_features', DropFeatures(DROP_FEATURES)),

    #======== input missing values =======
    ('missing_imputation', MeanMedianImputer(
        imputation_method='median', variables=NUM_FREQUENT_INPUTATIONS)),

    ('categorical_imputation', CategoricalImputer(
        imputation_method='frequent', variables=CATEGORICAL_INPTUTATIONS)),

    #========= RareLabelEncoder =========
    ('rare_label_encoder', RareLabelEncoder(
        variables=CAT_VARS_RARE_LABELS, tol=0.05, n_categories=10)),

    #========= OneHotEncoder =========
    ('one_hot_encoder', OneHotEncoder(variables=CAT_VARS_ONE_HOT_ENCODER,drop_last=False, top_categories=6)),



    #========= MinMaxScaler  =========
    ('min_max', MinMaxScaler(feature_range=(0,1))),


    #========= XGBClassifier =========
    ('xgb', XGBClassifier(n_estimators=100,
                          learning_rate=0.1,
                          max_depth=3,
                          min_child_weight=1,
                          gamma=1,
                          subsample=0.8,
                          colsample_bytree=0.6,
                          objective='binary:logistic',
                          nthread=4,
                          scale_pos_weight=1,
                          seed=27))

])

In [99]:
class_pipe.fit(X_train, y_train)

In [100]:
y_test_pred = class_pipe.predict(X_test)


In [101]:
print(confusion_matrix(y_test,y_test_pred))


[[344 132]
 [ 25 451]]


In [102]:
print('Accuracy: ', accuracy_score(y_test,y_test_pred))
print('ROC AUC: ', roc_auc_score(y_test,y_test_pred))
print('Recall: ', recall_score(y_test,y_test_pred))
print('Precision: ', precision_score(y_test,y_test_pred))
print('F1: ', f1_score(y_test,y_test_pred))

Accuracy:  0.8350840336134454
ROC AUC:  0.8350840336134453
Recall:  0.9474789915966386
Precision:  0.7735849056603774
F1:  0.8517469310670444


In [103]:
import joblib

joblib.dump(class_pipe, '../model/class_pipe.pkl')

['../model/class_pipe.pkl']

In [105]:
df.to_csv('../output/class_pipe.csv')