# Vodafone COPS. Model design & training. Local version using scikit-learn

In [26]:
import pandas as pd
import numpy as np
import time
import logging
import seaborn as sns
import json
from collections import defaultdict
from scipy import sparse, stats
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE

# Set the root logger's threshold to INFO, then keep a handle to it.
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger()

In [62]:
# Utilities

def compute_linear_model_statistics(model, X, featurenames):
    """
    Linear model statistics computing.
    """
    
    params = np.append(model.intercept_, model.coef_)
    predictions = model.predict(X)

    newX = np.append(np.ones((len(X),1)), X, axis=1)
    MSE = (sum((y-predictions) ** 2)) / (len(newX) - len(newX[0]))

    var_b = MSE * (np.linalg.inv(np.dot(newX.T,newX)).diagonal())
    sd_b = np.sqrt(var_b)
    ts_b = params/ sd_b
    p_values =[2 * (1 - stats.t.cdf(np.abs(i),(len(newX) - 1))) for i in ts_b]

    sd_b = np.round(sd_b, 3)
    ts_b = np.round(ts_b, 3)
    p_values = np.round(p_values, 3)
    params = np.round(params, 4)
    
    statistics = pd.DataFrame({'coefficien_name': ['intercept'] + featurenames, 
                         'coefficien_value': params, 
                         'standard_error': sd_b,
                         't_value': ts_b,
                         'p_value': p_values}).sort_values('coefficien_value', ascending=False)
    statistics.drop(0, inplace=True)
    return statistics.style.bar(subset=['coefficien_value'], align='mid', color=['#d65f5f', '#5fba7d'])

def clean_data(df):
    """
    Custom data cleaning performed before preprocessing.

    Mutates ``df`` in place and returns ``None``:

    * empty strings are turned into NaN so they can be imputed later;
    * ``billing_cycle_id`` is reduced to the last two characters of its string
      form;
    * ``nationality`` is collapsed to two values: 'Español' for 'España' and
      'Extranjero' for anything else. Missing values also become 'Extranjero',
      because NaN does not compare equal to 'España'.

    The frame must contain the ``billing_cycle_id`` and ``nationality``
    columns; a ``KeyError`` is raised otherwise.

    The region-mapping file ``../data/world_regions.json`` is not read here:
    its contents were only needed by a disabled alternative that mapped
    ``nationality`` to a world region.
    """
    # Replace on the frame itself. Calling replace(..., inplace=True) on a
    # ``df.loc[:, cols]`` selection only modifies a temporary copy.
    df.replace('', np.nan, inplace=True)
    # Plain column assignment swaps the whole column, so writing strings into a
    # column that was numeric is safe.
    df['billing_cycle_id'] = df['billing_cycle_id'].apply(lambda x: str(x)[-2:])
    df['nationality'] = df['nationality'].apply(lambda x: 'Español' if x == 'España' else 'Extranjero')

def preprocessing_train(train,
                        string_cols_imput_most_freq,
                        numeric_cols_imput_mean,
                        numeric_cols_imput_zero,
                        targetcol):
    """
    Training set preprocessing script.

    Steps: select the feature columns, run ``clean_data`` on them, impute and
    one-hot encode the string columns, impute the numeric columns (mean or
    zero) and standardize everything with a ``StandardScaler`` fitted here.

    Parameters
    ----------
    train : DataFrame holding the feature columns and ``targetcol``.
    string_cols_imput_most_freq : list of categorical columns; missing values
        are filled with the most frequent value, then the column is replaced by
        one dummy column per observed value.
    numeric_cols_imput_mean : list of numeric columns filled with their mean.
    numeric_cols_imput_zero : list of numeric columns filled with 0.
    targetcol : name of the label column.

    Returns
    -------
    X : standardized feature matrix returned by ``scaler.fit_transform``.
    y : values of ``train[targetcol]``.
    params : ``defaultdict(dict)`` with, per column, the fitted
        'most_frequent_value' / 'dummies_cols' / 'mean_value', plus the fitted
        scaler under the 'scaler' key.
    featurenames : list of the final feature column names, in the column order
        of ``X``.
    """

    logging.info('#########################################################')
    logging.info("################ Preprocessing train set ################")

    params = defaultdict(dict)

    y = train[targetcol].values

    feature_cols = (string_cols_imput_most_freq
                    + numeric_cols_imput_mean
                    + numeric_cols_imput_zero)

    # Work on a copy so the caller's frame is left untouched.
    features = train[feature_cols].copy()

    clean_data(features)

    for col in string_cols_imput_most_freq:
        logging.info("Processing feature {col}".format(col=col))
        most_frequent_value = features[col].value_counts().idxmax()
        # Assign the filled column back. An in-place fillna on the selected
        # column is a chained operation that may never reach `features`.
        features[col] = features[col].fillna(most_frequent_value)
        params[col]['most_frequent_value'] = most_frequent_value
        dummies_cols = sorted([col + '_' + e for e in list(features[col].unique())])
        params[col]['dummies_cols'] = dummies_cols
        dummy_df = pd.get_dummies(features[col].astype('category'),
                                  prefix=col, drop_first=False)
        features = features.join(dummy_df[dummies_cols])
        features.drop(col, axis=1, inplace=True)

    for col in numeric_cols_imput_mean:
        logging.info("Processing feature {col}".format(col=col))
        mean_value = features[col].mean()
        features[col] = features[col].fillna(mean_value)
        params[col]['mean_value'] = mean_value

    for col in numeric_cols_imput_zero:
        logging.info("Processing feature {col}".format(col=col))
        features[col] = features[col].fillna(0)

    logging.info("Feature columns are: {columns}".format(
        columns=",".join(['\n * ' + col for col in features.columns])))

    scaler = StandardScaler()

    logging.info("Performing standarization")
    X = scaler.fit_transform(features.values)

    featurenames = list(features.columns)

    # Kept alongside the per-column parameters so the test sets can reuse the
    # exact same scaling.
    params['scaler'] = scaler

    logging.info("Done!")
    logging.info('#########################################################')

    return X, y, params, featurenames

def preprocessing_test(test,
                       string_cols_imput_most_freq,
                       numeric_cols_imput_mean,
                       numeric_cols_imput_zero,
                       targetcol,
                       params):
    """
    Test set preprocessing script.

    Applies to ``test`` the transformations whose parameters are stored in
    ``params``: imputation values, the expected dummy columns and the scaler.
    Nothing is re-estimated from the test data.

    Parameters
    ----------
    test : DataFrame holding the feature columns and ``targetcol``.
    string_cols_imput_most_freq : list of categorical columns to impute with
        ``params[col]['most_frequent_value']`` and one-hot encode.
    numeric_cols_imput_mean : list of numeric columns to fill with
        ``params[col]['mean_value']``.
    numeric_cols_imput_zero : list of numeric columns to fill with 0.
    targetcol : name of the label column.
    params : mapping with the per-column entries above, ``'dummies_cols'`` for
        each categorical column, and an object with a ``transform`` method
        under the 'scaler' key.

    Returns
    -------
    X : feature matrix returned by ``params['scaler'].transform``.
    y : values of ``test[targetcol]``.
    """

    logging.info('#########################################################')
    logging.info("################ Preprocessing test set ################")

    y = test[targetcol].values

    feature_cols = (string_cols_imput_most_freq
                    + numeric_cols_imput_mean
                    + numeric_cols_imput_zero)

    # Work on a copy so the caller's frame is left untouched.
    features = test[feature_cols].copy()

    clean_data(features)

    for col in string_cols_imput_most_freq:
        logging.info("Processing feature {col}".format(col=col))
        most_frequent_value = params[col]['most_frequent_value']
        # Assign the filled column back. An in-place fillna on the selected
        # column is a chained operation that may never reach `features`.
        features[col] = features[col].fillna(most_frequent_value)
        dummies_cols_seen = [col + '_' + e for e in list(features[col].unique())]
        dummies_cols_expected = params[col]['dummies_cols']

        dummy_df = pd.get_dummies(features[col].astype('category'),
                                  prefix=col, drop_first=False)

        dummies_not_seen_before = set(dummies_cols_seen) - set(dummies_cols_expected)
        dummies_dissapeared = set(dummies_cols_expected) - set(dummies_cols_seen)

        # Sorted so the log output does not depend on set iteration order.
        for dummy in sorted(dummies_dissapeared):
            logging.info("Expected category {dummy}, setting to 0...".format(dummy=dummy))
            dummy_df[dummy] = 0

        for dummy in sorted(dummies_not_seen_before):
            logging.info("Found new category {dummy}, dropping...".format(dummy=dummy))
            dummy_df.drop(dummy, inplace=True, axis=1)

        # Selecting by the expected list also fixes the column order.
        features = features.join(dummy_df[dummies_cols_expected])
        features.drop(col, axis=1, inplace=True)

    for col in numeric_cols_imput_mean:
        logging.info("Processing feature {col}".format(col=col))
        mean_value = params[col]['mean_value']
        features[col] = features[col].fillna(mean_value)

    for col in numeric_cols_imput_zero:
        logging.info("Processing feature {col}".format(col=col))
        features[col] = features[col].fillna(0)

    logging.info("Feature columns are: {columns}".format(
        columns=",".join(['\n * ' + col for col in features.columns])))
    logging.info("Performing standarization")
    X = params['scaler'].transform(features.values)

    logging.info("Done!")
    logging.info('#########################################################\n')

    return X, y

def compute_and_save_predictions(model, X, y, name):
    """
    Score ``X`` with ``model`` and write the results to a csv file.

    The file ``../data/<name>_local_predictions.csv`` gets three columns: the
    second column of ``model.predict_proba(X)``, the output of
    ``model.predict(X)`` and the given labels ``y``.
    """
    logging.info("Computing predictions for {name}...".format(name=name))
    second_column_proba = model.predict_proba(X)[:, 1]
    predicted = model.predict(X)
    results = pd.DataFrame({
        'probability': second_column_proba,
        'prediction': predicted,
        'label': y,
    })

    logging.info("Saving {name} predictions to csv file...".format(name=name))
    destination = '../data/{name}_local_predictions.csv'.format(name=name)
    results.to_csv(destination)

## scikit-learn models

### pandas Data Loading

In [67]:
# Column groups. The dtype dict below is derived from them, so every column is
# spelled out only once.

string_cols_imput_null = [
    'data_plan_c', 'voice_plan_c', 'promo_code_vf', 'promo_code_tarif',
    'zip_code', 'region_code', 'gender', 'type_ident', 'nationality',
]

numeric_cols_imput_mean = [
    'n_lines', 'n_lines_pre', 'n_lines_post', 'age',
    'months_to_end_promo_tarif', 'months_to_end_promo_vf',
]

# Each call topic contributes a `<topic>_c` and a `<topic>_c_minus_1` counter.
_call_topics = ['billing', 'churn', 'tariff', 'dsl_inc', 'mobile_inc',
                'device_upgr', 'device_del_rep', 'new_adds', 'ser_man']
_call_count_cols = [
    template.format(topic=topic)
    for topic in _call_topics
    for template in ('n_calls_{topic}_c', 'n_calls_{topic}_c_minus_1')
]

numeric_cols_imput_zero = ['voice_plan_change', 'data_plan_change'] + _call_count_cols

targetcol = 'label'

# dtype dict

dtypes = {'billing_cycle_id': 'int', targetcol: 'int'}
dtypes.update({column: 'object' for column in string_cols_imput_null})
dtypes.update({column: 'float' for column in numeric_cols_imput_mean})
dtypes.update({column: 'int' for column in numeric_cols_imput_zero})

# Complete column set to load

cols = (string_cols_imput_null
        + numeric_cols_imput_mean
        + numeric_cols_imput_zero
        + [targetcol]
        + ['billing_cycle_id'])

# csv loading: the three files are read with the same column subset and dtypes

logging.info("Loading train data...")
train = pd.read_csv("../data/train.csv", usecols=cols, dtype=dtypes)
logging.info("Loading test data 1...")
test_1 = pd.read_csv("../data/test_1.csv", usecols=cols, dtype=dtypes)
logging.info("Loading test data 2...")
test_2 = pd.read_csv("../data/test_2.csv", usecols=cols, dtype=dtypes)

# Only two billing cycles of the second test file are kept.
billing_cycle_id_filter = test_2['billing_cycle_id'].isin([20171201, 20171208])
test_2 = test_2[billing_cycle_id_filter]

# reset_index() gives the stacked frame a fresh 0..n-1 index.
whole = pd.concat([train, test_1, test_2]).reset_index()

INFO:root:Loading train data...
INFO:root:Loading test data 1...
INFO:root:Loading test data 2...


### Data Preprocessing

In [69]:
# Selected features to process

# Statement order matters: `train` is rebuilt from the current `test_1` before
# `test_1` and `test_2` are replaced by the two billing cycles of `test_2`.
train = pd.concat([test_1]).reset_index()
_is_cycle_20171201 = test_2['billing_cycle_id'] == 20171201
_is_cycle_20171208 = test_2['billing_cycle_id'] == 20171208
test_1 = test_2[_is_cycle_20171201]
test_2 = test_2[_is_cycle_20171208]

string_cols_imput_most_freq = ['nationality', 'billing_cycle_id', 'gender']

numeric_cols_imput_mean = ['n_lines_pre', 'n_lines_post', 'age', 'months_to_end_promo_vf']

# One `<topic>_c` / `<topic>_c_minus_1` pair of call counters per topic.
numeric_cols_imput_zero = ['voice_plan_change'] + [
    'n_calls_{topic}_c{suffix}'.format(topic=topic, suffix=suffix)
    for topic in ('billing', 'churn', 'tariff', 'dsl_inc', 'mobile_inc',
                  'device_upgr', 'device_del_rep', 'new_adds', 'ser_man')
    for suffix in ('', '_minus_1')
]

# The same three column groups are passed to every preprocessing call.
_column_groups = (string_cols_imput_most_freq,
                  numeric_cols_imput_mean,
                  numeric_cols_imput_zero)

X, y, params, featurenames = preprocessing_train(train, *_column_groups, targetcol)

X_test_1, y_test_1 = preprocessing_test(test_1, *_column_groups, targetcol, params)

X_test_2, y_test_2 = preprocessing_test(test_2, *_column_groups, targetcol, params)

# A 5% sample of the stacked data, preprocessed with its own fitted parameters.
X_whole, y_whole, _, _ = preprocessing_train(whole.sample(frac=0.05), *_column_groups, targetcol)

# sm = SMOTE(ratio= 1.0, n_jobs=10)
# X, y = sm.fit_sample(X, y)

INFO:root:#########################################################
INFO:root:################ Preprocessing train set ################
INFO:root:Processing feature nationality
INFO:root:Processing feature billing_cycle_id
INFO:root:Processing feature gender
INFO:root:Processing feature n_lines_pre
INFO:root:Processing feature n_lines_post
INFO:root:Processing feature age
INFO:root:Processing feature months_to_end_promo_vf
INFO:root:Processing feature voice_plan_change
INFO:root:Processing feature n_calls_billing_c
INFO:root:Processing feature n_calls_billing_c_minus_1
INFO:root:Processing feature n_calls_churn_c
INFO:root:Processing feature n_calls_churn_c_minus_1
INFO:root:Processing feature n_calls_tariff_c
INFO:root:Processing feature n_calls_tariff_c_minus_1
INFO:root:Processing feature n_calls_dsl_inc_c
INFO:root:Processing feature n_calls_dsl_inc_c_minus_1
INFO:root:Processing feature n_calls_mobile_inc_c
INFO:root:Processing feature n_calls_mobile_inc_c_minus_1
INFO:root:Proces

### Logistic Regression Model Definition & Training

In [84]:
# X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2)
# Cross-validated logistic regression: 10 folds over a grid of 10 C values,
# candidates compared by ROC AUC.
lm = LogisticRegressionCV(Cs=10, cv=10, scoring='roc_auc', n_jobs=16, verbose=3)

logging.info("Fitting logistic regression model...")
lm.fit(X, y)
logging.info("Done!")

INFO:root:Fitting logistic regression model...
[Parallel(n_jobs=16)]: Done   3 out of  10 | elapsed:  2.1min remaining:  5.0min
[Parallel(n_jobs=16)]: Done   7 out of  10 | elapsed:  2.5min remaining:  1.1min
[Parallel(n_jobs=16)]: Done  10 out of  10 | elapsed:  2.7min finished
INFO:root:Done!


### Other Models Definition & Training

In [None]:
# Candidate values for each random forest hyper-parameter.

# Number of trees: 10 evenly spaced values from 10 to 150, truncated to ints
n_estimators = np.linspace(start=10, stop=150, num=10).astype(int).tolist()
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum tree depth: 10, 20, ..., 110, plus None
max_depth = np.linspace(10, 110, num=11).astype(int).tolist() + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

# The random grid, keyed by estimator parameter name
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

# Randomized hyper-parameter search over `random_grid`: 50 sampled candidates,
# 5-fold cross-validation each, compared by ROC AUC.
model = RandomForestClassifier()

_search_settings = dict(
    param_distributions=random_grid,
    n_iter=50,
    cv=5,
    scoring='roc_auc',
    n_jobs=16,
    verbose=3,
)
model_random_grid = RandomizedSearchCV(estimator=model, **_search_settings)

logging.info("Fitting classifier model...")
model_random_grid.fit(X, y)
logging.info("Done!")

INFO:root:Fitting classifier model...


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] n_estimators=134, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=10, bootstrap=False 
[CV] n_estimators=134, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=10, bootstrap=False 
[CV] n_estimators=134, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=10, bootstrap=False 
[CV] n_estimators=134, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=10, bootstrap=False 
[CV] n_estimators=134, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=10, bootstrap=False 
[CV] n_estimators=118, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=True 
[CV] n_estimators=118, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=True 
[CV] n_estimators=118, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=20, bootstrap=True 
[CV] n_estimators=118, min

In [17]:
# Random forest with default settings, fitted on the sampled `whole` data.
logging.info("Fitting classifier model...")
model = RandomForestClassifier()
model.fit(X_whole, y_whole)
logging.info("Done!")

INFO:root:Fitting classifier model...
INFO:root:Done!


In [64]:
# `sklearn.externals.joblib` no longer exists in recent scikit-learn releases.
# Prefer the standalone `joblib` package and fall back to the vendored copy on
# old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted randomized search object in the working directory.
joblib.dump(model_random_grid, 'rf.pkl')

['rf.pkl']

### Computing predictions

In [73]:
# Export the logistic regression predictions for each data split.
for _split_name, _split_X, _split_y in (('train', X, y),
                                        ('test_1', X_test_1, y_test_1),
                                        ('test_2', X_test_2, y_test_2)):
    compute_and_save_predictions(lm, _split_X, _split_y, _split_name)

INFO:root:Computing predictions for train...
INFO:root:Saving train predictions to csv file...
INFO:root:Computing predictions for test_1...
INFO:root:Saving test_1 predictions to csv file...
INFO:root:Computing predictions for test_2...
INFO:root:Saving test_2 predictions to csv file...


In [66]:
# Export the randomized-search model predictions for each data split.
# The split names decide the output file names.
for _split_name, _split_X, _split_y in (('train', X, y),
                                        ('test_1', X_test_1, y_test_1),
                                        ('test_2', X_test_2, y_test_2)):
    compute_and_save_predictions(model_random_grid, _split_X, _split_y, _split_name)

INFO:root:Computing predictions for train...
INFO:root:Saving train predictions to csv file...
INFO:root:Computing predictions for test_1...
INFO:root:Saving test_1 predictions to csv file...
INFO:root:Computing predictions for test_2...
INFO:root:Saving test_2 predictions to csv file...


### Model statistics

In [75]:
# Coefficient statistics for the logistic regression `lm` on the training
# matrix `X`, labelled with `featurenames`.
model_stats = compute_linear_model_statistics(lm, X, featurenames)
# Bare expression at the end of the cell, presumably so the notebook renders
# the returned object as the cell output.
model_stats

  from ipykernel import kernelapp as app


Unnamed: 0,coefficien_name,coefficien_value,p_value,standard_error,t_value
6,n_calls_billing_c,0.2178,0.0,0.0,4389.18
7,n_calls_billing_c_minus_1,0.1637,0.0,0.0,3318.82
4,months_to_end_promo_vf,0.0958,0.0,0.0,2027.39
8,n_calls_churn_c,0.0902,0.0,0.0,1830.92
5,voice_plan_change,0.0868,0.0,0.0,1800.6
23,n_calls_ser_man_c_minus_1,0.0787,0.0,0.0,1600.28
12,n_calls_dsl_inc_c,0.0768,0.0,0.0,1566.36
22,n_calls_ser_man_c,0.0608,0.0,0.0,1253.94
9,n_calls_churn_c_minus_1,0.0506,0.0,0.0,1023.4
13,n_calls_dsl_inc_c_minus_1,0.0483,0.0,0.0,972.743
