In [1]:
import pandas as pd
import numpy as np
import pickle
# load the training set and the test set from their CSV files
train_path = '/Users/alessia/Desktop/churn/train.csv'
test_path = '/Users/alessia/Desktop/churn/test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [2]:
# encode Gender as a binary indicator: 'Male' -> 0, 'Female' -> 1
# (any other value is left unchanged by replace)
gender_codes = {'Male': 0, 'Female': 1}
test['Gender'] = test['Gender'].replace(gender_codes)

In [3]:
# one-hot encode Geography as integer 0/1 indicator columns;
# dtype=int yields 0/1 directly, so no boolean -> integer replace() is needed
dummies = pd.get_dummies(test['Geography'], dtype=int)

# keep only France and Germany: Spain is redundant (already encoded in the 0s
# of France and Germany). reindex also creates an all-zero column when a
# country does not occur in this file, so both columns always exist.
dummies = dummies.reindex(columns=['France', 'Germany'], fill_value=0)

# build a new frame instead of mutating in place: every column of test except
# Geography, followed by the France and Germany indicators
test_new = pd.concat([test.drop(columns=['Geography']), dummies], axis=1)

We do not remove outliers from the test set, so that predictions are still produced for these data points. We do, however, flag them, so that their predictions can be treated with caution. Printing them out makes it possible to inspect, if needed, why they are considered outliers.

In [4]:
def outliers(train, test, continuous_train):
    """Print the test rows that fall outside the training set's IQR fences.

    For each column name in ``continuous_train`` the 25th and 75th
    percentiles are computed on ``train``; the fences are placed 1.5 * IQR
    below the first and above the third quartile. The rows of ``test`` whose
    value lies above the higher fence or below the lower fence are printed.
    Nothing is removed from either frame.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame the quartiles and fences are computed from.
    test : pandas.DataFrame
        Frame whose rows are checked against the fences.
    continuous_train : iterable of str
        Names of the columns to check; each must exist in both frames.

    Returns
    -------
    None
        The function only prints its findings.
    """
    for attr in continuous_train:
        # nanpercentile skips missing values; with np.percentile a single NaN
        # in the training column turns both fences into NaN, and every
        # comparison below would then silently report no outliers
        q1, q3 = np.nanpercentile(train[attr], [25, 75])
        iqr = q3 - q1
        lower_fence = q1 - 1.5 * iqr
        higher_fence = q3 + 1.5 * iqr

        print("Interquartile Range for {}: {}".format(attr, iqr))
        print("Lower Fence for {}: {}".format(attr, lower_fence))
        print("Higher Fence for {}: {}".format(attr, higher_fence))

        # select (but do not remove) the rows beyond each fence
        outliers_up = test[test[attr] > higher_fence]
        outliers_down = test[test[attr] < lower_fence]
        # one frame per line, so the two tables do not run into each other
        print(outliers_up, outliers_down, sep="\n")

In [5]:
# continuous columns to check against the training set's fences
continuous_columns = ["CreditScore", "Age", "Balance", "EstimatedSalary"]
outliers(train, test, continuous_columns)

Interquartile Range for CreditScore: 113.0
Lower Fence for CreditScore: 427.5
Higher Fence for CreditScore: 879.5
Empty DataFrame
Columns: [id, CustomerId, Surname, CreditScore, Geography, Gender, Age, Tenure, Balance, NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary]
Index: []             id  CustomerId   Surname  CreditScore Geography  Gender   Age  \
239     165273    15724863  Sheppard          420    France       1  40.0   
488     165522    15726720  McKenzie          418    France       0  37.0   
1116    166150    15582951  Crawford          425    France       1  29.0   
1845    166879    15634606      Chin          425    France       1  32.0   
2086    167120    15636999       Mao          414    France       0  38.0   
...        ...         ...       ...          ...       ...     ...   ...   
107854  272888    15638989  Lettiere          418    France       1  42.0   
108711  273745    15695475       Hao          411    France       0  39.0   
109218  274252    1

In [6]:
# import base models and meta-learner for Stacking
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files that come from a trusted source.
# Each file is opened in a `with` block so its handle is closed after loading.
with open('/Users/alessia/Desktop/churn/gb_base.pkl', 'rb') as model_file:
    gb = pickle.load(model_file)
with open('/Users/alessia/Desktop/churn/rf_base.pkl', 'rb') as model_file:
    rf = pickle.load(model_file)
with open('/Users/alessia/Desktop/churn/dt_base.pkl', 'rb') as model_file:
    dt = pickle.load(model_file)

with open('/Users/alessia/Desktop/churn/Stacking.pkl', 'rb') as model_file:
    meta_model = pickle.load(model_file)

# feature columns, in the order passed to every base learner
feature_columns = ['CreditScore', 'Age', 'EstimatedSalary', 'Balance', 'Gender', 'Tenure',
                   'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'France', 'Germany']
X_new = test_new[feature_columns]

# predict on base-learners
dt_pred_new = dt.predict(X_new)
rf_pred_new = rf.predict(X_new)
gb_pred_new = gb.predict(X_new)

# the base-learner predictions become the meta-learner's input features,
# one column per base model in the order dt, rf, gb
X_new_meta = np.column_stack((dt_pred_new, rf_pred_new, gb_pred_new))

# meta-learner output for the test set: second column of predict_proba
# NOTE(review): presumably the probability of the positive (churn) class - confirm against meta_model.classes_
y_probs_test_stack = meta_model.predict_proba(X_new_meta)[:, 1]

In [7]:
# wrap the stacked model's probabilities in a single-column DataFrame
probabilities = pd.DataFrame({'probabilities': y_probs_test_stack})

In [8]:
# append the probabilities column to the test dataframe, aligning on the index
# (test is the loaded frame with Gender already recoded to 0/1)
test_probabilities = pd.concat(objs=[test, probabilities], axis='columns')

In [9]:
# write the test set with its predicted probabilities to a CSV file
# (default settings: the row index is written as the first column)
output_path = '/Users/alessia/Desktop/churn/test_probs.csv'
test_probabilities.to_csv(output_path)