# Binary Classification: Expenditure Churn Prediction

## Load the dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_excel("./Expenditure-churn (3).xlsx")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       10000 non-null  int64  
 1   gender    10000 non-null  int64  
 2   marital   10000 non-null  float64
 3   dep       10000 non-null  int64  
 4   Income    10000 non-null  float64
 5   Job yrs   10000 non-null  int64  
 6   Town yrs  10000 non-null  int64  
 7   Yrs Ed    10000 non-null  int64  
 8   Dri Lic   10000 non-null  int64  
 9   Own Home  10000 non-null  int64  
 10  # Cred C  10000 non-null  int64  
 11  Churn     10000 non-null  int64  
dtypes: float64(2), int64(10)
memory usage: 937.6 KB


## Use a Pipeline to preprocess and fit binary classification models

In [3]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate features from target
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Standardise the whole dataset
std_scaler = StandardScaler().fit(X_train)

def preprocessor(X):
    D = np.copy(X)
    D = std_scaler.transform(D)
    return D

In [4]:
preprocess_transformer = FunctionTransformer(preprocessor)
preprocess_transformer

FunctionTransformer(func=<function preprocessor at 0x0000023B7150CC18>)

### Fit a LogisticRegression() model

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

p1 = Pipeline([('scaler', preprocess_transformer),
              ('Logistic Regression', LogisticRegression())])
p1

Pipeline(steps=[('scaler',
                 FunctionTransformer(func=<function preprocessor at 0x0000023B7150CC18>)),
                ('Logistic Regression', LogisticRegression())])

In [12]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, accuracy_score

def fit_and_print(p, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test):
    # Fit the transformer
    p.fit(X_train, y_train)
    # Predict the train and test outputs
    test_prediction =p.predict(X_test)
    
    # Print the errors
    print("Accuracy Score:   "+str(accuracy_score(test_prediction, y_test)*100))
    print("Precision Score:  "+str(precision_score(test_prediction, y_test)*100))
    print("Recall Score:     "+str(recall_score(test_prediction, y_test)*100))
    print("roc_auc_score:    "+str(accuracy_score(test_prediction, y_test)*100))
    print("\nConfusion Matrix:\n", confusion_matrix(test_prediction, y_test))

In [13]:
fit_and_print(p1)

Accuracy Score:   99.44
Precision Score:  98.32335329341318
Recall Score:     100.0
roc_auc_score:    99.44

Confusion Matrix:
 [[1665   14]
 [   0  821]]


### Fit a RandomForestClassifier()

In [14]:
from sklearn.ensemble import RandomForestClassifier
p2 = Pipeline([('scaler', preprocess_transformer),
              ('RFC', RandomForestClassifier())])
fit_and_print(p2)

Accuracy Score:   98.56
Precision Score:  97.96407185628743
Recall Score:     97.7299880525687
roc_auc_score:    98.56

Confusion Matrix:
 [[1646   17]
 [  19  818]]


#### Feature Importances

In [15]:
rnd_clf = RandomForestClassifier()
sc = StandardScaler()
X = sc.fit_transform(X)
rnd_clf.fit(X, y)
for name, score in zip(dataset.iloc[:,:-1].columns.to_list(), rnd_clf.feature_importances_):
    print(name, score)

age 0.03753936938934116
gender 0.0025993071716105738
marital 0.0032149887099501323
dep 0.004585368083725957
Income 0.01788431192341633
Job yrs 0.01868626002885181
Town yrs 0.024382190821894634
Yrs Ed 0.0237583870586684
Dri Lic 0.0017683376331745356
Own Home 0.007394257836903157
# Cred C 0.8581872213424634


### Fit a voting classifier

In [16]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(kernel='linear', random_state=0)
dec_clf = DecisionTreeClassifier()

voting_clf = VotingClassifier(
estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf), ('dec', dec_clf)],
voting='hard')

In [17]:
p3 = Pipeline([('scaler', preprocess_transformer),
              ('VCL', voting_clf)])
fit_and_print(p3)

Accuracy Score:   99.4
Precision Score:  98.20359281437125
Recall Score:     100.0
roc_auc_score:    99.4

Confusion Matrix:
 [[1665   15]
 [   0  820]]


### Bagging and Pasting

both bagging and pasting allow training instances to be sampled several
times across multiple predictors, but only bagging allows training instances to be sampled
several times for the same predictor. 

In [18]:
from sklearn.ensemble import BaggingClassifier
bag_clf = BaggingClassifier(
SVC(kernel='linear', random_state=0), n_estimators=500,
max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True)

In [19]:
p4 = Pipeline([('scaler', preprocess_transformer),
              ('VCL', bag_clf)])
fit_and_print(p4)

Accuracy Score:   97.88
Precision Score:  93.65269461077844
Recall Score:     100.0
roc_auc_score:    97.88

Confusion Matrix:
 [[1665   53]
 [   0  782]]


### Boosting with a Adaboost and DecisionTree and SVC

In [25]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(
DecisionTreeClassifier(max_depth=1), n_estimators=200,
algorithm="SAMME.R", learning_rate=0.5)

In [26]:
p5 = Pipeline([('scaler', preprocess_transformer),
              ('AdaCL', ada_clf)])
fit_and_print(p5)

Accuracy Score:   99.52
Precision Score:  99.28143712574851
Recall Score:     99.28143712574851
roc_auc_score:    99.52

Confusion Matrix:
 [[1659    6]
 [   6  829]]


### Gradient Boosting

In [22]:
from sklearn.ensemble import GradientBoostingClassifier
p6 = Pipeline([('scaler', preprocess_transformer),
              ('GBC', GradientBoostingClassifier())])
fit_and_print(p6)

Accuracy Score:   99.2
Precision Score:  98.80239520958084
Recall Score:     98.80239520958084
roc_auc_score:    99.2

Confusion Matrix:
 [[1655   10]
 [  10  825]]


### XGBoost Classifier

In [27]:
from xgboost import XGBClassifier

p7 = Pipeline([('scaler', preprocess_transformer),
              ('XGBC', XGBClassifier())])
fit_and_print(p7)

Accuracy Score:   99.52
Precision Score:  99.52095808383234
Recall Score:     99.04648390941597
roc_auc_score:    99.52

Confusion Matrix:
 [[1657    4]
 [   8  831]]


# Conclusion

* The Boosting algorithms (AdaBoost, GradientBoost, and XGBoost) performed better than all other models.
* XGBoost was the best performing model overall

In [32]:
# Standardize the features
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Fit the XGBoost model
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

In [33]:
# Create a DataFrame for y_test and y_pred
data_dict = {"Actual": y_test, "Prediction": y_pred}
results_df = pd.DataFrame(data_dict)

In [34]:
results_df

Unnamed: 0,Actual,Prediction
0,1,1
1,0,0
2,0,0
3,1,1
4,0,0
...,...,...
2495,0,0
2496,0,0
2497,1,1
2498,0,0
