# Predicting `Purchase` in `Caravan` dataset with a boosted tree classifier

## Preparing the data

Information on the dataset can be [found here](https://rdrr.io/cran/ISLR/man/Caravan.html)

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns; sns.set_style('whitegrid')
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
caravan = pd.read_csv('../../datasets/Caravan.csv', index_col=0)
caravan.reset_index(inplace=True, drop=True)
caravan.head()

Unnamed: 0,MOSTYPE,MAANTHUI,MGEMOMV,MGEMLEEF,MOSHOOFD,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,...,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,Purchase
0,33,1,3,2,8,0,5,1,3,7,...,0,0,0,1,0,0,0,0,0,No
1,37,1,2,2,8,1,4,1,4,6,...,0,0,0,1,0,0,0,0,0,No
2,37,1,2,2,8,0,4,2,4,3,...,0,0,0,1,0,0,0,0,0,No
3,9,1,3,3,3,2,3,2,4,5,...,0,0,0,1,0,0,0,0,0,No
4,40,1,4,2,10,1,4,1,4,7,...,0,0,0,1,0,0,0,0,0,No


In [6]:
caravan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5822 entries, 0 to 5821
Data columns (total 86 columns):
MOSTYPE     5822 non-null int64
MAANTHUI    5822 non-null int64
MGEMOMV     5822 non-null int64
MGEMLEEF    5822 non-null int64
MOSHOOFD    5822 non-null int64
MGODRK      5822 non-null int64
MGODPR      5822 non-null int64
MGODOV      5822 non-null int64
MGODGE      5822 non-null int64
MRELGE      5822 non-null int64
MRELSA      5822 non-null int64
MRELOV      5822 non-null int64
MFALLEEN    5822 non-null int64
MFGEKIND    5822 non-null int64
MFWEKIND    5822 non-null int64
MOPLHOOG    5822 non-null int64
MOPLMIDD    5822 non-null int64
MOPLLAAG    5822 non-null int64
MBERHOOG    5822 non-null int64
MBERZELF    5822 non-null int64
MBERBOER    5822 non-null int64
MBERMIDD    5822 non-null int64
MBERARBG    5822 non-null int64
MBERARBO    5822 non-null int64
MSKA        5822 non-null int64
MSKB1       5822 non-null int64
MSKB2       5822 non-null int64
MSKC        5822 non-null int

In [8]:
caravan = pd.get_dummies(caravan, drop_first=True)

## a. Train test split

In [9]:
from sklearn.model_selection import train_test_split

X, y = caravan.drop(columns=['Purchase_Yes']), caravan['Purchase_Yes']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=1000, random_state=27)
X_train.shape

(1000, 85)

## b. Fit boosted tree model

In [13]:
from sklearn.ensemble import GradientBoostingClassifier

boost_clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.01)
boost_clf.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [18]:
feat_imp = pd.DataFrame({'Feature Importance': boost_clf.feature_importances_},
                        index=X.columns).sort_values(by='Feature Importance', ascending=False)

feat_imp

Unnamed: 0,Feature Importance
PBRAND,7.989783e-02
MOPLLAAG,7.138105e-02
MKOOPKLA,6.779442e-02
MBERARBG,6.553880e-02
PPERSAUT,5.104349e-02
MSKD,4.747047e-02
MINK7512,4.596157e-02
PPLEZIER,3.968977e-02
MGODOV,3.914403e-02
MOPLMIDD,3.819071e-02


## c. Predict `Purchase` and compare with KNN, Logistic Regression

### Confusion Matrix and precision for Boosted Tree model

In [33]:
from sklearn.metrics import confusion_matrix

y_act = pd.Series(['Yes' if entry == 1 else 'No' for entry in y_test],
                  name='Actual')
y_pred = pd.Series(['Yes' if prob[1] > 0.2 else 'No' for prob in boost_clf.predict_proba(X_test)],
                        name='Predicted')
boost_tree_conf = pd.crosstab(y_act, y_pred, margins=True)
boost_tree_conf

Predicted,No,Yes,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,4328,204,4532
Yes,242,48,290
All,4570,252,4822


In [36]:
# fraction of people predicted to make a purchase that actually do - this is the "precision"
boost_tree_conf.at['Yes', 'Yes']/(boost_tree_conf.at['Yes', 'No'] + boost_tree_conf.at['Yes', 'Yes'])

0.16551724137931034

### Confusion matrix and precision for KNN model

In [34]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

y_act = pd.Series(['Yes' if entry == 1 else 'No' for entry in y_test],
                  name='Actual')
y_pred = pd.Series(['Yes' if prob[1] > 0.2 else 'No' for prob in knn_clf.predict_proba(X_test)],
                        name='Predicted')
knn_conf = pd.crosstab(y_act, y_pred, margins=True)
knn_conf

Predicted,No,Yes,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,4340,192,4532
Yes,259,31,290
All,4599,223,4822


In [35]:
# fraction of people predicted to make a purchase that actually do - this is the "precision"
knn_conf.at['Yes', 'Yes']/(knn_conf.at['Yes', 'No'] + knn_conf.at['Yes', 'Yes'])

0.10689655172413794

### Confusion matrix and precision for Logistic Regression

In [40]:
from sklearn.linear_model import LogisticRegression

logreg_clf = LogisticRegression()
logreg_clf.fit(X_train, y_train)

y_act = pd.Series(['Yes' if entry == 1 else 'No' for entry in y_test],
                  name='Actual')
y_pred = pd.Series(['Yes' if prob[1] > 0.2 else 'No' for prob in logreg_clf.predict_proba(X_test)],
                        name='Predicted')
logreg_conf = pd.crosstab(y_act, y_pred, margins=True)
logreg_conf

Predicted,No,Yes,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,4275,257,4532
Yes,254,36,290
All,4529,293,4822


In [41]:
# fraction of people predicted to make a purchase that actually do - this is the "precision"
logreg_conf.at['Yes', 'Yes']/(logreg_conf.at['Yes', 'No'] + logreg_conf.at['Yes', 'Yes'])

0.12413793103448276