In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

  from numpy.core.umath_tests import inner1d


In [2]:
data = pd.read_csv('../../data/mini.csv')

In [3]:
data.head()

Unnamed: 0,customer_id,occurence,cost,item_id
0,416705,2017-05-07 21:58:10.000000,299.0,515274
1,13891,2018-02-10 17:35:11.000000,1090.0,828115
2,9081,2017-12-21 17:13:44.000000,499.0,695501
3,470904,2017-10-31 10:39:49.000000,290.0,899821
4,58500,2018-03-09 20:57:29.000000,150.0,518554


In [4]:
data.shape

(1990712, 4)

In [5]:
data['occurence'] = pd.to_datetime(data.occurence, yearfirst = True)

In [6]:
data['year'] = [d.year for d in data.occurence]
data['month'] = [d.month for d in data.occurence]

In [7]:
data.head()

Unnamed: 0,customer_id,occurence,cost,item_id,year,month
0,416705,2017-05-07 21:58:10,299.0,515274,2017,5
1,13891,2018-02-10 17:35:11,1090.0,828115,2018,2
2,9081,2017-12-21 17:13:44,499.0,695501,2017,12
3,470904,2017-10-31 10:39:49,290.0,899821,2017,10
4,58500,2018-03-09 20:57:29,150.0,518554,2018,3


In [8]:
data.drop(['occurence'], axis=1, inplace=True)

In [9]:
data.head()

Unnamed: 0,customer_id,cost,item_id,year,month
0,416705,299.0,515274,2017,5
1,13891,1090.0,828115,2018,2
2,9081,499.0,695501,2017,12
3,470904,290.0,899821,2017,10
4,58500,150.0,518554,2018,3


In [10]:
data['customer_id'].nunique()

245676

In [11]:
data['year'].nunique()

2

In [12]:
data = data[data['year']==2018]

In [13]:
data.head()

Unnamed: 0,customer_id,cost,item_id,year,month
1,13891,1090.0,828115,2018,2
4,58500,150.0,518554,2018,3
5,572181,990.0,656673,2018,5
6,616443,2390.0,678768,2018,7
7,114804,3990.0,958574,2018,5


In [14]:
data = data[data['month']<7]

In [15]:
data.head()

Unnamed: 0,customer_id,cost,item_id,year,month
1,13891,1090.0,828115,2018,2
4,58500,150.0,518554,2018,3
5,572181,990.0,656673,2018,5
7,114804,3990.0,958574,2018,5
16,568563,1050.0,969194,2018,6


In [16]:
first_5_months = data[data['month']<=5]

In [17]:
first_5_months.head()

Unnamed: 0,customer_id,cost,item_id,year,month
1,13891,1090.0,828115,2018,2
4,58500,150.0,518554,2018,3
5,572181,990.0,656673,2018,5
7,114804,3990.0,958574,2018,5
17,422260,1100.0,451858,2018,1


In [18]:
first_5_months = pd.DataFrame(first_5_months['customer_id'].unique())

In [19]:
first_5_months.head()

Unnamed: 0,0
0,13891
1,58500
2,572181
3,114804
4,422260


In [20]:
target_month = data[data['month']==6]

In [21]:
target_month.head()

Unnamed: 0,customer_id,cost,item_id,year,month
16,568563,1050.0,969194,2018,6
83,589411,2490.0,977771,2018,6
88,11551,850.0,689153,2018,6
93,564748,4499.0,956799,2018,6
94,106202,8990.0,961388,2018,6


In [22]:
target_month = pd.DataFrame(target_month['customer_id'].unique())

In [23]:
target_month.head()

Unnamed: 0,0
0,568563
1,589411
2,11551
3,564748
4,106202


In [24]:
target_month['target']= 'Yes'

In [25]:
target_month.head()

Unnamed: 0,0,target
0,568563,Yes
1,589411,Yes
2,11551,Yes
3,564748,Yes
4,106202,Yes


In [26]:
target_dataset = pd.merge(first_5_months, target_month, how='left')

In [27]:
target_dataset['cost'] = data.groupby('customer_id')['cost'].mean()
target_dataset['number of transactions'] = data.groupby('customer_id')['cost'].count()
target_dataset['total'] = data.groupby('customer_id')['cost'].sum()


In [28]:
target_dataset.head()

Unnamed: 0,0,target,cost,number of transactions,total
0,13891,,,,
1,58500,,1713.0,3.0,5139.0
2,572181,,,,
3,114804,,,,
4,422260,Yes,,,


In [29]:
target_dataset.shape

(131774, 5)

In [30]:
first_5_months.shape

(131774, 1)

In [31]:
target_month.shape

(53727, 2)

In [36]:
target_dataset['target'] = target_dataset['target'].fillna('No')
target_dataset['cost'] = target_dataset['cost'].fillna(target_dataset['cost'].median())
target_dataset['number of transactions'] = target_dataset['number of transactions'].fillna(target_dataset['number of transactions'].median())
target_dataset['total'] = target_dataset['total'].fillna(target_dataset['total'].median())

In [37]:
target_dataset.head()

Unnamed: 0,0,target,cost,number of transactions,total
0,13891,No,2240.0,3.0,7980.0
1,58500,No,1713.0,3.0,5139.0
2,572181,No,2240.0,3.0,7980.0
3,114804,No,2240.0,3.0,7980.0
4,422260,Yes,2240.0,3.0,7980.0


In [38]:
target_dataset['target'] = target_dataset['target'].map({"Yes":1, "No":0})

In [39]:
target_dataset.head()

Unnamed: 0,0,target,cost,number of transactions,total
0,13891,0,2240.0,3.0,7980.0
1,58500,0,1713.0,3.0,5139.0
2,572181,0,2240.0,3.0,7980.0
3,114804,0,2240.0,3.0,7980.0
4,422260,1,2240.0,3.0,7980.0


In [41]:
y = target_dataset['target']
X = target_dataset.drop('target', axis=1)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=17)

In [43]:
first_tree = DecisionTreeClassifier(random_state=17)

In [44]:
np.mean(cross_val_score(first_tree, X_train, y_train, cv=5))

0.6391952016557091

In [45]:
tree_params = {'max_depth': np.arange(1, 11), 'max_features':[.5, .7, 1]}

In [46]:
tree_grid = GridSearchCV(first_tree, tree_params, cv=5, n_jobs=-1)

In [47]:
tree_grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), 'max_features': [0.5, 0.7, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [48]:
tree_grid.best_score_, tree_grid.best_params_

(0.7573855443891545, {'max_depth': 1, 'max_features': 0.5})

In [49]:
tree_test_pred = tree_grid.predict(X_test)

In [50]:
accuracy_score(y_test, tree_test_pred)

0.7577466926365315