### Import Blocks

In [118]:
import warnings
# Silence every warning category for the rest of the session. This also hides
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

In [119]:
import os
import string
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from datetime import datetime
from pprint import pprint

In [120]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

### Set Environment

In [121]:
# Never truncate columns or rows when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# `df` is an alias for the DataFrame *constructor* (not a frame instance);
# later cells call df(...) to wrap Series / groupby results.
df = pd.DataFrame

### Read in Data

In [122]:
# Load both order tables from the working directory: `known` carries the
# 'return' target used for training, `unknown` is the set to be scored.
known, unknown = map(pd.read_csv, ('BADS_WS1819_known.csv', 'BADS_WS1819_unknown.csv'))

### Preparation for unknown_data IDs

In [123]:
# Re-code the IDs of the unknown set (item_id -> 2 * id + 2, brand_id -> id + 100).
# Presumably this aligns them with the ID scheme used in `known` - TODO confirm
# against the data description.
# Vectorised column arithmetic replaces the former row-wise apply (same values,
# no Python-level loop over rows).
# NOTE: not idempotent - re-running this cell shifts the IDs a second time;
# re-read the CSV first.
unknown['item_id'] = unknown['item_id'] * 2 + 2
unknown['brand_id'] = unknown['brand_id'] + 100

### Brand CE

In [124]:
# Conditional-expectation encoding of brand_id: share of returned rows per brand,
# rounded to 3 decimals. Brands with fewer than 100 rows in `known`, and brands
# that only occur in `unknown`, get the fallback value .48.
# NOTE(review): the encoding is computed from every row of `known`, including the
# rows later held out by train_test_split, so the hold-out AUC is optimistic.
brand_counts = df(known['brand_id'].value_counts())
brand_returns = df(known.groupby(['brand_id'])['return'].agg('sum'))
# Both frames are indexed by brand_id, so a plain index join lines them up.
new_df_brand = brand_counts.join(brand_returns)
new_df_brand.columns = ['count', 'return']
new_df_brand['ce_brand'] = np.round(new_df_brand['return'] / new_df_brand['count'], 3)
new_df_brand.loc[new_df_brand['count'] < 100, 'ce_brand'] = .48

zip_dict_known_brand = new_df_brand['ce_brand'].to_dict()

known['ce_brand'] = known['brand_id'].map(zip_dict_known_brand)
# Brands never seen in `known` map to NaN and receive the fallback.
unknown['ce_brand'] = unknown['brand_id'].map(zip_dict_known_brand).fillna(.48)



### Item CE

In [125]:
# Conditional-expectation encoding of item_id: share of returned rows per item,
# rounded to 3 decimals. Items with fewer than 40 rows in `known`, and items that
# only occur in `unknown`, get the fallback value .48.
# NOTE(review): the encoding is computed from every row of `known`, including the
# rows later held out by train_test_split, so the hold-out AUC is optimistic.
item_counts = df(known['item_id'].value_counts())
item_returns = df(known.groupby(['item_id'])['return'].agg('sum'))
# Both frames are indexed by item_id, so a plain index join lines them up.
new_df_item = item_counts.join(item_returns)
new_df_item.columns = ['count', 'return']
new_df_item['ce_item'] = np.round(new_df_item['return'] / new_df_item['count'], 3)
new_df_item.loc[new_df_item['count'] < 40, 'ce_item'] = .48

zip_dict_known_item = new_df_item['ce_item'].to_dict()

known['ce_item'] = known['item_id'].map(zip_dict_known_item)
# Items never seen in `known` map to NaN and receive the fallback.
unknown['ce_item'] = unknown['item_id'].map(zip_dict_known_item).fillna(.48)



### User CE

In [126]:
# Conditional-expectation encoding of user_id: share of returned rows per user,
# rounded to 3 decimals. Users with fewer than 5 rows in `known`, and users that
# only occur in `unknown`, get the fallback value .48.
# NOTE(review): the encoding is computed from every row of `known`, including the
# rows later held out by train_test_split, so the hold-out AUC is optimistic.
user_counts = df(known['user_id'].value_counts())
user_returns = df(known.groupby(['user_id'])['return'].agg('sum'))
# Both frames are indexed by user_id, so a plain index join lines them up.
new_df_user = user_counts.join(user_returns)
new_df_user.columns = ['count', 'return']
new_df_user['ce_user'] = np.round(new_df_user['return'] / new_df_user['count'], 3)
new_df_user.loc[new_df_user['count'] < 5, 'ce_user'] = .48

zip_dict_known_user = new_df_user['ce_user'].to_dict()

known['ce_user'] = known['user_id'].map(zip_dict_known_user)
# Users never seen in `known` map to NaN and receive the fallback.
unknown['ce_user'] = unknown['user_id'].map(zip_dict_known_user).fillna(.48)



### Drop 'useless' features

In [127]:
# Remove the three columns that are not used as model inputs, in one pass per frame.
unused_columns = ['user_state', 'user_title', 'item_color']
known = known.drop(columns = unused_columns)
unknown = unknown.drop(columns = unused_columns)

### Round price

In [128]:
# Round item_price to the nearest whole number in both frames.
for frame in (known, unknown):
    frame['item_price'] = frame['item_price'].round()

### Create 'customer_value' feature

In [129]:
# customer_value = sum of item_price per user over known AND unknown orders.
joined = pd.concat([known, unknown], sort = False)
customer_value = df(joined.groupby(['user_id'])['item_price'].agg('sum'))

# user_id -> total spend lookup
zip_dict_value = customer_value['item_price'].to_dict()

for frame in (known, unknown):
    frame['customer_value'] = frame['user_id'].map(zip_dict_value)

### Create 'count_user' feature

In [130]:
# count_user = number of order rows per user over known AND unknown orders.
joined = pd.concat([known, unknown], sort = False)
user_counts = df(joined['user_id'].value_counts())

# Select the single counts column by position: its label is 'user_id' on older
# pandas but 'count' on pandas >= 2.0, so attribute access by name is not portable.
zip_dict_count = user_counts.iloc[:, 0].to_dict()

known['count_user'] = known['user_id'].map(zip_dict_count)
unknown['count_user'] = unknown['user_id'].map(zip_dict_count)

### Create 'new_user' boolean

In [131]:
# new_user = 1 when the user has at most one order row overall, else 0.
# Vectorised comparison instead of a row-wise apply; a missing count_user
# compares False and therefore yields 0, as before.
known['new_user'] = (known['count_user'] <= 1).astype(int)
unknown['new_user'] = (unknown['count_user'] <= 1).astype(int)

### Fix lower-case upper-case issue in sizes

In [132]:
# Normalise item_size to lower case so that e.g. 'XL' and 'xl' compare equal.
for frame in (known, unknown):
    frame['item_size'] = frame['item_size'].str.lower()

### Fix sizes with plus signs

In [133]:
# Rewrite a literal '+' in item_size as '.5' (plain string replacement, no regex).
for frame in (known, unknown):
    frame['item_size'] = frame['item_size'].str.replace('+', '.5', regex = False)

### Derive 'item types' by item

In [134]:
def _classify_item_type(sizes):
    """Return the type code (1-7) for one item, given all size strings seen for it.

    Rules, checked in this order:
      1 - at least one size contains '.5' (e.g. produced from '+' by the earlier cell)
      3 - all sizes are digit strings with min < 15 and max < 25
      4 - all sizes are digit strings with min > 2000 and max > 2000
      5 - all sizes are digit strings with min > 80 and max < 200
      6 - all sizes are digit strings but none of the ranges above match
      7 - every size is 'unsized'
      2 - anything else
    """
    if any('.5' in size for size in sizes):
        return 1
    if all(size.isdigit() for size in sizes):
        numeric_sizes = [float(size) for size in sizes]
        min_size, max_size = min(numeric_sizes), max(numeric_sizes)
        if min_size < 15 and max_size < 25:
            return 3
        if min_size > 2000 and max_size > 2000:
            return 4
        if min_size > 80 and max_size < 200:
            return 5
        return 6
    if all(size == 'unsized' for size in sizes):
        return 7
    return 2


# Collect every size string observed per item across known AND unknown orders.
joined = pd.concat([known, unknown], sort = False)
zip_dict_type = joined.groupby('item_id')['item_size'].apply(list).to_dict()

# item_id -> type code
item_type_dict = {item: _classify_item_type(sizes) for item, sizes in zip_dict_type.items()}

### Create dummies for type

In [None]:
# One-hot encode the item 'type' code (levels 1-7, assigned in "Derive 'item types' by item").
# The 'type' column is only created in the later "Map values to df" cell, so on a
# top-to-bottom run there is nothing to encode yet; the guard turns the former
# KeyError into a no-op. Move this cell below "Map values to df" to enable the encoding.
type_levels = pd.api.types.CategoricalDtype(categories = list(range(1, 8)))
if 'type' in known.columns and 'type' in unknown.columns:
    # A shared categorical dtype yields identical dummy columns in both frames,
    # even when a level is absent from one of them.
    known = known.join(pd.get_dummies(known['type'].astype(type_levels), prefix = 'type'))
    unknown = unknown.join(pd.get_dummies(unknown['type'].astype(type_levels), prefix = 'type'))

### Drop the stacked 'type' column

In [None]:
# Remove the stacked 'type' column (intended for use after one-hot encoding).
# errors='ignore' keeps the cell runnable on a top-to-bottom run, where 'type'
# has not been created yet (it is added in "Map values to df" further down).
known = known.drop(columns = ['type'], errors = 'ignore')
unknown = unknown.drop(columns = ['type'], errors = 'ignore')

### Map values to df

In [135]:
# Attach the per-item type code to every order row via the item_id lookup.
for frame in (known, unknown):
    frame['type'] = frame['item_id'].map(item_type_dict)

### Drop original size feature

In [136]:
# The raw size string is no longer needed once 'type' has been derived from it.
known, unknown = (frame.drop(columns = ['item_size']) for frame in (known, unknown))

### Change date columns to datetime features

In [137]:
# Parse the four date columns of both frames into datetime values.
date_columns = ['order_date', 'delivery_date', 'user_dob', 'user_reg_date']
for frame in (known, unknown):
    for column in date_columns:
        frame[column] = pd.to_datetime(frame[column])

### Create 'was_delivered' feature

In [138]:
# was_delivered = 1 when a delivery date is present, 0 when it is missing.
# Vectorised null check instead of a row-wise apply.
known['was_delivered'] = known['delivery_date'].notnull().astype(int)
unknown['was_delivered'] = unknown['delivery_date'].notnull().astype(int)

### Create 'membership_age' feature

In [139]:
# membership_age_days = whole days between registration and order.
# A value of exactly -1 is reset to 0; other negative values are left untouched.
for frame in (known, unknown):
    frame['membership_age_days'] = (frame['order_date'] - frame['user_reg_date']).dt.days
    frame.loc[frame['membership_age_days'] == -1, 'membership_age_days'] = 0

### Create 'time_to_delivery' feature

In [140]:
# time_to_delivery_days = whole days between order and delivery; rows without a
# computable difference (e.g. missing delivery date) get the sentinel 9999.
# The filled Series is assigned back explicitly: calling fillna(inplace=True) on a
# column selection is chained assignment and does not update the frame under
# pandas Copy-on-Write.
for frame in (known, unknown):
    delivery_days = (frame['delivery_date'] - frame['order_date']).dt.days
    frame['time_to_delivery_days'] = delivery_days.fillna(9999)

### Create 'assumed_age' feature

In [141]:
# assumed_age = days between date of birth and order, divided by 365 and rounded;
# rows without a computable age (e.g. missing date of birth) are set to 0.
# The filled Series is assigned back explicitly: calling fillna(inplace=True) on a
# column selection is chained assignment and does not update the frame under
# pandas Copy-on-Write.
for frame in (known, unknown):
    age_years = (frame['order_date'] - frame['user_dob']).dt.days / 365
    frame['assumed_age'] = age_years.fillna(0).round()

### Remove datetime date features

In [142]:
# The raw datetime columns have been converted into numeric features; drop them.
raw_date_columns = ['order_date', 'delivery_date', 'user_reg_date', 'user_dob']
known, unknown = (frame.drop(columns = raw_date_columns) for frame in (known, unknown))

### Store order IDs and remove columns

In [143]:
# Keep the order-item IDs (needed to label the prediction file), then remove all
# identifier columns from the feature frames.
known_order_id, unknown_order_id = known['order_item_id'], unknown['order_item_id']

identifier_columns = ['order_item_id', 'item_id', 'brand_id', 'user_id']
known = known.drop(columns = identifier_columns)
unknown = unknown.drop(columns = identifier_columns)

### Reorder columns

In [144]:
# Move the target column 'return' to the last position; the split below relies on it.
columns = [name for name in known.columns.values if name != 'return'] + ['return']
known = known[columns]
print(columns)

['item_price', 'ce_brand', 'ce_item', 'ce_user', 'customer_value', 'count_user', 'new_user', 'type', 'was_delivered', 'membership_age_days', 'time_to_delivery_days', 'assumed_age', 'return']


### Separate target & split test / train data

In [145]:
# Features are all columns but the last; the target is the last column ('return').
X = known.iloc[:, :-1]
y = known.iloc[:, -1].values

# 80 / 20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123, test_size = 0.2)

In [146]:
# Sanity check: feature matrix and target vector must have the same row count.
print(X.shape, y.shape, sep = '\n')

(100000, 12)
(100000,)


# XGBoost

## Parameters for RandomizedSearchCV

In [None]:
# Cross-validation folds and number of sampled parameter settings for the search.
folds = 5
parameter_iteration = 12

# Candidate values per XGBoost hyper-parameter.
parameter_grid = dict(
    learning_rate = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth = [3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight = [1, 3, 5, 7],
    gamma = [0.0, 0.1, 0.2 , 0.3, 0.4],
    colsample_bytree = [0.3, 0.4, 0.5, 0.7],
)

## Define search elements

In [None]:
# Stratified, shuffled CV splitter with a fixed seed.
skf = StratifiedKFold(n_splits = folds, shuffle = True, random_state = 1001)

# The estimator gets its own name: binding it to `xgb` would replace the xgboost
# module, and the later `xgb.XGBClassifier(...)` call would then fail.
xgb_estimator = xgb.XGBClassifier(n_estimators = 500, objective = 'binary:logistic',
                                  silent = True, subsample = 0.8, nthread = 1)

# The splitter object is passed instead of the one-shot generator skf.split(X, y),
# so the search can be fitted more than once without re-running this cell.
random_search = RandomizedSearchCV(xgb_estimator, param_distributions = parameter_grid,
                                   n_iter = parameter_iteration, scoring = 'roc_auc',
                                   n_jobs = 4, cv = skf, verbose = 3,
                                   random_state = 1001)

## Fit search

In [None]:
# Run the randomized search on the full labelled set (X, y); cross-validation
# folds come from the `cv` argument given when the search was defined.
random_search.fit(X, y)

## Inspect output

In [None]:
# A notebook cell only displays its last bare expression, so the score and the
# parameters are printed explicitly; the per-candidate CV results are shown as a table.
print('Best CV ROC-AUC:', random_search.best_score_)
pprint(random_search.best_params_)
pd.DataFrame(random_search.cv_results_)

## Define final model

In [151]:
# Hyper-parameters of the final XGBoost model, collected in one mapping.
# `xgb` must refer to the xgboost module here.
final_xgb_params = dict(
    objective = 'binary:logistic',
    n_estimators = 500,
    learning_rate = 0.1,
    max_depth = 3,
    min_child_weight = 7,
    gamma = .1,
    subsample = 0.8,
    colsample_bytree = 0.3,
    reg_alpha = 0.3,
    scale_pos_weight = 1,
    nthread = 1,
    silent = False,
)
xg_classifier = xgb.XGBClassifier(**final_xgb_params)

## Fit model

In [152]:
# Train the final XGBoost model on the training part of the hold-out split only.
xg_classifier.fit(X_train, y_train)

[15:10:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 0 pruned nodes, max_depth=3
[15:10:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 0 pruned nodes, max_depth=3
[15:10:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 0 pruned nodes, max_depth=3
[15:10:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 0 pruned nodes, max_depth=3
[15:10:06] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 0 pruned nodes, max_depth=3
[15:10:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:07] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 0 pruned nodes, max_depth=

[15:10:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[15:10:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 0 pruned nodes, max_depth=3
[15:10:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[15:10:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 0 pruned nodes, max_depth=3
[15:10:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=3
[15:10:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=3
[15:10:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_de

[15:10:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=3
[15:10:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[15:10:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 0 pruned nodes, max_depth=3
[15:10:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 0 pruned nodes, max_depth=3
[15:10:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_de

[15:10:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[15:10:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[15:10:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=3
[15:10:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[15:10:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 0 pruned nodes, max_d

[15:10:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[15:10:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[15:10:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[15:10:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 0 pruned nodes, max_depth=3
[15:10:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_d

[15:10:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[15:10:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[15:10:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_

[15:10:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 0 pruned nodes, max_depth=3
[15:10:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[15:10:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=3
[15:10:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[15:10:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[15:10:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_d

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0.1, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=7, missing=None,
       n_estimators=500, n_jobs=1, nthread=1, objective='binary:logistic',
       random_state=0, reg_alpha=0.3, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=False, subsample=0.8)

## Predict test scores; check ROC

In [153]:
# Probability of the positive class (column 1) for every hold-out row.
predictions_test = xg_classifier.predict_proba(X_test)[:, 1]

# ROC curve points, then the area under that curve.
roc_points = roc_curve(y_test, predictions_test)
false_positive_rate, true_positive_rate, thresholds = roc_points
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

0.7872452608599935

## Predict real scores

In [None]:
# Score the unlabelled orders with the XGBoost model (method name fixed:
# `predict_proba`, not `predict_probab`). Columns are selected in the order
# used for training so features cannot be misaligned.
predictions_real = xg_classifier.predict_proba(unknown[X_train.columns])[:, 1]

# Pair each stored order_item_id with its predicted return probability.
prediction_frame = df(unknown_order_id, columns = ['order_item_id'])
prediction_frame['return'] = predictions_real

## Output as .csv

In [None]:
# Write the predictions to CSV. The target directory can be overridden through
# the PREDICTION_DIR environment variable; without it the original location is used.
prediction_dir = os.environ.get('PREDICTION_DIR', '/Users/alextruesdale/Documents/business-analytics/term_project')
prediction_frame_path = os.path.join(prediction_dir, 'prediction_2.csv')
prediction_frame.to_csv(prediction_frame_path, index = False)

## Show feature importance

In [154]:
# Importance per feature, labelled with the feature-frame column names and
# sorted from most to least important (at most 32 rows are shown).
importance_table = pd.DataFrame(xg_classifier.feature_importances_, unknown.columns)
importance_table = importance_table.sort_values(by = 0, ascending = False).head(32)
print('Feature importance Table\n', importance_table)

Feature importance Table
                               0
time_to_delivery_days  0.127493
ce_item                0.123570
customer_value         0.119320
ce_user                0.115070
membership_age_days    0.111474
item_price             0.094475
assumed_age            0.089572
ce_brand               0.087283
count_user             0.069958
type                   0.042171
new_user               0.009807
was_delivered          0.009807


# Random Forest

## Parameters for RandomizedSearchCV

In [None]:
# Cross-validation folds and number of sampled parameter settings for the search.
folds = 5
parameter_iteration = 12

# Ten evenly spaced tree counts between 30 and 1000.
n_estimators = [int(x) for x in np.linspace(start = 30, stop = 1000, num = 10)]
# For classifiers 'auto' was only an alias of 'sqrt' and is no longer accepted by
# recent scikit-learn releases, so the duplicate entry is dropped.
max_features = ['sqrt']
# Depths 10..100 in steps of 10, plus None (unlimited depth).
max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Candidate values per RandomForest hyper-parameter.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

## Define search elements

In [None]:
# Stratified, shuffled CV splitter with a fixed seed.
skf = StratifiedKFold(n_splits = folds, shuffle = True, random_state = 1001)
# NOTE(review): the forest itself is unseeded, so CV scores vary between runs.
rf = RandomForestClassifier()

# The splitter object is passed instead of the one-shot generator skf.split(X, y),
# so the search can be fitted more than once without re-running this cell.
random_search = RandomizedSearchCV(rf, param_distributions = random_grid,
                                   n_iter = parameter_iteration, scoring = 'roc_auc',
                                   n_jobs = 4, cv = skf, verbose = 3,
                                   random_state = 1001)

## Fit search

In [None]:
# Run the randomized search on the full labelled set (X, y); cross-validation
# folds come from the `cv` argument given when the search was defined.
random_search.fit(X, y)

## Inspect output

In [None]:
# A notebook cell only displays its last bare expression, so the score and the
# parameters are printed explicitly; the per-candidate CV results are shown as a table.
print('Best CV ROC-AUC:', random_search.best_score_)
pprint(random_search.best_params_)
pd.DataFrame(random_search.cv_results_)

## Define final model

In [147]:
# Final random-forest configuration.
# - max_features='sqrt' is what 'auto' meant for classifiers; 'auto' is no longer
#   accepted by recent scikit-learn releases.
# - random_state makes the fitted forest (and its AUC) reproducible.
# NOTE(review): the recorded output below this notebook section reports
# n_estimators=461, bootstrap=True, min_samples_leaf=2, i.e. it was produced by a
# different configuration than this cell - re-run to refresh it.
random_forest = RandomForestClassifier(n_estimators = 676,
                                       min_samples_split = 5,
                                       min_samples_leaf = 1,
                                       max_features = 'sqrt',
                                       max_depth = 10,
                                       bootstrap = False,
                                       random_state = 123,
                                       verbose = 3)

## Fit model

In [148]:
# Train the final random forest on the training part of the hold-out split only.
random_forest.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


building tree 1 of 461
building tree 2 of 461
building tree 3 of 461
building tree 4 of 461
building tree 5 of 461
building tree 6 of 461
building tree 7 of 461
building tree 8 of 461
building tree 9 of 461
building tree 10 of 461
building tree 11 of 461
building tree 12 of 461
building tree 13 of 461
building tree 14 of 461
building tree 15 of 461
building tree 16 of 461
building tree 17 of 461
building tree 18 of 461
building tree 19 of 461
building tree 20 of 461
building tree 21 of 461
building tree 22 of 461
building tree 23 of 461
building tree 24 of 461
building tree 25 of 461
building tree 26 of 461
building tree 27 of 461
building tree 28 of 461
building tree 29 of 461
building tree 30 of 461
building tree 31 of 461
building tree 32 of 461
building tree 33 of 461
building tree 34 of 461
building tree 35 of 461
building tree 36 of 461
building tree 37 of 461
building tree 38 of 461
building tree 39 of 461
building tree 40 of 461
building tree 41 of 461
building tree 42 of 461
b

building tree 337 of 461
building tree 338 of 461
building tree 339 of 461
building tree 340 of 461
building tree 341 of 461
building tree 342 of 461
building tree 343 of 461
building tree 344 of 461
building tree 345 of 461
building tree 346 of 461
building tree 347 of 461
building tree 348 of 461
building tree 349 of 461
building tree 350 of 461
building tree 351 of 461
building tree 352 of 461
building tree 353 of 461
building tree 354 of 461
building tree 355 of 461
building tree 356 of 461
building tree 357 of 461
building tree 358 of 461
building tree 359 of 461
building tree 360 of 461
building tree 361 of 461
building tree 362 of 461
building tree 363 of 461
building tree 364 of 461
building tree 365 of 461
building tree 366 of 461
building tree 367 of 461
building tree 368 of 461
building tree 369 of 461
building tree 370 of 461
building tree 371 of 461
building tree 372 of 461
building tree 373 of 461
building tree 374 of 461
building tree 375 of 461
building tree 376 of 461


[Parallel(n_jobs=1)]: Done 461 out of 461 | elapsed:   38.1s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=461, n_jobs=None,
            oob_score=False, random_state=None, verbose=3,
            warm_start=False)

## Predict test scores; check ROC

In [149]:
# Probability of the positive class (column 1) for every hold-out row.
predictions_test = random_forest.predict_proba(X_test)[:, 1]

# ROC curve points, then the area under that curve.
roc_points = roc_curve(y_test, predictions_test)
false_positive_rate, true_positive_rate, thresholds = roc_points
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 461 out of 461 | elapsed:    1.0s finished


0.78546752757178

## Predict real scores

In [116]:
# Score the unlabelled orders with the random forest. Columns are selected in the
# order used for training, because the forest may consume features by position
# and a different column order would silently give wrong scores.
prediction = random_forest.predict_proba(unknown[X_train.columns])[:, 1]

# Pair each stored order_item_id with its predicted return probability.
prediction_frame = df(unknown_order_id, columns = ['order_item_id'])
prediction_frame['return'] = prediction

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 461 out of 461 | elapsed:    2.3s finished


## Output as .csv

In [117]:
# Write the predictions to CSV. The target directory can be overridden through
# the PREDICTION_DIR environment variable; without it the original location is used.
prediction_dir = os.environ.get('PREDICTION_DIR', '/Users/alextruesdale/Documents/business-analytics/term_project')
prediction_frame_path = os.path.join(prediction_dir, 'prediction_3.csv')
prediction_frame.to_csv(prediction_frame_path, index = False)

## Show feature importance

In [150]:
# Importance per feature, labelled with the feature-frame column names and
# sorted from most to least important (at most 32 rows are shown).
importance_table = pd.DataFrame(random_forest.feature_importances_, unknown.columns)
importance_table = importance_table.sort_values(by = 0, ascending = False).head(32)
print('Feature importance Table\n', importance_table)

Feature importance Table
                               0
ce_user                0.322630
time_to_delivery_days  0.184362
was_delivered          0.162772
ce_item                0.132542
item_price             0.062162
ce_brand               0.045040
customer_value         0.030162
type                   0.015403
count_user             0.015236
assumed_age            0.013795
membership_age_days    0.012290
new_user               0.003605
