# Next Purchase Prediction

What if you knew whether a customer is likely to make another purchase within the next 7 days?

You can build your strategy on top of that and come up with lots of tactical actions like:

- No promotional offer to this customer since s/he will make a purchase anyway
- Nudge the customer with inbound marketing if there is no purchase in the predicted time window (or fire the guy who did the prediction) 

**Outline**

- Feature Engineering
- Modeling
- Drawing a conclusion

Data for the analysis was taken from [kaggle](https://www.kaggle.com/mkechinov/ecommerce-purchase-history-from-jewelry-store).

### Data and Packages 

In [None]:
# Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import date
%matplotlib inline

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Dataset
# Read the event log from a path relative to the current working directory.
data = pd.read_csv('data/events.csv')
# Cell output: (number of rows, number of columns).
data.shape

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Summarise the columns of the frame (dtypes and non-null counts).
data.info()

In [None]:
# Remove every row that has a missing value in any column; `data` is modified in place.
data.dropna(inplace=True)

In [None]:
# Identifier columns are stored with the generic object dtype ('O').
cols = ['user_session', 'product_id', 'category_id', 'user_id']
data = data.astype({col: 'O' for col in cols})

# Parse the event timestamps into pandas datetimes.
data['event_time'] = pd.to_datetime(data['event_time'])

## EDA

As usual, we start with a quick exploratory data analysis. Describe the plots and draw conclusions.

In [None]:
def plot_continuous(dataset, var_name):
    """Plot the distribution of one column and mark its mean and median.

    dataset  -- object indexable by column name (a DataFrame in this notebook)
    var_name -- name of the column to plot; also used in the plot title

    The mean is drawn as a dashed black vertical line, the median as a solid
    red one.
    """
    values = dataset[var_name]
    sns.displot(values)

    markers = (
        (values.mean(), {'color': 'k', 'linestyle': 'dashed', 'linewidth': 1}),
        (values.median(), {'color': 'r', 'linewidth': 1}),
    )
    for position, line_style in markers:
        plt.axvline(position, **line_style)

    plt.title(f'Distribution of variable "{var_name}"')
    
def plot_objects(dataset, var_name):
    """Draw a bar chart with the number of rows per distinct value of a column.

    dataset  -- object indexable by column name (a DataFrame in this notebook)
    var_name -- name of the column to count; also used in the plot title
    """
    # Pass the column by keyword: the first positional parameter of
    # `countplot` is `data`, not `x`, in newer seaborn releases, so a
    # positional Series is not reliably interpreted as the x variable.
    sns.countplot(x=dataset[var_name])
    plt.title(f'Distribution of variable "{var_name}"')

In [None]:
# Price distribution over all rows, with mean and median markers.
plot_continuous(data, 'price')

In [None]:
# Same plot restricted to rows with price below 3000.
plot_continuous(data[data['price'] < 3000], 'price')

In [None]:
# Number of rows per brand.
plot_objects(data, 'brand')

In [None]:
# Number of rows per event type.
plot_objects(data, 'event_type')

In [None]:
# Keep only the rows whose event_type is 'view'.
# NOTE(review): the notebook is about purchases and the frames derived from
# this one are named purch_hist / purch_next -- confirm that 'view' (rather
# than 'purchase') is the intended event type.
data = data[data.event_type=='view']

In [None]:
# Summary statistics of the event timestamps (shows the covered time span).
data.event_time.describe()

We're going to use 5 months of data to make predictions.

In [None]:
# Events strictly before the split date form the history; events on or after
# it form the window in which the "next" event is looked up.
split_date = '2021-01-31'
event_times = data['event_time']
purch_hist = data.loc[event_times < split_date]
purch_next = data.loc[event_times >= split_date].reset_index(drop=True)

In [None]:
# One row per distinct user seen in the history window.
customer = pd.DataFrame({'user_id': purch_hist['user_id'].unique()})

In [None]:
# Sizes of the history frame, the next-window frame and the customer list.
purch_hist.shape, purch_next.shape, customer.shape

### Labeling

In [None]:
# Earliest event per user in the next window.
next_ = (
    purch_next.groupby('user_id')
    .agg(MinNextPurchase=('event_time', 'min'))
    .reset_index()
)
next_.head()

In [None]:
# Latest event per user in the history window.
last_ = (
    purch_hist.groupby('user_id')
    .agg(MaxHistPurchase=('event_time', 'max'))
    .reset_index()
)

# Left join keeps users without any event in the next window; their
# MinNextPurchase (and therefore NextPurchaseDay) stays missing.
purchase = last_.merge(next_, on='user_id', how='left')
gap = purchase['MinNextPurchase'] - purchase['MaxHistPurchase']
purchase['NextPurchaseDay'] = gap.dt.days
purchase.head()

In [None]:
# Attach the day gap to every customer of the history window.
labels = purchase[['user_id', 'NextPurchaseDay']]
customer = customer.merge(labels, on='user_id', how='left')
print('Number of retained customers: ', customer.NextPurchaseDay.notna().sum())

# Missing values (no event in the next window) are replaced by the sentinel -1.
customer = customer.fillna(-1)
customer.shape

In [None]:
# Display customers ordered by day gap; the -1 sentinel rows come first.
customer.sort_values(by='NextPurchaseDay')

###  New Features

In [None]:
# Per-user event timestamps in chronological order, one row per distinct
# (user, timestamp) pair.
day_order = (
    purch_hist[['user_id', 'event_time']]
    .sort_values(['user_id', 'event_time'])
    .drop_duplicates(subset=['user_id', 'event_time'], keep='first')
)

# last 3 purchase dates
for lag, column in enumerate(['Previous', 'Previous2', 'Previous3'], start=1):
    day_order[column] = day_order.groupby('user_id')['event_time'].shift(lag)


In [None]:
# Inspect the first ten rows, including the shifted timestamp columns.
day_order.head(10)

In [None]:
# Missing-value count per column, plus its share of all rows in percent.
na_counts = day_order.isna().sum()
data_nas = pd.DataFrame({'Number of NAs': na_counts})
data_nas['Percentage of NAs'] = (data_nas['Number of NAs'] / len(day_order) * 100).round(3)
data_nas

In [None]:
# Whole days elapsed between each event and the 1st / 2nd / 3rd preceding
# event of the same user.
lag_columns = {
    'Previous': 'DayDiff',
    'Previous2': 'DayDiff2',
    'Previous3': 'DayDiff3',
}
for previous_col, diff_col in lag_columns.items():
    elapsed = day_order['event_time'] - day_order[previous_col]
    day_order[diff_col] = elapsed.dt.days
day_order.head(10)

In [None]:
# Mean and standard deviation of the gap between consecutive events, per user.
day_diff = (
    day_order.groupby('user_id')
    .agg(DayDiffMean=('DayDiff', 'mean'), DayDiffStd=('DayDiff', 'std'))
    .reset_index()
)
day_diff.head()

In [None]:
# Keep only the final (most recent) row of each user.
is_earlier_row = day_order.duplicated(subset=['user_id'], keep='last')
last_day = day_order[~is_earlier_row]
last_day.head()

In [None]:
# USE 'DayDiff3' in case you have more data
feature_cols = ['user_id', 'DayDiff', 'DayDiff2', 'DayDiffMean', 'DayDiffStd']

last_day = last_day.merge(day_diff, on='user_id')
# Gaps that could not be computed (too few events) become the sentinel -1.
customer = customer.merge(last_day[feature_cols], on='user_id').fillna(-1)

len(customer)

In [None]:
# Preview the customer table with the gap features attached.
customer.head()

In [None]:
# Summary statistics of the customer table; note the -1 sentinels are included.
customer.describe()

In [None]:
# Average price per user, computed from the history window only. `data` also
# holds the events on/after the split date that define the label, so
# aggregating over it would leak the prediction period into the features.
other_features = purch_hist.groupby('user_id', as_index=False)['price'].mean()
#Add more if you have other handy features

In [None]:
# Combine gap features with the remaining features; index the result by user.
final = (
    customer.merge(other_features, on='user_id', how='left')
    .set_index('user_id')
)
final.head()

In [None]:
# (number of customers, number of columns) of the modelling table.
final.shape

## Version 1: make a classification task

For the first model we are going to create labels for the dataset. This may help to predict churn as well.

In [None]:
# Work on a copy so that `final` stays untouched for the second model version.
final_v1 = final.copy()

In [None]:
# Distribution of the day gap, including the -1 sentinel rows.
final_v1.NextPurchaseDay.describe()

# Use final.NextPurchaseDay.describe(np.linspace(0,1,10)) to get percentiles of data

In [None]:
# Same summary, restricted to customers that have a real (non-sentinel) gap.
has_next_event = final_v1['NextPurchaseDay'] > -1
final_v1.loc[has_next_event, 'NextPurchaseDay'].describe()

In [None]:
# Three-class label derived from NextPurchaseDay (-1 is the "no next event" sentinel).
final_v1['NextPurchase'] = 2 # returned after 31 or more days
final_v1.loc[final_v1.NextPurchaseDay < 31,'NextPurchase'] = 1 # returned in fewer than 31 days
final_v1.loc[final_v1.NextPurchaseDay == -1,'NextPurchase'] = 0 # never returned (overrides the previous line, since -1 < 31)

In [None]:
# Share of each class in percent, normalised by the frame being summarised
# rather than by the length of a different DataFrame.
final_v1.NextPurchase.value_counts(normalize=True) * 100

In [None]:
# The raw day gap defines the label, so it must not remain among the features.
final_v1 = final_v1.drop(columns='NextPurchaseDay')

In [None]:
# Preview the features together with the NextPurchase label.
final_v1.head()

### Modeling

We have very imbalanced data. [Some of the techniques were covered in our last tutorial](https://github.com/LilitYolyan/customer_behavior_analysis/blob/master/Week_6_Churn_Analysis_Prediction.ipynb).

In [None]:
# Separate the label from the features, then hold out 20% of the rows for
# testing with a fixed seed.
y = final_v1['NextPurchase']
X = final_v1.drop(columns='NextPurchase')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=44
)

In [None]:
# Decision tree with default hyperparameters. Seeding the estimator makes the
# fitted tree, and therefore the scores printed below, reproducible between
# runs (the train/test split is already seeded).
clf = tree.DecisionTreeClassifier(random_state=44)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

print("Accuracy:", accuracy_score(y_test, y_pred))

## Version 2: Classification only on retained customers

In the next step, we will use only the data of customers who have not left. This way we can get a more accurate model and mitigate the problem of imbalanced data.

In [None]:
# Customers that came back (NextPurchaseDay is not the -1 sentinel).
# An explicit copy is taken because new columns are assigned to `retained`
# afterwards; writing into a boolean-mask slice of `final` is a chained
# assignment that pandas warns about.
retained = final[final['NextPurchaseDay'] != -1].copy()
retained.head()

In [None]:
# Summary of the day gap with the 33rd and 66th percentiles reported.
retained.NextPurchaseDay.describe(percentiles=[0.33, 0.66])

In [None]:
# Three buckets by days until the next event. The cut points 11 and 33 are
# hard-coded (presumably read off the percentiles printed above -- verify
# whenever the data changes).
retained['NextPurchase'] = 2 # returned after 33 or more days
retained.loc[retained.NextPurchaseDay < 33,'NextPurchase'] = 1 # returned within 11 to 32 days
retained.loc[retained.NextPurchaseDay < 11,'NextPurchase'] = 0 # returned in fewer than 11 days

In [None]:
# The raw day gap defines the label, so it must not remain among the features.
retained = retained.drop(columns='NextPurchaseDay')
retained.head()

In [None]:
# Share of each class among the retained customers, in percent.
retained.NextPurchase.value_counts()/len(retained)*100 

In [None]:
# Separate the label from the features, then hold out 20% of the rows for
# testing with a fixed seed.
y = retained['NextPurchase']
X = retained.drop(columns='NextPurchase')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=44
)

In [None]:
# Decision tree with default hyperparameters. Seeding the estimator makes the
# fitted tree, and therefore the scores printed below, reproducible between
# runs (the train/test split is already seeded).
clf = tree.DecisionTreeClassifier(random_state=44)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
# Bar chart of the fitted tree's feature importances. Tick positions and
# labels are derived from the frame the model was trained on, so the plot
# stays correct when features are added or removed (no hard-coded count).
feature_names = X_train.columns
positions = np.arange(len(feature_names))
plt.bar(positions, clf.feature_importances_)
names = plt.xticks(positions, feature_names)

Parameters which define the model architecture are referred to as hyperparameters and thus this process of searching for the ideal model architecture is referred to as hyperparameter tuning.

Hyperparameters are not model parameters and they cannot be directly trained from the data. Model parameters are learned during training when we optimize a loss function using something like gradient descent. The process for learning parameter values is shown generally below.

In [None]:
# GridSearchCV is not among the notebook's top-level imports, so bring it
# into scope here.
from sklearn.model_selection import GridSearchCV

# Exhaustive search over decision-tree hyperparameters with 5-fold
# cross-validation on the training split.
param_grid = {
    # None considers every feature at each split. The former 'auto' option
    # was an alias of 'sqrt' for classification trees and is no longer
    # accepted by recent scikit-learn releases.
    'max_features': [None, 'sqrt', 'log2'],
    'ccp_alpha': [0.1, .01, .001],
    'max_depth': [5, 6, 7, 8, 9],
    'criterion': ['gini', 'entropy'],
}
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, verbose=True)
# The training labels are bound to `y_train` (lower-case) by the split cell.
grid_search.fit(X_train, y_train)

In [None]:
# Estimator refitted with the best parameter combination found by the search.
final_model = grid_search.best_estimator_
final_model