# Customer purchase prediction

# Dataset preparation

## combine orders , reviews, payments dataset with customer dataset and dropping unwanted columns


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
item=pd.read_csv("olist_order_items_dataset.csv")
order=pd.read_csv("olist_orders_dataset.csv")
product=pd.read_csv("olist_products_dataset.csv")
customer=pd.read_csv("olist_customers_dataset.csv")
review=pd.read_csv("olist_order_reviews_dataset.csv") 
payment=pd.read_csv("olist_order_payments_dataset.csv")

In [6]:
df1=pd.merge(customer.drop(columns=['customer_zip_code_prefix']),order[['customer_id','order_id','order_purchase_timestamp']],on='customer_id')

In [8]:
df2 = pd.merge(df1,review[['order_id','review_score']],on='order_id')

In [9]:
paid = payment[['order_id','payment_value']].groupby('order_id').sum().reset_index()

In [10]:
df3 = pd.merge(df2,paid,on='order_id')

In [11]:
## making purchase date in datetime format
df3['order_purchase_timestamp']=pd.to_datetime(df3['order_purchase_timestamp']).dt.date

In [47]:
df3['order_purchase_timestamp']

0        2017-05-16
1        2018-01-12
2        2018-05-19
3        2018-03-13
4        2018-07-29
            ...    
99994    2018-04-07
99995    2018-04-04
99996    2018-04-08
99997    2017-11-03
99998    2017-12-19
Name: order_purchase_timestamp, Length: 99999, dtype: object

## We are going to sperate out 180 days (last 6 months) from the maximum day of purchase by customers out of the dataset. 2018/4 to 2018/10.
## We are using that data to predict whether the customer made a purchase in that period. We are going to use data until 2018/4 to make that prediction

In [17]:
number_of_days_for_purchase=180
max_date_in_data= df3['order_purchase_timestamp'].max()

In [67]:
min_date_in_data= df3['order_purchase_timestamp'].min()

In [68]:
min_date_in_data

datetime.date(2016, 9, 4)

In [51]:
from datetime import datetime, timedelta 

In [66]:
max_date_in_data

datetime.date(2018, 10, 17)

In [59]:
data_split_date=max_date_in_data -timedelta(days=number_of_days_for_purchase)

In [69]:
data_split_date

datetime.date(2018, 4, 20)

In [60]:
df_full=df3[df3['order_purchase_timestamp']<=data_split_date]
df_last=df3[df3['order_purchase_timestamp']>data_split_date]

In [61]:
df_last_180 =pd.DataFrame({'customer_unique_id':df3['customer_unique_id'].values.tolist()})

In [62]:
df_last_180=df_last_180.merge(df_last.groupby(['customer_unique_id'])['payment_value'].sum().reset_index(),how='outer',on='customer_unique_id')

In [63]:
df_last_180.fillna(0,inplace=True)

In [64]:
df_last_180['purchased']=np.where(df_last_180['payment_value']>0, 1,0)

In [65]:
df_last_180.head()

Unnamed: 0,customer_unique_id,payment_value,purchased
0,861eff4711a542e4b93843c6dd7febb0,0.0,0
1,290c77bc529b7ac935b93aa66c333dc3,0.0,0
2,060e732b5b29e8181a18229c7b0b2b5e,157.73,1
3,259dac757896d24d7702b9acbbff3f3c,0.0,0
4,345ecd01c38d18a9036ed96c73b8d066,252.25,1


## The customers who bought items in that 6 months were given 1 and others were given 0. these values will act as binary classification for our prediction model

# Feature Engineering

In [70]:
## total amount per customer
tot_Amount=df_full.groupby('customer_unique_id')['payment_value'].sum().reset_index().rename(columns={'payment_value':'total_amount'})

In [71]:
## average review given
avg_review=df_full.groupby('customer_unique_id')['review_score'].mean().reset_index().rename(columns={'review_score':'avg_review'})

In [73]:
## months between first purchase and today
min_max_date=df_full.groupby('customer_unique_id')['order_purchase_timestamp'].agg([min,max])
min_max_date['diff_first_today']=(datetime.today().date()-min_max_date['min']).dt.days

In [74]:
## months from first to last purchase
min_max_date['max']=pd.to_datetime(min_max_date['max'])
min_max_date['min']=pd.to_datetime(min_max_date['min'])
min_max_date['diff_first_last']=(min_max_date['max']-min_max_date['min']).dt.days

In [75]:
## recency of Sales 
max_date=df_full['order_purchase_timestamp'].max()

min_max_date['recency']=(np.datetime64(max_date)-min_max_date['max'])/np.timedelta64(1, 'M')

In [76]:
## Frequency of Sales
frequency=df_full.groupby('customer_unique_id')['order_id'].count().reset_index().rename(columns={'order_id':'frequency'})

In [77]:
## joining all the engineered features
dataset=pd.merge(tot_Amount,avg_review,on='customer_unique_id')
dataset=pd.merge(dataset,min_max_date,on='customer_unique_id')
dataset=pd.merge(dataset,frequency,on='customer_unique_id')
dataset=pd.merge(dataset,df_full[['customer_unique_id','customer_city','customer_state']],on='customer_unique_id')
dataset.drop(['min','max'],axis=1,inplace=True)

In [79]:
from sklearn.preprocessing import StandardScaler,LabelEncoder,MinMaxScaler

In [80]:
### label encoding city and state names
encoder=LabelEncoder()
dataset['customer_city']=encoder.fit_transform(dataset['customer_city'])
dataset['customer_state']=encoder.fit_transform(dataset['customer_state'])

In [81]:
##merging with the label dataset we have created 
dataset_full=dataset.merge(df_last_180[['customer_unique_id','purchased']],on='customer_unique_id')
dataset_full.drop(columns='customer_unique_id',inplace=True)

# Evaluating Machine learning models

### we are going to do binary classification to predict whether a customer will purchase within the next 6 months Since this is a classification problem we use several models here , For comparing the models we use metrics like r2 score and accuracy score.


1. Linear Regression
2. Random Forest Classifier
3. Extra Trees Classifier
4. Gradient Boost Classifier
5. K nearest nerighbour Classifier

In [82]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import r2_score,accuracy_score,classification_report
# Splitting data into training/testing
from sklearn.model_selection import train_test_split,GridSearchCV

In [83]:
##splitting to train and test dataset
X_train,X_test,y_train,y_test=train_test_split(dataset_full.iloc[:,:-1],dataset_full.iloc[:,-1], test_size=0.2, random_state=31)

In [84]:
## calculating gini scores for the models
def Gini(y_true, y_pred):
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]
    
    # sort rows on prediction column 
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:,0].argsort()][::-1,0]
    pred_order = arr[arr[:,1].argsort()][::-1,0]
    
    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    L_ones = np.linspace(1/n_samples, 1, n_samples)
    
    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)
    
    # normalize to true Gini coefficient
    return G_pred/G_true

In [85]:
# Evaluate several ml models by training on training set and testing on testing set
def evaluate(X_train, X_test, y_train, y_test):
    # Names of models
    model_name_list = ['Linear Regression',
                      'Random Forest', 'Extra Trees',
                       'Gradient Boosted','KNeighbors']

    
    # Instantiate the models
    model1 = LinearRegression()
    model3 = RandomForestClassifier(n_estimators=50)
    model4 = ExtraTreesClassifier(n_estimators=50)
    model6 = GradientBoostingClassifier(n_estimators=20)
    model7= KNeighborsClassifier(n_neighbors = 5)
    
    # Dataframe for results
    results = pd.DataFrame(columns=['r2', 'accuracy','gini'], index = model_name_list)
    
    # Train and predict with each model
    for i, model in enumerate([model1, model3, model4, model6,model7]):
   
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
               
        # Metrics
        r2 = r2_score(y_test,predictions)
        preds=np.where(predictions>0.5,1,0)
        accuracy = accuracy_score(y_test,preds)
        gini=Gini(y_test,preds)
        
        # Insert results into the dataframe
        model_name = model_name_list[i]
        results.loc[model_name, :] = [r2, accuracy,gini]
    
    return results

In [86]:
results=evaluate(X_train, X_test, y_train, y_test)

In [87]:
results

Unnamed: 0,r2,accuracy,gini
Linear Regression,0.051544,0.977076,-0.0151789
Random Forest,0.835912,0.996325,0.862908
Extra Trees,0.83035,0.9962,0.883488
Gradient Boosted,0.0961274,0.979755,0.134211
KNeighbors,0.338087,0.985174,0.442184


### As r2 score reaches 1, the model is much capable of explaining the variance in purchase probability prediction of the customers. So Random Forest Classifier is the best classifier for the prediction.

### The gini coefficient of the forest classifier and Extratrees classifier is closer to one, meaning there is inequality in the predicted values