In [1]:
# Loading autotime for the notebook
%load_ext autotime

time: 315 µs (started: 2022-08-17 13:14:41 +00:00)


In [2]:
# Setting the environment variables

time: 2.14 ms (started: 2022-08-17 13:14:41 +00:00)


In [3]:
import os
import sys
os.environ["PYSPARK_PYTHON"]="/usr/bin/python3"
os.environ["PYSPARK_DRIVER_PYTHON"]="/usr/bin/python3"
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"]="notebook --no-browser"
os.environ["JAVA_HOME"] = "/usr/java/jdk1.8.0_161/jre"
os.environ["SPARK_HOME"] = "/home/ec2-user/spark-2.4.4-bin-hadoop2.7"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.10.7-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

time: 2.64 ms (started: 2022-08-17 13:14:41 +00:00)


In [4]:
# Spark environment
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics


# Python Utilities
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

time: 1.84 s (started: 2022-08-17 13:14:41 +00:00)


# Ecommerce Churn Assignment

The aim of the assignment is to build a model that predicts whether a person purchases an item after it has been added to the cart or not. Being a classification problem, you are expected to use your understanding of all the three models covered till now. You must select the most robust model and provide a solution that predicts the churn in the most suitable manner. 

For this assignment, you are provided the data associated with an e-commerce company for the month of October 2019. Your task is to first analyse the data, and then perform multiple steps towards the model building process.

The broad tasks are:
- Data Exploration
- Feature Engineering
- Model Selection
- Model Inference

### Data description

The dataset stores the information of a customer session on the e-commerce platform. It records the activity and the associated parameters with it.

- **event_time**: Date and time when user accesses the platform
- **event_type**: Action performed by the customer
            - View
            - Cart
            - Purchase
            - Remove from cart
- **product_id**: Unique number to identify the product in the event
- **category_id**: Unique number to identify the category of the product
- **category_code**: Stores primary and secondary categories of the product
- **brand**: Brand associated with the product
- **price**: Price of the product
- **user_id**: Unique ID for a customer
- **user_session**: Session ID for a user


### Initialising the SparkSession

The dataset provided is 5 GBs in size. Therefore, it is expected that you increase the driver memory to a greater number. You can refer to notebook 1 for the steps involved here.

In [5]:
MAX_MEMORY = "14G"

spark = SparkSession \
    .builder \
    .appName("demo") \
    .config("spark.driver.memory", MAX_MEMORY) \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")
spark

time: 3.79 s (started: 2022-08-17 13:14:43 +00:00)


In [6]:
# Loading the clean data
df_transformed = spark.read.parquet("Parquets/transformed_df.parquet")

time: 2.81 s (started: 2022-08-17 13:14:47 +00:00)


In [7]:
# Splitting the data into train and test (Remember you are expected to compare the model later)
df_train, df_test = df_transformed.randomSplit([0.7, 0.3], seed=42)

time: 73.1 ms (started: 2022-08-17 13:14:50 +00:00)


In [8]:
# Number of rows in train and test data
print(f"Number of Train rows: {df_train.count()}")
print(f"Number of Test rows: {df_test.count()}")

Number of Train rows: 628038
Number of Test rows: 270405
time: 23.3 s (started: 2022-08-17 13:14:50 +00:00)


<hr>

## Task 4: Model Inference

- Feature Importance
- Model Inference
- Feature exploration

In [9]:
label_column = "is_purchased"

time: 390 µs (started: 2022-08-17 13:15:13 +00:00)


In [10]:
lr_model = LogisticRegressionModel.load("Models/LogisticRegression")
dt_model = DecisionTreeClassificationModel.load("Models/DecisionTree")
rf_model = RandomForestClassificationModel.load("Models/RandomForest")

best_models = {"LogisticRegression": lr_model, 
               "DecisionTree": dt_model, 
               "RandomForest": rf_model}

time: 4.1 s (started: 2022-08-17 13:15:13 +00:00)


In [11]:
def get_metrics(model):
    df_result = model.transform(df_test)
    preds_and_labels = df_result.select(["prediction", "is_purchased"]) \
                            .withColumn("label", F.col("is_purchased").cast(T.FloatType())) \
                            .withColumn("prediction", F.col("prediction").cast(T.FloatType())) \
                            .drop("is_purchased") \
                            .orderBy("prediction")
    return MulticlassMetrics(preds_and_labels.rdd.map(tuple))

time: 672 µs (started: 2022-08-17 13:15:17 +00:00)


In [12]:
model_metrics = {}
idx = 0
for name, model in best_models.items():
    model_metrics[idx] = {}
    model_metrics[idx]["Name"] = name
    print(f"{'*' * 25}")
    
    print(f"Model: {name}")
    metrics = get_metrics(model)
    model_metrics[idx]["Metrics"] = metrics
    
    print("Confusion Matrix")
    conf_matrix = metrics.confusionMatrix().toArray()
    model_metrics[idx]["ConfusionMatrix"] = conf_matrix
    print(conf_matrix)
    print()
    
    metric = metrics.falsePositiveRate(0.0)
    model_metrics[idx]["FalsePositiveRate"] = metric
    print(f"{'False Positive Rate': <20}: {metric}")
    
    metric = metrics.precision(1.0)
    model_metrics[idx]["Precision"] = metric
    print(f"{'Precision': <20}: {metric}")
    
    metric = metrics.recall(1.0)
    model_metrics[idx]["Recall"] = metric
    print(f"{'Recall': <20}: {metric}")
    
    metric = metrics.fMeasure(0.0, 1.0)
    model_metrics[idx]["F1-Score"] = metric
    print(f"{'F1-Score': <20}: {metric}")
    
    metric = metrics.accuracy
    model_metrics[idx]["Accuracy"] = metric
    print(f"{'Accuracy': <20}: {metric}")
    
    print(f"{'*' * 25}")
    print("\n")
    idx += 1


*************************
Model: LogisticRegression
Confusion Matrix
[[ 29471.  76855.]
 [ 21230. 142849.]]

False Positive Rate : 0.12938889193620146
Precision           : 0.6501884353493792
Recall              : 0.8706111080637985
F1-Score            : 0.3753621988575213
Accuracy            : 0.637266322738115
*************************


*************************
Model: DecisionTree
Confusion Matrix
[[ 80641.  25685.]
 [122783.  41296.]]

False Positive Rate : 0.7483163598022904
Precision           : 0.6165330466848807
Recall              : 0.2516836401977096
F1-Score            : 0.5206844229217111
Accuracy            : 0.4509421053604778
*************************


*************************
Model: RandomForest
Confusion Matrix
[[ 41047.  65279.]
 [ 50927. 113152.]]

False Positive Rate : 0.3103809750181315
Precision           : 0.6341498954778038
Recall              : 0.6896190249818684
F1-Score            : 0.4139889056984367
Accuracy            : 0.5702520293633624
**************

In [13]:
df = pd.DataFrame(model_metrics).T
df.drop(["Metrics"], axis = 1, inplace = True) 

time: 6.97 ms (started: 2022-08-17 13:17:16 +00:00)


In [14]:
df.head()

Unnamed: 0,Name,ConfusionMatrix,FalsePositiveRate,Precision,Recall,F1-Score,Accuracy
0,LogisticRegression,"[[29471.0, 76855.0], [21230.0, 142849.0]]",0.129389,0.650188,0.870611,0.375362,0.637266
1,DecisionTree,"[[80641.0, 25685.0], [122783.0, 41296.0]]",0.748316,0.616533,0.251684,0.520684,0.450942
2,RandomForest,"[[41047.0, 65279.0], [50927.0, 113152.0]]",0.310381,0.63415,0.689619,0.413989,0.570252


time: 18.5 ms (started: 2022-08-17 13:17:16 +00:00)


In [15]:
def extract_feature_name(dataframe,featuresVector,stat=False):
    """
    dataframe: spark dataframe,  must contain the featuresVector column
    featuresVector,string name of the features vector
    stat: boolean, if True, return mean and sd
    return a pandas dataframe, include columns
    """
    from itertools import chain
    try:
        attrs = sorted(
            (attr["idx"], attr["name"]) for attr in (chain(*dataframe
                .schema[featuresVector]
                .metadata["ml_attr"]["attrs"].values()))) 
    except Exception as ex:
        print(f""""error occur in extract_feature_name(), it looks like column {featuresVector} does not have such metadata.
        error type:  {type(ex).__name__} , error message: {ex.args}
        error handled by   fill the feature name with index""")
        vectorsize=dataframe.rdd.collect()[0][featuresVector].size
        attrs=zip(list(range(vectorsize)),list(range(vectorsize)))
    df=pd.DataFrame(attrs,columns=['feature_index','feature_name'])
    if stat:
        from pyspark.ml.feature import StandardScaler
        standardScaler = StandardScaler(inputCol=featuresVector, outputCol="scaled")
        smodel = standardScaler.fit(dataframe)
        df['N']=dataframe.count()
        df['mean']=smodel.mean.toArray()
        df['std']=smodel.std.toArray()
    return df

def feature_importance(lrm_model, trainDF, trainFeatures, nonzero_only=True):
    coef=extract_feature_name(trainDF,trainFeatures,stat=True)
    coef['coef']=lrm_model.coefficients.toArray()
    coef["std_coef"]=coef["coef"]*coef["std"]
    coef["feature_importance"]=coef.std_coef.abs() 
    if nonzero_only:
        coef=coef.loc[coef.coef!=0,:]    
    coef.sort_values(by=["feature_importance"],ascending=False,inplace=True)
    coef.reset_index(drop=True,inplace=True)
    coef=coef[['feature_index', 'feature_name','coef',  'mean', 'std',  'std_coef', 'feature_importance']]
    return coef

time: 1.57 ms (started: 2022-08-17 13:17:16 +00:00)


### Feature Importances

#### Logitstic Regression

In [16]:
# feature_importances
feature_importance(lrm_model=lr_model, 
                   trainDF=df_train, 
                   trainFeatures='features', 
                   nonzero_only=True)

Unnamed: 0,feature_index,feature_name,coef,mean,std,std_coef,feature_importance
0,76,user_product_count,0.069376,8.327858,11.987979,0.831679,0.831679
1,78,category_secondary_count,-0.006947,33.429711,59.136615,-0.410850,0.410850
2,79,session_count,0.002625,70.474443,108.726912,0.285459,0.285459
3,55,brand_stridx_ohe_xiaomi,-0.400531,0.113412,0.317096,-0.127007,0.127007
4,54,brand_stridx_ohe_others,-0.311619,0.181774,0.385658,-0.120178,0.120178
...,...,...,...,...,...,...,...
75,47,category_code_secondary_stridx_ohe_tshirt,-0.430444,0.000021,0.004550,-0.001958,0.001958
76,60,brand_stridx_ohe_acer,-0.018840,0.010155,0.100261,-0.001889,0.001889
77,49,category_code_secondary_stridx_ohe_cultivator,-0.073110,0.000010,0.003091,-0.000226,0.000226
78,40,category_code_secondary_stridx_ohe_trainer,-0.009815,0.000384,0.019585,-0.000192,0.000192


time: 19.4 s (started: 2022-08-17 13:17:16 +00:00)


#### Decision Tree

In [17]:
f_type_to_flist_dict = df_test.schema['features'].metadata["ml_attr"]["attrs"]
feature_importances = dt_model.featureImportances.toArray()
features = {}
features["FeatureName"] = []
features["FeatureImportance"] = []
for _, f_list in f_type_to_flist_dict.items():
    for f in f_list:
        f_index = f['idx']
        f_name = f['name']
        features["FeatureName"].append(f_name)
        features["FeatureImportance"].append(feature_importances[int(f_index)-1])

features_pd = pd.DataFrame(features)

time: 95.1 ms (started: 2022-08-17 13:17:35 +00:00)


In [18]:
features_pd = features_pd.sort_values(by=["FeatureImportance"], ascending=False).reset_index()
features_pd.rename(columns={'index': 'FeatureNum'}, inplace=True, errors='raise')
features_pd.head(20)

Unnamed: 0,FeatureNum,FeatureName,FeatureImportance
0,6,session_count,0.168214
1,7,category_code_primary_stridx_ohe_electronics,0.168214
2,3,user_product_count,0.152178
3,5,category_secondary_count,0.147429
4,2,total_activities_count,0.127708
5,1,day_of_week,0.122082
6,4,average_price,0.099803
7,21,category_code_secondary_stridx_ohe_unavailable,0.025748
8,79,hours_ohe_2,0.015651
9,78,hours_ohe_1,0.01446


time: 16.6 ms (started: 2022-08-17 13:17:35 +00:00)


#### Random Forest

In [19]:
f_type_to_flist_dict = df_test.schema['features'].metadata["ml_attr"]["attrs"]
feature_importances = rf_model.featureImportances.toArray()
features = {}
features["FeatureName"] = []
features["FeatureImportance"] = []
for _, f_list in f_type_to_flist_dict.items():
    for f in f_list:
        f_index = f['idx']
        f_name = f['name']
        features["FeatureName"].append(f_name)
        features["FeatureImportance"].append(feature_importances[int(f_index)-1])

features_pd = pd.DataFrame(features)

time: 23.9 ms (started: 2022-08-17 13:17:35 +00:00)


In [20]:
features_pd = features_pd.sort_values(by=["FeatureImportance"], ascending=False).reset_index()
features_pd.rename(columns={'index': 'FeatureNum'}, inplace=True, errors='raise')
features_pd.head(20)

Unnamed: 0,FeatureNum,FeatureName,FeatureImportance
0,3,user_product_count,0.328183
1,2,total_activities_count,0.220038
2,6,session_count,0.11704
3,7,category_code_primary_stridx_ohe_electronics,0.11704
4,5,category_secondary_count,0.052541
5,8,category_code_primary_stridx_ohe_unavailable,0.036847
6,4,average_price,0.03638
7,1,day_of_week,0.035777
8,62,brand_stridx_ohe_xiaomi,0.034705
9,21,category_code_secondary_stridx_ohe_unavailable,0.031552


time: 17.3 ms (started: 2022-08-17 13:17:35 +00:00)
