In [1]:
# Setting the environment variables

In [2]:
# Loading autotime for the notebook: once loaded, every executed cell reports
# its run time and start timestamp (the "time: ..." lines in the cell outputs)
%load_ext autotime

time: 263 µs (started: 2022-08-17 05:54:20 +00:00)


In [3]:
import os
import sys
# Interpreter, JVM and Spark install locations that PySpark reads from the environment
os.environ.update({
    "PYSPARK_PYTHON": "/usr/bin/python3",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
    "PYSPARK_DRIVER_PYTHON_OPTS": "notebook --no-browser",
    "JAVA_HOME": "/usr/java/jdk1.8.0_161/jre",
    "SPARK_HOME": "/home/ec2-user/spark-2.4.4-bin-hadoop2.7",
})
# PYLIB is derived from SPARK_HOME and points at Spark's python/lib directory
os.environ["PYLIB"] = "{}/python/lib".format(os.environ["SPARK_HOME"])

# Prepend the Spark Python archives to the import path:
# pyspark.zip ends up first, immediately followed by the py4j sources
spark_py_lib = os.environ["PYLIB"]
sys.path[:0] = [spark_py_lib + "/pyspark.zip", spark_py_lib + "/py4j-0.10.7-src.zip"]

time: 1.06 ms (started: 2022-08-17 05:54:20 +00:00)


In [28]:
# Spark environment
from pyspark import SparkConf
from pyspark.sql import SparkSession

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Python Utilities
import numpy as np

time: 707 µs (started: 2022-08-17 08:41:03 +00:00)


# Ecommerce Churn Assignment

The aim of the assignment is to build a model that predicts whether a person purchases an item after it has been added to the cart. Since this is a classification problem, you are expected to apply your understanding of all three models covered so far. You must select the most robust model and provide a solution that predicts churn in the most suitable manner.

For this assignment, you are provided the data associated with an e-commerce company for the month of October 2019. Your task is to first analyse the data, and then perform multiple steps towards the model building process.

The broad tasks are:
- Data Exploration
- Feature Engineering
- Model Selection
- Model Inference

### Data description

The dataset stores the information of a customer session on the e-commerce platform. It records the activity and the associated parameters with it.

- **event_time**: Date and time when user accesses the platform
- **event_type**: Action performed by the customer
            - View
            - Cart
            - Purchase
            - Remove from cart
- **product_id**: Unique number to identify the product in the event
- **category_id**: Unique number to identify the category of the product
- **category_code**: Stores primary and secondary categories of the product
- **brand**: Brand associated with the product
- **price**: Price of the product
- **user_id**: Unique ID for a customer
- **user_session**: Session ID for a user


### Initialising the SparkSession

The dataset provided is 5 GB in size. Therefore, you are expected to increase the Spark driver memory above its default value. You can refer to notebook 1 for the steps involved here.

In [5]:
# Value passed to Spark as spark.driver.memory
MAX_MEMORY = "14G"

# Build the session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("demo")
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

# Restrict Spark's log output to the ERROR level
spark.sparkContext.setLogLevel("ERROR")

# Last expression of the cell: display the session
spark

time: 3.94 s (started: 2022-08-17 05:54:20 +00:00)


In [6]:
# Loading the clean data


time: 285 µs (started: 2022-08-17 05:54:24 +00:00)


## Task 3: Model Selection
3 models for classification:	
- Logistic Regression
- Decision Tree
- Random Forest

### Model 3: Random Forest

In [7]:
# Additional steps for Random Forest, if any


time: 837 µs (started: 2022-08-17 05:54:24 +00:00)


#### Feature Transformation (Code will be the same; check the columns)

In [8]:
# Check if only the required columns are present to build the model
# If not, drop the redundant columns


time: 490 µs (started: 2022-08-17 05:54:24 +00:00)


In [9]:
# Categorising the attributes into its type - Continuous and Categorical


time: 432 µs (started: 2022-08-17 05:54:24 +00:00)


In [10]:
# Feature transformation for categorical features


time: 424 µs (started: 2022-08-17 05:54:24 +00:00)


In [11]:
# Vector assembler to combine all the features


time: 423 µs (started: 2022-08-17 05:54:24 +00:00)


In [12]:
# Pipeline for the tasks


time: 368 µs (started: 2022-08-17 05:54:24 +00:00)


In [13]:
# Transforming the dataframe df


time: 911 µs (started: 2022-08-17 05:54:24 +00:00)


In [14]:
# Schema of the transformed df


time: 417 µs (started: 2022-08-17 05:54:24 +00:00)


In [15]:
# Checking the elements of the transformed df - Top 20 rows


time: 406 µs (started: 2022-08-17 05:54:24 +00:00)


In [16]:
# Storing the transformed df in S3 bucket to prevent repetition of steps again


time: 417 µs (started: 2022-08-17 05:54:24 +00:00)


In [17]:
# Read the previously stored transformed DataFrame back from its Parquet files
transformed_path = "Parquets/transformed_df.parquet"
df_transformed = spark.read.parquet(transformed_path)

time: 2.83 s (started: 2022-08-17 05:54:24 +00:00)


#### Train-test split

In [18]:
# 70/30 train-test split with a fixed seed
# (the test split is kept aside because the models are expected to be compared later)
split_weights = [0.7, 0.3]
df_train, df_test = df_transformed.randomSplit(split_weights, seed=42)

time: 69.8 ms (started: 2022-08-17 05:54:27 +00:00)


In [19]:
# Report how many rows landed in each split
train_rows = df_train.count()
print("Number of Train rows: {}".format(train_rows))
test_rows = df_test.count()
print("Number of Test rows: {}".format(test_rows))

Number of Train rows: 628038
Number of Test rows: 270405
time: 22.8 s (started: 2022-08-17 05:54:27 +00:00)


#### Model Fitting

In [20]:
# Name of the target column; passed as labelCol to the classifiers and evaluators
label_column = "is_purchased"

time: 395 µs (started: 2022-08-17 05:54:50 +00:00)


In [21]:
# Baseline Random Forest: 10 trees, every other hyperparameter left at its default.
# The explicit seed makes the forest reproducible across kernel restarts
# (the same seed value is used for the train/test split).
random_forest = RandomForestClassifier(labelCol=label_column,
                                       featuresCol="features",
                                       numTrees=10,
                                       seed=42)

# Fit the forest directly on the training split; no Pipeline is involved here,
# the classifier consumes the "features" column of df_train as-is.
model = random_forest.fit(df_train)

# Define the evaluation metric (accuracy) and score the model on the held-out test split.
evaluator = MulticlassClassificationEvaluator(labelCol=label_column, metricName="accuracy")
predictions = model.transform(df_test)
validation_metric = evaluator.evaluate(predictions)

time: 1min 26s (started: 2022-08-17 05:54:50 +00:00)


In [22]:
# Show the test accuracy of the baseline forest
print("Accuracy for basic RF model: {}".format(validation_metric))

Accuracy for basic RF model: 0.636460124627873
time: 793 µs (started: 2022-08-17 05:56:17 +00:00)


In [23]:
# Building the model with hyperparameter tuning
# Create ParamGrid for Cross Validation

# Initialising RandomForestClassifier
random_forest = RandomForestClassifier(labelCol=label_column,
                                       featuresCol="features",
                                       featureSubsetStrategy="auto",
                                       seed=42)

# Creating Parameter Grid search on RF model
# 3 x 4 x 3 x 2 = 72 candidate settings; with 10 folds that is 720 model fits,
# which is why this cell takes hours to run.
num_trees = [10, 20, 30]
max_depth = [3, 5, 7, 9]
max_bins = [16, 32, 64]
impurity = ["gini", "entropy"]

param_grid = ParamGridBuilder().addGrid(random_forest.numTrees, num_trees) \
                               .addGrid(random_forest.maxDepth, max_depth) \
                               .addGrid(random_forest.maxBins, max_bins) \
                               .addGrid(random_forest.impurity, impurity) \
                               .build()

# Accuracy is the metric used to rank the candidate settings
class_evaluator = MulticlassClassificationEvaluator(labelCol=label_column,
                                                    metricName="accuracy")

# seed fixes the fold assignment so the search and the selected parameters
# can be reproduced on a fresh kernel; parallelism=4 fits up to four candidates at a time.
cross_validator = CrossValidator(estimator=random_forest,
                                 estimatorParamMaps=param_grid,
                                 evaluator=class_evaluator,
                                 numFolds=10,
                                 parallelism=4,
                                 seed=42)

# Run cross-validation, and choose the best set of parameters.
cross_validator_model = cross_validator.fit(df_train)

# Attach the best model's predictions to the test split.
# No metric is computed in this cell.
prediction = cross_validator_model.transform(df_test)


time: 2h 26min 8s (started: 2022-08-17 05:56:17 +00:00)


In [24]:
# Run cross-validation steps


time: 449 µs (started: 2022-08-17 08:22:26 +00:00)


In [25]:
# Fitting the models on transformed df
# Run cross-validation, and choose the best set of parameters.


time: 432 µs (started: 2022-08-17 08:22:26 +00:00)


In [26]:
# Best model from the results of cross-validation


time: 260 µs (started: 2022-08-17 08:22:26 +00:00)


#### Model Analysis

Required Steps:
- Evaluate on test data
- Performance analysis
    - Appropriate Metric with reasoning

#### Summary of the best Random Forest model

In [29]:
# Pick the parameter map whose average cross-validation metric is the highest
best_index = np.argmax(cross_validator_model.avgMetrics)
best_model_params = cross_validator_model.getEstimatorParamMaps()[best_index]
param_keys = list(best_model_params.keys())

# Print every tuned parameter together with its selected value
for tuned_param in param_keys:
    print("{} = {}".format(tuned_param.name, best_model_params[tuned_param]))

numTrees = 30
maxDepth = 9
maxBins = 64
impurity = entropy
time: 1.03 ms (started: 2022-08-17 08:41:08 +00:00)


In [35]:
# Model refit with the best parameter combination found by cross-validation
best_model = cross_validator_model.bestModel

# Accuracy evaluator used to score the best model
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol=label_column,
)

time: 5.4 ms (started: 2022-08-17 08:49:49 +00:00)


In [38]:
# Predict on the test split with the best model, then compute its accuracy
best_model_predictions = best_model.transform(df_test)
best_model_result = evaluator.evaluate(best_model_predictions)

time: 22 s (started: 2022-08-17 08:50:02 +00:00)


In [42]:
# Show the test accuracy of the tuned forest
print("Accuracy: {}".format(best_model_result))

Accuracy: 0.6491411031600747
time: 598 µs (started: 2022-08-17 09:03:44 +00:00)


In [43]:
# Persist the best Random Forest so it can be reloaded without repeating the search.
# write().overwrite() replaces a model already stored at this path; a plain save()
# raises when the path exists, which makes the notebook fail on a re-run.
rf_model_path = "Models/RandomForest"
best_model.write().overwrite().save(rf_model_path)

time: 1.3 s (started: 2022-08-17 09:03:48 +00:00)
