In [1]:
import pandas as pd

# Read the raw event log into memory.
raw_events_csv = '../data/2019-Nov.csv'
df = pd.read_csv(raw_events_csv)

# Preview the first 5 rows to check that the columns parsed as expected.
df.head(5)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-11-01 00:00:00 UTC,view,1003461,2053013555631882655,electronics.smartphone,xiaomi,489.07,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33
1,2019-11-01 00:00:00 UTC,view,5000088,2053013566100866035,appliances.sewing_machine,janome,293.65,530496790,8e5f4f83-366c-4f70-860e-ca7417414283
2,2019-11-01 00:00:01 UTC,view,17302664,2053013553853497655,,creed,28.31,561587266,755422e7-9040-477b-9bd2-6a6e8fd97387
3,2019-11-01 00:00:01 UTC,view,3601530,2053013563810775923,appliances.kitchen.washer,lg,712.87,518085591,3bfb58cd-7892-48cc-8020-2f17e6de6e7f
4,2019-11-01 00:00:01 UTC,view,1004775,2053013555631882655,electronics.smartphone,xiaomi,183.27,558856683,313628f1-68b8-460d-84f6-cec7a8796ef2


In [2]:
# Report the dimensions of the loaded frame as (rows, columns).
frame_shape = df.shape
print("Shape:", frame_shape)

Shape: (67501979, 9)


In [3]:
# Summarize the frame's structure: index range, per-column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67501979 entries, 0 to 67501978
Data columns (total 9 columns):
 #   Column         Dtype  
---  ------         -----  
 0   event_time     object 
 1   event_type     object 
 2   product_id     int64  
 3   category_id    int64  
 4   category_code  object 
 5   brand          object 
 6   price          float64
 7   user_id        int64  
 8   user_session   object 
dtypes: float64(1), int64(3), object(5)
memory usage: 4.5+ GB


In [4]:
# Summary statistics for the numeric columns.
# NOTE(review): product_id, category_id and user_id look like identifiers, so
# their mean/std/quantiles are probably not meaningful; price is the only
# column here whose statistics describe a real quantity -- confirm.
df.describe()

Unnamed: 0,product_id,category_id,price,user_id
count,67501980.0,67501980.0,67501980.0,67501980.0
mean,12514060.0,2.057898e+18,292.4593,538639700.0
std,17257410.0,2.012549e+16,355.6745,22885160.0
min,1000365.0,2.053014e+18,0.0,10300220.0
25%,1305977.0,2.053014e+18,69.24,516476200.0
50%,5100568.0,2.053014e+18,165.77,535057300.0
75%,17300750.0,2.053014e+18,360.34,561079400.0
max,100028600.0,2.187708e+18,2574.07,579969900.0


In [5]:
# Number of rows per event type, most frequent first.
event_type_counts = df['event_type'].value_counts()
event_type_counts

event_type
view        63556110
cart         3028930
purchase      916939
Name: count, dtype: int64

In [6]:
# Count the missing values in every column (isna is the same method as isnull).
missing_per_column = df.isna().sum()
missing_per_column

event_time              0
event_type              0
product_id              0
category_id             0
category_code    21898171
brand             9224078
price                   0
user_id                 0
user_session           10
dtype: int64

In [7]:
# Binary purchase flag: 1 for rows whose event_type is 'purchase', 0 otherwise
# (rows with a missing event_type also get 0, as before).
# The comparison is vectorized instead of calling a Python lambda once per
# row, which matters on a frame with tens of millions of rows. The explicit
# int64 cast keeps the column's dtype the same as the lambda-based version.
df['purchase'] = df['event_type'].eq('purchase').astype('int64')

In [8]:
# Fraction of rows in each class of the purchase flag (the fractions sum to 1).
purchase_share = df['purchase'].value_counts(normalize=True)
purchase_share

purchase
0    0.986416
1    0.013584
Name: proportion, dtype: float64

In [None]:
# Collapse the event log into one row per user.
#
# The per-event-type counts are summed from boolean masks that are built once
# with vectorized comparisons. This lets the groupby use pandas' built-in
# 'sum' / 'nunique' / 'max' aggregations instead of invoking a Python lambda
# for every user group, which is far slower on a log of this size.
#
# NOTE(review): the features are computed over the same events that define the
# label, so they may include activity that happened after a user's purchase --
# confirm that this is acceptable for the intended use of the model.
user_events = pd.DataFrame({
    'user_id': df['user_id'],
    'is_view': df['event_type'].eq('view'),
    'is_cart': df['event_type'].eq('cart'),
    'is_purchase': df['event_type'].eq('purchase'),
    'category_code': df['category_code'],
    'purchase': df['purchase'],
})

user_features = user_events.groupby('user_id').agg(
    num_views=('is_view', 'sum'),            # summing a boolean mask counts its True rows
    num_cart_adds=('is_cart', 'sum'),
    num_purchases=('is_purchase', 'sum'),
    unique_categories=('category_code', 'nunique'),  # missing category codes are not counted
    label=('purchase', 'max'),               # 1 if user purchased at least once
).reset_index()

# The helper frame is only needed for the aggregation; release its memory.
del user_events

In [12]:
# Preview the first 5 rows of the per-user feature table.
user_features.head(n=5)

Unnamed: 0,user_id,num_views,num_cart_adds,num_purchases,unique_categories,label
0,10300217,1,0,0,0,0
1,29515875,11,0,0,2,0
2,31198833,20,0,0,2,0
3,34916060,1,0,0,1,0
4,41798457,1,0,0,1,0


In [14]:
# Persist the per-user feature table; index=False leaves the row index out of the file.
features_csv = '../data/user_features.csv'
user_features.to_csv(features_csv, index=False)

In [15]:
# Feature matrix: every aggregated column except the identifier and the target.
# num_purchases is excluded as well: the label is derived from the same
# purchase events, so keeping it would hand the model the answer.
excluded_columns = ['user_id', 'label', 'num_purchases']
X = user_features.drop(columns=excluded_columns)

# Target vector: 1 if the user purchased at least once, otherwise 0.
y = user_features['label']

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the users for evaluation. Stratifying on y keeps the share
# of purchasers the same in both splits; the fixed seed makes the split
# reproducible.
split_parts = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [17]:
from sklearn.ensemble import RandomForestClassifier

# A 20-tree random forest with a fixed seed so that training is reproducible.
forest_params = {'n_estimators': 20, 'random_state': 42}
model = RandomForestClassifier(**forest_params)

# Fit on the training split; fit() returns the estimator, which the cell displays.
model.fit(X_train, y_train)

In [18]:
# Predict a class label for every row of the held-out test features.
y_pred = model.predict(X_test)

In [19]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the predictions on the test split.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.94      0.96      0.95    650896
           1       0.62      0.52      0.57     88328

    accuracy                           0.91    739224
   macro avg       0.78      0.74      0.76    739224
weighted avg       0.90      0.91      0.90    739224



In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Count the test-set predictions by (actual, predicted) class pair.
cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap on an explicitly created axes.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [20]:
import joblib

# Serialize the fitted model to disk. joblib.dump returns the list of file
# names it wrote, which the cell displays.
model_path = '../app/model.pkl'
joblib.dump(model, model_path)

['../app/model.pkl']