In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Read the raw CSV into a DataFrame and preview its first rows
data = pd.read_csv('../../data/ebay_data.csv')
print("First five rows of the dataset:")
print(data.head())

# Count nulls per column so gaps are visible before any feature work
print("\nMissing values in each column:")
null_counts = data.isnull().sum()
print(null_counts)

# Parse 'timestamp' once, then derive the two calendar features from the
# parsed series: hour of day and day of week (pandas numbers Monday as 0).
parsed_timestamp = pd.to_datetime(data['timestamp'])
data = data.assign(
    timestamp=parsed_timestamp,
    hour=parsed_timestamp.dt.hour,
    day_of_week=parsed_timestamp.dt.dayofweek,
)

# Confirm the dtypes now reflect the datetime conversion and the new columns
print("\nData types after conversion:")
print(data.dtypes)

First five rows of the dataset:
   session_id  user_id            timestamp       action   price  \
0           1      103  2022-11-01 10:00:41  add_to_cart  187.89   
1           2      271  2022-10-13 18:14:41        click   27.43   
2           3      107  2022-05-24 22:39:31         view  178.22   
3           4       72  2022-01-19 18:11:21        click  465.00   
4           5      189  2022-01-18 19:02:56         view  192.72   

  product_category  purchase  
0          Fashion         0  
1             Toys         0  
2             Toys         0  
3      Electronics         0  
4          Fashion         0  

Missing values in each column:
session_id          0
user_id             0
timestamp           0
action              0
price               0
product_category    0
purchase            0
dtype: int64

Data types after conversion:
session_id                   int64
user_id                      int64
timestamp           datetime64[ns]
action                      object
pric

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) as reported by describe()
print("\nSummary statistics:")
print(data.describe())

# Target balance: number of rows for each value of 'purchase'
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='purchase', data=data, ax=ax)
ax.set_title("Distribution of Purchase Indicator")
ax.set_xlabel("Purchase (0 = No, 1 = Yes)")
ax.set_ylabel("Count")
plt.show()

# Histogram of 'price' (30 bins) with a kernel density estimate overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['price'], bins=30, kde=True, ax=ax)
ax.set_title("Price Distribution")
ax.set_xlabel("Price")
ax.set_ylabel("Frequency")
plt.show()

# Row counts per product category, split by the purchase indicator
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='product_category', hue='purchase', data=data, ax=ax)
ax.set_title("Purchase by Product Category")
ax.set_xlabel("Product Category")
ax.set_ylabel("Count")
# Tilt the category labels so longer names do not overlap
plt.xticks(rotation=45)
plt.show()


In [None]:
# Expand the categorical columns into dummy indicator columns.
# drop_first=True omits the first level of each column, which then acts as
# the implicit baseline category.
categorical_cols = ['action', 'product_category']
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Print the resulting column index as a quick sanity check of the encoding
print("\nColumns after one-hot encoding:")
print(data_encoded.columns)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

# Define features (X) and target (y).
# The identifier columns and the raw timestamp are dropped; every other
# column of data_encoded is used as a feature.
# NOTE(review): the one-hot 'action' columns stay in X. If 'action' can take a
# value that directly encodes a completed purchase, that leaks the target into
# the features and inflates every score below — confirm against the data.
X = data_encoded.drop(columns=['purchase', 'session_id', 'user_id', 'timestamp'])
y = data_encoded['purchase']

# 80/20 train/test split. stratify=y keeps the purchase rate the same in both
# partitions, so the test metrics are not distorted by an unlucky draw of the
# positive class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Baseline Random Forest with library-default hyperparameters; the seed is
# fixed so repeated runs produce the same model.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Hard class predictions, plus the predicted probability of the second class
# in model.classes_ (purchase == 1 for a 0/1 target). The probabilities are
# computed once and reused for the ROC AUC.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Evaluate model performance on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Model Accuracy:", accuracy)
print("ROC AUC Score:", roc_auc)
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: every combination of these values is evaluated
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}

# 5-fold cross-validated grid search that ranks candidates by ROC AUC.
# n_jobs=-1 lets the fits run in parallel on all available cores.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated ROC AUC
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated ROC AUC:", grid_search.best_score_)

# Score the best estimator found by the search on the held-out test set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

best_accuracy = accuracy_score(y_test, y_pred_best)
print("\nBest Model Accuracy:", best_accuracy)
best_roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])
print("Best Model ROC AUC Score:", best_roc_auc)

# Pair each importance score with its feature name, largest first
features = X.columns
importances = best_model.feature_importances_
feature_importance = (
    pd.Series(importances, index=features)
    .sort_values(ascending=False)
)

# Horizontal bar chart of the ranked feature importances
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_importance.values, y=feature_importance.index, ax=ax)
ax.set_title("Feature Importance")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Features")
plt.show()


In [None]:
from sklearn.metrics import fbeta_score

# Custom evaluation metric: F-beta score. The default beta=0.5 emphasizes
# precision over recall; pass another beta to shift that trade-off.
def custom_metric(y_true, y_pred, beta=0.5):
    """Return the F-beta score of ``y_pred`` measured against ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    beta : float, default 0.5
        Weight of recall relative to precision. Values below 1 favour
        precision, values above 1 favour recall. The default reproduces
        the F0.5 score this notebook reports.

    Returns
    -------
    float
        The value returned by ``fbeta_score`` for the given ``beta``.
    """
    return fbeta_score(y_true, y_pred, beta=beta)

# Score the tuned model's test-set predictions with the default F0.5 metric
custom_score = custom_metric(y_test, y_pred_best)
print("Custom Evaluation Metric (F0.5 Score):", custom_score)
