In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import os 

# Loading configuration: named constants so the input file and the sampling
# behaviour can be adjusted in one place instead of editing call arguments.
DATA_PATH = 'BlackFriday.csv'  # CSV path, resolved relative to the working directory
SAMPLE_FRACTION = 0.1          # fraction of rows drawn from the full dataset
SAMPLE_SEED = 42               # fixed seed so the same rows are drawn on every run

# Load the dataset
data = pd.read_csv(DATA_PATH)

# Reduce the sample size by drawing a reproducible random subset of the rows.
# `data` itself is left untouched; `data_sample` is a new DataFrame.
data_sample = data.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)




In [None]:
# Data Preprocessing
# Build the model-ready frame as a NEW object (`data_encoded`) rather than
# mutating `data_sample` in place. Mutating the sample would make this cell
# fail when re-run: the one-hot step below replaces the
# 'Stay_In_Current_City_Years' and 'Product_ID' columns, so a second
# pd.get_dummies call on the already-encoded frame raises a KeyError.
# `data_sample` therefore stays in its raw, as-sampled form.
data_encoded = data_sample.fillna(0)  # returns a copy with missing values set to 0

# Integer-encode the categorical columns. A single encoder is refit for each
# column, so after the loop `label_encoder` only holds the classes of the last
# column ('City_Category').
label_encoder = LabelEncoder()
for column in ('Gender', 'Age', 'City_Category'):
    data_encoded[column] = label_encoder.fit_transform(data_encoded[column])

# One-hot encode the remaining categorical columns; drop_first removes one
# indicator per column to avoid a redundant (perfectly collinear) dummy.
data_encoded = pd.get_dummies(
    data_encoded,
    columns=['Stay_In_Current_City_Years', 'Product_ID'],
    drop_first=True,
)

# Features and target variable
X = data_encoded.drop(columns='Purchase')
y = data_encoded['Purchase']

# Train-test split (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
linear_reg = LinearRegression()
tree_reg = DecisionTreeRegressor(random_state=42)
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit every model on the same training split
for model in (linear_reg, tree_reg, forest_reg):
    model.fit(X_train, y_train)

# Predictions on both splits; the evaluation step compares train vs. test
# error for each model.
y_pred_train_lr = linear_reg.predict(X_train)
y_pred_test_lr = linear_reg.predict(X_test)

y_pred_train_dt = tree_reg.predict(X_train)
y_pred_test_dt = tree_reg.predict(X_test)

y_pred_train_rf = forest_reg.predict(X_train)
y_pred_test_rf = forest_reg.predict(X_test)



In [None]:
# Evaluate models
# Map each model's display name to its (train, test) predictions so that all
# metric lists below are built in the same, single order.
predictions_by_model = {
    'Linear Regression': (y_pred_train_lr, y_pred_test_lr),
    'Decision Tree': (y_pred_train_dt, y_pred_test_dt),
    'Random Forest': (y_pred_train_rf, y_pred_test_rf),
}
models = list(predictions_by_model)

# Mean squared error per model (lower is better)
train_mse = [mean_squared_error(y_train, fitted) for fitted, _ in predictions_by_model.values()]
test_mse = [mean_squared_error(y_test, held_out) for _, held_out in predictions_by_model.values()]

# Coefficient of determination per model (higher is better)
train_r2 = [r2_score(y_train, fitted) for fitted, _ in predictions_by_model.values()]
test_r2 = [r2_score(y_test, held_out) for _, held_out in predictions_by_model.values()]

# Print model comparison as a fixed-width table: one header row, then one row
# per model with every metric rounded to two decimals.
print("Model Comparison:")
print(f"{'Model':<20}{'Train MSE':<15}{'Test MSE':<15}{'Train R²':<15}{'Test R²'}")
for i in range(len(models)):
    print(f"{models[i]:<20}{train_mse[i]:<15.2f}{test_mse[i]:<15.2f}{train_r2[i]:<15.2f}{test_r2[i]:.2f}")

# Visualize model comparison: two side-by-side bar charts of the test metrics
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
panels = (
    (ax[0], test_mse, 'Test MSE Comparison', 'MSE'),
    (ax[1], test_r2, 'Test R² Comparison', 'R²'),
)
for axis, values, title, y_label in panels:
    sns.barplot(x=models, y=values, ax=axis)
    axis.set_title(title)
    axis.set_ylabel(y_label)

plt.tight_layout()
plt.show()

# Conclusion: the winner is the model with the smallest test-set MSE
best_index = np.argmin(test_mse)
best_model = models[best_index]
print(f"The best performing model based on Test MSE is: {best_model}")