In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data = pd.read_csv('BlackFriday.csv')  # Replace with your file path

# Work on a reproducible 10% sample of the rows
data = data.sample(frac=0.1, random_state=42)

# Data preprocessing
# Label-encode the categorical columns. Each column gets its own fitted
# encoder, kept in `label_encoders`, so the integer codes can be mapped back
# with `label_encoders[column].inverse_transform(...)`.
categorical_columns = ['Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years']
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])
# `le` stays defined and, as before, ends up fitted on the last encoded column.
le = label_encoders[categorical_columns[-1]]

# Drop irrelevant columns
data = data.drop(['User_ID', 'Product_ID'], axis=1)

# Define the target variable and features
X = data.drop('Purchase', axis=1)
y = data['Purchase']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values
# The fill values are column means computed from the training rows only, so
# no statistic derived from the test rows influences what the models see.
# The same training means are applied everywhere to keep `X` and `data`
# free of missing feature values as well. The target column is not imputed.
feature_means = X_train.mean(numeric_only=True)
X_train = X_train.fillna(feature_means)
X_test = X_test.fillna(feature_means)
X = X.fillna(feature_means)
data = data.fillna(feature_means)

# Data generator to yield batches of data
def data_generator(X, y, batch_size, repeat=True):
    """Yield consecutive ``(X_batch, y_batch)`` slices of at most ``batch_size`` rows.

    Parameters
    ----------
    X, y : sliceable sequences
        Objects exposing ``.iloc`` (pandas) are sliced positionally through
        it; anything else (lists, NumPy arrays, ...) is sliced with plain
        ``obj[start:end]``. Batch boundaries are derived from ``len(X)``.
    batch_size : int
        Maximum number of rows per batch; must be positive.
    repeat : bool, default True
        If True, start over from the first row after the last batch and keep
        yielding forever, so the consumer has to stop iterating on its own.
        If False, make exactly one pass over the data and then finish.

    Yields
    ------
    tuple
        ``(X[start:end], y[start:end])`` for each batch window.

    Raises
    ------
    ValueError
        On first iteration, if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    n_rows = len(X)
    if n_rows == 0:
        # Nothing to yield. Without this guard the repeat loop below would
        # spin forever without ever producing a batch.
        return

    # Prefer explicit positional indexing when the container offers it.
    X_rows = getattr(X, "iloc", X)
    y_rows = getattr(y, "iloc", y)

    while True:
        for start in range(0, n_rows, batch_size):
            end = min(start + batch_size, n_rows)
            yield X_rows[start:end], y_rows[start:end]
        if not repeat:
            return

# Histogram (with KDE overlay) of the regression target
purchase_fig, purchase_ax = plt.subplots(figsize=(10, 6))
sns.histplot(y, bins=30, kde=True, ax=purchase_ax)
purchase_ax.set_title('Distribution of Purchase Amount')
purchase_ax.set_xlabel('Purchase Amount')
purchase_ax.set_ylabel('Frequency')
plt.show()




In [None]:
# Initialize models
# The tree-based estimators are seeded (same seed as the sampling and the
# train/test split) so that scores and feature importances are reproducible
# from one run of the notebook to the next.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate models
# Not used by the fitting loop below; left defined in case other cells
# reference it.
batch_size = 10000

for model_name, model in models.items():
    # Fit once on the whole training split. Calling `fit` repeatedly on
    # successive batches does not accumulate learning for these estimators
    # (each call starts from scratch), and iterating an endless batch
    # generator here would never reach the evaluation step.
    model.fit(X_train, y_train)

    # Predict and evaluate on the test set
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{model_name} Model Performance:")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R^2 Score: {r2:.2f}")
    print("\n")

# Bar chart of the fitted Random Forest's importance score for each feature column
importance_fig, importance_ax = plt.subplots(figsize=(12, 8))
forest_importances = models['Random Forest'].feature_importances_
sns.barplot(x=forest_importances, y=X.columns, ax=importance_ax)
importance_ax.set_title('Feature Importance - Random Forest')
importance_ax.set_xlabel('Importance')
importance_ax.set_ylabel('Feature')
plt.show()