In [None]:
import pandas as pd
import os

# Location of the raw FM_DATA export. The original hard-coded location stays
# the default so existing runs behave the same; set the BREWLAB_DATA_PATH
# environment variable to load the CSV from somewhere else without editing
# the notebook.
DATA_PATH = os.environ.get(
    'BREWLAB_DATA_PATH',
    '/Users/rajpalnati/Downloads/BREWLAB/FM_DATA.csv',
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Parse 'Date' into datetimes; values that cannot be parsed become NaT
# (errors='coerce') and the rows carrying them are then dropped.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df.dropna(subset=['Date'])


def _sum_or_nan(values):
    """Sum one date's values, returning NaN when every value is missing.

    A plain 'sum' aggregation reports 0 for a group whose values are all NaN,
    which would turn "nothing recorded" into a real-looking zero and keep
    dropna() on the summed columns from ever removing those days.
    """
    return values.sum(min_count=1)


# Create the aggregated dataset for daily-level predictions (Option B):
# one row per calendar date.
daily_agg = df.groupby('Date').agg({
    'Foot Traffic': 'mean',  # assuming consistent value per date
    'Max Temperature (°F)': 'mean',
    'Min Temperature (°F)': 'mean',
    'Max Wind Speed (mph)': 'mean',
    'Avg Wind Speed (mph)': 'mean',
    'Precipitation (in)': 'mean',
    'Special Event': 'max',  # binary flag, if any 1 => 1
    'Total Sales': _sum_or_nan,  # stays NaN when the date has no sales values
    'Total Transactions': _sum_or_nan  # same missing-data handling as sales
}).reset_index()

# Create the category-level dataset for "Total Sales by Category" prediction (Option A):
# keep only the rows that actually carry a per-category sales value.
category_level = df.dropna(subset=['Total Sales by Category'])


In [None]:
# Show the daily-level aggregate (one row per date) as this cell's output.
daily_agg

In [None]:
# Show the rows that have a 'Total Sales by Category' value as this cell's output.
category_level

In [None]:
## FOOT TRAFFIC PREDICTION (RANDOM FOREST REGRESSOR)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt

# Predictor columns for the foot-traffic model
features = [
    'Max Temperature (°F)', 'Min Temperature (°F)',
    'Max Wind Speed (mph)', 'Avg Wind Speed (mph)',
    'Precipitation (in)', 'Special Event',
]

# Keep only the days where the target and every predictor are present
foot_traffic_df = daily_agg.dropna(subset=['Foot Traffic', *features])

X = foot_traffic_df.loc[:, features]
y = foot_traffic_df.loc[:, 'Foot Traffic']

# Hold out 20% of the days for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the predictors with statistics learned from the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the random forest, then predict the held-out days
model = RandomForestRegressor(random_state=42).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Hold-out error metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Actual vs predicted, drawn in test-set row order
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_test.values, label='Actual Foot Traffic', marker='o')
ax.plot(y_pred, label='Predicted Foot Traffic (RF)', marker='x')
ax.set_title('Actual vs Predicted Foot Traffic')
ax.set_xlabel('Test Sample Index')
ax.set_ylabel('Foot Traffic')
ax.legend()
ax.grid(True)
fig.tight_layout()

# Report the metrics before the figure is rendered
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")
plt.show()


In [None]:
## FOOT TRAFFIC PREDICTION (GRADIENT BOOSTING REGRESSOR)

from sklearn.ensemble import GradientBoostingRegressor

# Fit a gradient boosting regressor on the same scaled train/test split that
# the random forest cell above produced, so the two models are comparable.
gbr_model = GradientBoostingRegressor(random_state=42).fit(X_train_scaled, y_train)
y_pred_gbr = gbr_model.predict(X_test_scaled)

# Hold-out error metrics for the boosted model
rmse_gbr = np.sqrt(mean_squared_error(y_test, y_pred_gbr))
r2_gbr = r2_score(y_test, y_pred_gbr)

# Actual vs predicted, drawn in test-set row order
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_test.values, label='Actual Foot Traffic', marker='o')
ax.plot(y_pred_gbr, label='Predicted Foot Traffic (GBR)', marker='x')
ax.set_title('Actual vs Predicted Foot Traffic')
ax.set_xlabel('Test Sample Index')
ax.set_ylabel('Foot Traffic')
ax.legend()
ax.grid(True)
fig.tight_layout()

# Report the metrics before the figure is rendered
print(f"RMSE: {rmse_gbr:.2f}")
print(f"R² Score: {r2_gbr:.2f}")
plt.show()


In [None]:
## TOTAL SALES PREDICTION without Foot Traffic (GRADIENT BOOST REGRESSOR)

# Weather and event predictors only; foot traffic is deliberately left out here
features = [
    'Max Temperature (°F)', 'Min Temperature (°F)',
    'Max Wind Speed (mph)', 'Avg Wind Speed (mph)',
    'Precipitation (in)', 'Special Event',
]

# Keep only the days where the target and every predictor are present
sales_df = daily_agg.dropna(subset=['Total Sales', *features])

X = sales_df.loc[:, features]
y = sales_df.loc[:, 'Total Sales']

# Hold out 20% of the days for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the predictors with statistics learned from the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the gradient boosting regressor, then predict the held-out days
model = GradientBoostingRegressor(random_state=42).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Hold-out error metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

# Actual vs predicted, drawn in test-set row order
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_test.values, label='Actual Total Sales', marker='o')
ax.plot(y_pred, label='Predicted Total Sales', marker='x')
ax.set_title('Actual vs Predicted Total Sales')
ax.set_xlabel('Test Sample Index')
ax.set_ylabel('Total Sales')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
## TOTAL SALES PREDICTION with Foot Traffic (GRADIENT BOOST REGRESSOR)

# Define features and target.
# 'Foot Traffic' must be in this list: it is the only thing that distinguishes
# this experiment from the "without Foot Traffic" run. Previously it was
# filtered on but never passed to the model.
features = ['Max Temperature (°F)', 'Min Temperature (°F)',
            'Max Wind Speed (mph)', 'Avg Wind Speed (mph)',
            'Precipitation (in)', 'Special Event',
            'Foot Traffic']

# Drop rows with missing target or features. The subset is derived from
# `features` so the filter and the model inputs cannot drift apart again.
sales_df = daily_agg.dropna(subset=['Total Sales'] + features)

X = sales_df[features]
y = sales_df['Total Sales']

# Train/test split (20% held out, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Gradient Boosting Regressor
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Evaluation on the held-out split
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

# Plot actual vs predicted, in test-set row order
plt.figure(figsize=(10, 6))
plt.plot(y_test.values, label='Actual Total Sales', marker='o')
plt.plot(y_pred, label='Predicted Total Sales', marker='x')
plt.title('Actual vs Predicted Total Sales')
plt.xlabel('Test Sample Index')
plt.ylabel('Total Sales')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
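# Prerequisite: the xgboost package must be installed for the next cell.
# If it is missing, run `%pip install xgboost` once in this environment.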

In [None]:
## TOTAL SALES (XGBOOST)
from xgboost import XGBRegressor

# Derive calendar features from 'Date' and store them on daily_agg itself.
# dayofweek counts from Monday=0, so 5 and 6 are Saturday and Sunday.
daily_agg['DayOfWeek'] = daily_agg['Date'].dt.dayofweek
daily_agg['Month'] = daily_agg['Date'].dt.month
daily_agg['IsWeekend'] = daily_agg['DayOfWeek'].isin([5, 6]).astype(int)

# Keep only the days where the target and the weather/event predictors are present
sales_df = daily_agg.dropna(
    subset=[
        'Total Sales',
        'Max Temperature (°F)', 'Min Temperature (°F)',
        'Max Wind Speed (mph)', 'Avg Wind Speed (mph)',
        'Precipitation (in)', 'Special Event',
    ]
)

# Weather/event predictors plus the calendar features added above
features = [
    'Max Temperature (°F)', 'Min Temperature (°F)',
    'Max Wind Speed (mph)', 'Avg Wind Speed (mph)',
    'Precipitation (in)', 'Special Event',
    'DayOfWeek', 'Month', 'IsWeekend',
]
X = sales_df.loc[:, features]
y = sales_df.loc[:, 'Total Sales']

# Hold out 20% of the days for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit XGBoost on the unscaled predictors, then predict the held-out days
model = XGBRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out error metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

# Actual vs predicted, drawn in test-set row order
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y_test.values, label='Actual Total Sales', marker='o')
ax.plot(y_pred, label='Predicted Total Sales (XGBoost)', marker='x')
ax.set_title('Actual vs Predicted Total Sales')
ax.set_xlabel('Test Sample Index')
ax.set_ylabel('Total Sales')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()