In [None]:
# Importing libraries
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder

# Load the features, sales and stores tables from CSV files in the working directory.
df_features = pd.read_csv("Features data set.csv")
df_sales = pd.read_csv("sales data-set.csv")
df_stores = pd.read_csv("stores data-set.csv")

# Left-join features onto sales so every sales row is kept, even when no
# feature row shares its (Store, Date, IsHoliday) key. The join compares Date
# exactly as it appears in the CSVs; parsing happens only after the merges.
df = pd.merge(df_sales, df_features, how='left', on=['Store', 'Date', 'IsHoliday'])

# Left-join the stores table onto every row by Store.
df_new = pd.merge(df, df_stores, how='left', on='Store')

# format='mixed' infers the format of each value independently. Without
# dayfirst, an ambiguous string such as 05/02/2010 is read month-first while
# 19/02/2010 can only be read day-first, giving an inconsistent column.
# NOTE(review): dayfirst=True assumes the files store dates as dd/mm/yyyy --
# confirm against the raw CSVs.
df_new['Date'] = pd.to_datetime(df_new['Date'], format='mixed', dayfirst=True)

# Replace every remaining NaN, in every column, with 0.
df_new = df_new.fillna(0)

# Replace the 'Type' column with integer codes so it can be fed to the models
le = LabelEncoder()
df_new['Type'] = le.fit_transform(df_new['Type'])

# Model inputs; the target 'Weekly_Sales' is deliberately not in this list
features = [
    'Dept', 'IsHoliday',
    'Temperature', 'Fuel_Price',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'CPI', 'Unemployment',
    'Type', 'Size',
]

X, y = df_new[features], df_new['Weekly_Sales']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model 1: linear regression fitted on the training split
model1 = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows
y_pred = model1.predict(X_test)

# Error metrics on the test split: RMSE is the square root of the MSE
mse1 = mean_squared_error(y_test, y_pred)
rmse1 = np.sqrt(mse1)
mae1 = mean_absolute_error(y_test, y_pred)

print("Root Mean Squared Error (RMSE) of Model 1:", rmse1)
print(f"Mean Absolute Error (MAE) of Model 1: {mae1}")

# Bar chart of the fitted coefficients, largest (most positive) first.
# The inputs are not rescaled anywhere in the code shown here, so each
# coefficient is expressed in its own feature's units.
coefficients = pd.Series(model1.coef_, index=features)
coefficients = coefficients.sort_values(ascending=False)
plt.figure(figsize=(12, 6))
sns.barplot(x=coefficients, y=coefficients.index)
plt.title("Feature Importance of Model 1")
plt.show()

# Model 2: random forest of 100 trees, seeded so repeated runs give the same fit
model2 = RandomForestRegressor(n_estimators=100, random_state=42)
model2.fit(X_train, y_train)

# Predict on the held-out rows (this overwrites Model 1's y_pred)
y_pred = model2.predict(X_test)

# Calculate mean squared error (mse)
mse2 = mean_squared_error(y_test, y_pred)

# Calculate root mean squared error (rmse)
rmse2 = np.sqrt(mse2)
# Label corrected from "MOdel 2" so it matches the other metric lines
print("Root Mean Squared Error (RMSE) of Model 2:", rmse2)

# Mean absolute error on the same predictions
mae2 = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE) of Model 2: {mae2}")

# Bar chart of the forest's feature importances, largest first
feature_importance = pd.Series(model2.feature_importances_, index=features).sort_values(ascending=False)
plt.figure(figsize=(12, 6))
sns.barplot(x=feature_importance, y=feature_importance.index)
plt.title("Feature Importance of Model 2")
plt.show()

