**Random Forest Regressor**


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")


In [None]:
#loading the dataset
#train dataset
train = pd.read_csv('notebook/data/rossmann-store-sales/train.csv', parse_dates=['Date'], index_col='Date')
store = pd.read_csv('notebook/data/rossmann-store-sales/store.csv')
#test data
test = pd.read_csv('notebook/data/rossmann-store-sales/test.csv')

train=train.reset_index()

In [1]:


#converting the datetime column to day month and year separate columns
train['Year'] = train['Date'].dt.year
train['Month'] = train['Date'].dt.month
train['day'] = train['Date'].dt.day
train['day_of_week'] = train['Date'].dt.dayofweek
input_cols = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday','Open','SchoolHoliday', 'day', 'Month', 'Year']
target_col = 'Sales'
# Define input and target columns
X = train[input_cols]
y = train[target_col]

# Convert all categorical columns to strings (to handle any categorical data)
categorical_cols = X.select_dtypes(include=['object']).columns
X[categorical_cols] = X[categorical_cols].astype(str)

# Define numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int', 'float']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Preprocessor using ColumnTransformer with SimpleImputer, StandardScaler, and OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),  # Impute missing numerical values with the mean
            ('scaler', StandardScaler())                 # Standardize numerical features
        ]), numerical_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing categorical values with the most frequent value
            ('onehot', OneHotEncoder(handle_unknown='ignore'))    # Encode categorical features with OneHotEncoder
        ]), categorical_cols)
    ])

# Define the model (RandomForestRegressor)
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

# Define the pipeline with preprocessor and the model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rf)
])

# Split the data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to train the model and evaluate performance
def try_model(model):
    # Fit the model using the pipeline
    pipeline.fit(X_train, y_train)

    # Generate predictions
    train_preds = pipeline.predict(X_train)
    val_preds = pipeline.predict(X_test)

    # Compute RMSE for train and validation sets
    train_rmse = mean_squared_error(y_train, train_preds, squared=False)
    val_rmse = mean_squared_error(y_test, val_preds, squared=False)
    
    print(f"Train RMSE: {train_rmse}")
    print(f"Validation RMSE: {val_rmse}")
    
    return train_rmse, val_rmse

# Train and evaluate the model
train_rmse, val_rmse = try_model(rf)

# Feature importance
importance_df = pd.DataFrame({
    'feature': numerical_cols.tolist() + list(pipeline.named_steps['preprocessor'].transformers_[1][1]['onehot'].get_feature_names_out(categorical_cols)),
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print(importance_df.head(10))

# Define the predict_input function to handle single input prediction
def predict_input(model, sample_input):
    if sample_input['Open'] == 0:
        return 0.0  # Return 0 prediction if the store is closed

    input_df = pd.DataFrame([sample_input])
    
    # Extract date components
    input_df['Date'] = pd.to_datetime(input_df['Date'])
    input_df['day'] = input_df['Date'].dt.day
    input_df['Month'] = input_df['Date'].dt.month
    input_df['Year'] = input_df['Date'].dt.year

    # Apply preprocessing using the pipeline before prediction
    pred = pipeline.predict(input_df)[0]
    return pred


  train = pd.read_csv('notebook/data/rossmann-store-sales/train.csv', parse_dates=['Date'], index_col='Date')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[categorical_cols] = X[categorical_cols].astype(str)


Train RMSE: 563.9641329585866
Validation RMSE: 1515.3809767595758
          feature  importance
3            Open    0.459827
0           Store    0.373411
2           Promo    0.073461
1       DayOfWeek    0.032826
5             day    0.025534
6           Month    0.024088
7            Year    0.005985
4   SchoolHoliday    0.004212
8  StateHoliday_0    0.000353
9  StateHoliday_a    0.000179


