In [1]:
#necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the dataset
file_path = 'train.csv' 
df = pd.read_csv(file_path)

In [3]:
# 1. Data Preprocessing
# Handle missing values
df['Item_Weight'].fillna(df['Item_Weight'].mean(), inplace=True)
df['Outlet_Size'].fillna('Unknown', inplace=True)

In [5]:
# Encode categorical variables
le = LabelEncoder()
df['Item_Fat_Content'] = le.fit_transform(df['Item_Fat_Content'])
df['Item_Type'] = le.fit_transform(df['Item_Type'])
df['Outlet_Identifier'] = le.fit_transform(df['Outlet_Identifier'])
df['Outlet_Size'] = le.fit_transform(df['Outlet_Size'])
df['Outlet_Location_Type'] = le.fit_transform(df['Outlet_Location_Type'])
df['Outlet_Type'] = le.fit_transform(df['Outlet_Type'])

In [6]:
# 2. Feature Selection
features = df.drop(['Item_Identifier', 'Item_Outlet_Sales'], axis=1)
target = df['Item_Outlet_Sales']

In [7]:
# 3. Model Selection
model = RandomForestRegressor()

In [8]:
# 4. Training and Evaluation
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')

Mean Squared Error: 1185717.756159326
Root Mean Squared Error: 1088.906679270233
R-squared: 0.5637487804877539


In [9]:
# 5. Cross Validation and Hyperparameter Tuning
cv_score = cross_val_score(model, features, target, cv=5, scoring='neg_mean_squared_error')
cv_rmse = (-cv_score.mean()) ** 0.5

print(f'Cross-Validation Root Mean Squared Error: {cv_rmse}')

Cross-Validation Root Mean Squared Error: 1141.2296698550438


In [11]:
from sklearn.model_selection import RandomizedSearchCV

# Define the hyperparameter grid
param_dist = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Use RandomizedSearchCV
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
random_search.fit(features, target)

# Retrieve the best model and parameters
best_model = random_search.best_estimator_
best_params = random_search.best_params_
print(f'Best Parameters: {best_params}')

Best Parameters: {'n_estimators': 150, 'min_samples_split': 10, 'max_depth': 10}
