In [27]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [39]:
data = pd.read_csv('cleaned_data.csv', sep=',')
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [40]:
num_columns = ['Item_Weight','Item_Visibility','Item_MRP','Outlet_Establishment_Year']
nominal_columns = ['Item_Type','Outlet_Location_Type','Outlet_Type','Outlet_Identifier']
ordinal_columns = ['Item_Fat_Content','Outlet_Size']

In [43]:
num_pipeline = Pipeline([
    ('poly', PolynomialFeatures()),
    ('scaler',StandardScaler())
    ])

ordinal_pipeline = Pipeline([
    ('ord_encoder',OrdinalEncoder())
])

nominal_pipeline = Pipeline([
    ('nom_encoder',OneHotEncoder(drop='first',sparse_output = False))
])

preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,num_columns),
    ('ordinal_pipeline',ordinal_pipeline,ordinal_columns),
    ('nominal_pipeline',nominal_pipeline,nominal_columns)
]).set_output(transform = 'pandas')

In [44]:
y = np.log(data['Item_Outlet_Sales'])
X = data.drop(['Item_Outlet_Sales','Item_Identifier'], axis=1)
processed_data = preprocessor.fit_transform(X)
processed_data.head()

Unnamed: 0,num_pipeline__1,num_pipeline__Item_Weight,num_pipeline__Item_Visibility,num_pipeline__Item_MRP,num_pipeline__Outlet_Establishment_Year,num_pipeline__Item_Weight^2,num_pipeline__Item_Weight Item_Visibility,num_pipeline__Item_Weight Item_MRP,num_pipeline__Item_Weight Outlet_Establishment_Year,num_pipeline__Item_Visibility^2,...,nominal_pipeline__Outlet_Type_Supermarket Type3,nominal_pipeline__Outlet_Identifier_OUT013,nominal_pipeline__Outlet_Identifier_OUT017,nominal_pipeline__Outlet_Identifier_OUT018,nominal_pipeline__Outlet_Identifier_OUT019,nominal_pipeline__Outlet_Identifier_OUT027,nominal_pipeline__Outlet_Identifier_OUT035,nominal_pipeline__Outlet_Identifier_OUT045,nominal_pipeline__Outlet_Identifier_OUT046,nominal_pipeline__Outlet_Identifier_OUT049
0,0.0,-0.771414,-0.97051,1.742038,0.075529,-0.823798,-0.915253,0.457786,-0.770817,-0.635312,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,-1.50004,-0.907291,-1.489291,1.309529,-1.243874,-0.961588,-1.415109,-1.493357,-0.624431,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.996261,-0.956563,0.007388,0.075529,0.97071,-0.725151,0.600348,0.997584,-0.633082,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.36273,-1.284505,0.656363,-0.047871,1.480173,-1.112188,1.536036,1.362132,-0.659862,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,-0.851175,-1.284505,-1.39963,-1.40527,-0.878877,-1.112188,-1.23565,-0.862172,-0.659862,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
X_train, X_test, y_train, y_test = train_test_split(processed_data, y, test_size=0.2, random_state=42)

## Linear Regression Model 

In [46]:
linear_reg = LinearRegression()

# Train the model
linear_reg.fit(X_train, y_train)

# Make predictions
y_pred = linear_reg.predict(X_test)

# Evaluate the model (mean squared error)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Linear Regression Mean Squared Error: {mse}")
print(f"Root mean Squared Error", rmse)

Linear Regression Mean Squared Error: 0.2725303757054291
Root mean Squared Error 0.5220444192838662


In [49]:
from sklearn.linear_model import Ridge

# Initialize the Ridge Regression model with regularization strength (alpha)
ridge_reg = Ridge(alpha=1.0)  # Adjust alpha for regularization strength

# Train the model
ridge_reg.fit(X_train, y_train)

# Make predictions
y_pred_ridge = ridge_reg.predict(X_test)

# Evaluate the model (mean squared error)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)
print(f"Ridge Regression Mean Squared Error: {mse_ridge}")
print(f"Root mean Squared Error", rmse_ridge)


Ridge Regression Mean Squared Error: 0.27248832788391103
Root mean Squared Error 0.5220041454662128
