### 1. IMPORTING REQUIRED LIBRARIES

In [82]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings("ignore")

### 2. LOAD THE DATASET

In [83]:
# Load the training data into a DataFrame and preview the first rows.
# The path uses forward slashes: the earlier backslash-separated literal only worked
# because "\I" and "\T" happen not to be recognised escape sequences (newer Python
# versions warn about that), and it resolved on Windows only. "/" is accepted as a
# separator on every platform, Windows included.
# The path is relative to the notebook's working directory.
df = pd.read_csv('Downloads/Infosys Springboard/Train.csv')
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


### 3. INFORMATION ABOUT THE DATA

In [84]:
# Show the frame's dimensions as a (rows, columns) tuple, followed by each column's dtype.
# For this file the dimensions are 8523 rows × 12 columns.
print("The dataset consists : ", (len(df.index), len(df.columns)))
print(df.dtypes)

The dataset consists :  (8523, 12)
Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object


### 4. HANDLING MISSING VALUES

In [85]:
# Count the missing entries in each column (isna is the alias of isnull)
missing_values = df.isna().sum()
missing_values

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

Observations:
    Item_Weight has 1,463 missing values and Outlet_Size has 2,410; every other column is complete.

In [86]:
# Impute Item_Weight with the median weight of the row's Item_Type.
# transform('median') broadcasts each group's median back onto that group's rows, so
# fillna only touches the missing entries and leaves observed weights as they are.
item_type_median_weight = df.groupby('Item_Type')['Item_Weight'].transform('median')
df['Item_Weight'] = df['Item_Weight'].fillna(item_type_median_weight)
# An Item_Type with no recorded weight at all has a NaN median; fall back to the
# overall median so such rows do not silently stay missing.
df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Weight'].median())


def _most_frequent(values):
    """Return the most frequent non-null value of ``values``, or NaN when there is none.

    ``Series.mode()`` is empty for an all-null group, so indexing it with ``[0]``
    would raise; this helper returns NaN for that case instead.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Impute Outlet_Size with the most common size among outlets of the same Outlet_Type.
outlet_type_mode_size = df.groupby('Outlet_Type')['Outlet_Size'].transform(_most_frequent)
df['Outlet_Size'] = df['Outlet_Size'].fillna(outlet_type_mode_size)
# Outlet types with no recorded size fall back to the overall most common size,
# provided at least one size is recorded anywhere.
overall_size_modes = df['Outlet_Size'].mode()
if not overall_size_modes.empty:
    df['Outlet_Size'] = df['Outlet_Size'].fillna(overall_size_modes.iloc[0])



In [87]:
# Confirm the imputation left no missing values: per-column null counts
df.isna().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [88]:
# Substitute every visibility equal to 0 with the median of the strictly positive visibilities
median_visibility = df.loc[df['Item_Visibility'] > 0, 'Item_Visibility'].median()
df['Item_Visibility'] = df['Item_Visibility'].mask(df['Item_Visibility'] == 0, median_visibility)

In [89]:
# Fold the alternate spellings of the fat-content labels into 'Low Fat' / 'Regular'.
# The two lists are matched position by position: old label -> canonical label.
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(
    ['low fat', 'LF', 'reg'],
    ['Low Fat', 'Low Fat', 'Regular'],
)
print(df['Item_Fat_Content'].unique())

['Low Fat' 'Regular']


### 4. FEATURE DERIVATION

In [90]:
# Outlet age in years, measured against a fixed reference year
current_year = 2024
# rsub computes current_year - Outlet_Establishment_Year element-wise
df['Outlet_Age'] = df['Outlet_Establishment_Year'].rsub(current_year)

In [91]:
# Per-column null counts, now including the derived Outlet_Age column
df.isna().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
Outlet_Age                   0
dtype: int64

In [92]:
# Row and column counts of the frame at this point in the notebook
rows = df.shape[0]
columns = df.shape[1]
print("Total number of rows:", rows)
print("Total number of columns:", columns)

Total number of rows: 8523
Total number of columns: 13


In [93]:
# Remove the two identifier columns and the establishment year
# (Outlet_Age was derived from the latter in an earlier cell)
df = df.drop(
    ['Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year'],
    axis='columns',
)

### 5. CATEGORICAL VARIABLE ENCODING

In [94]:
# Column groups, one per preprocessing strategy
numerical_columns = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Age']
ordinal_columns = ['Outlet_Size']
nominal_columns = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Location_Type',
    'Outlet_Type',
]

# Preprocessors
scaler = StandardScaler()
ordinal_encoder = OrdinalEncoder(
    categories=[['Small', 'Medium', 'High']],  # explicit rank order for Outlet_Size
    handle_unknown='use_encoded_value',
    unknown_value=-1,  # code assigned to sizes not listed above
)
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Assemble the transformer; the tuple order fixes the order of the output columns
ct = make_column_transformer(
    (one_hot_encoder, nominal_columns),
    (ordinal_encoder, ordinal_columns),
    (scaler, numerical_columns),
    remainder='passthrough',  # columns not listed above pass through unchanged
)

# Have transform()/fit_transform() return DataFrames instead of arrays
ct.set_output(transform='pandas')

### 6. MODEL TRAINING AND EVALUATION

In [96]:

# Separate the sales target from the predictor columns
y = df['Item_Outlet_Sales']
X = df.drop(columns=['Item_Outlet_Sales'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the preprocessing on the training rows only, then reuse it on the held-out rows
X_train_transformed = ct.fit_transform(X_train)
X_test_transformed = ct.transform(X_test)

# Fit the linear regression (fit() returns the estimator itself)
model = LinearRegression().fit(X_train_transformed, y_train)

# Score the held-out predictions
y_pred = model.predict(X_test_transformed)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE = square root of the MSE

# Report the metrics
print("R² Score:", r2)
print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)


R² Score: 0.5779585258916939
Mean Absolute Error (MAE): 793.3824090087573
Root Mean Squared Error (RMSE): 1071.0257202780763
