### **INFOSYS SPRINGBOARD TASK 1**

IMPORTING REQUIRED LIBRARIES & DATASET

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import make_column_transformer 

In [2]:
data = pd.read_csv('C:\\Users\\Kamlesh P Panchal\\Documents\\Infosys Internship\\train_og\\Train.csv')
data

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Kamlesh P Panchal\\Documents\\Infosys Internship\\train_og\\Train.csv'

### **1. IMPUTATION OF MISSING VALUES**

In [None]:
data.isnull().sum()

In [None]:
# Calculating the mean of 'Item_Visibility' for entries where it is greater than 0
non_zero_mean = data.loc[data['Item_Visibility'] > 0, 'Item_Visibility'].mean()

In [None]:
# Filling missing values in 'Item_Weight' with the median weight
data['Item_Weight'].fillna(data['Item_Weight'].median(), inplace=True)

In [None]:
# Filling missing values in 'Outlet_Size' with the most frequent value (mode)
data['Outlet_Size'].fillna(data['Outlet_Size'].mode()[0], inplace=True)

In [None]:
# Fixing  inconsistencies in the 'Item_Fat_Content' column by standardizing the labels
data['Item_Fat_Content'] = data['Item_Fat_Content'].replace({'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'})

In [None]:
# Checking for duplicates and removing them if any
data.drop_duplicates(inplace=True)

In [None]:
# rechecking again if we find any missing values
data.isnull().sum()

### **2. CATEGORICAL VARIABLE ENCODING**

In [None]:
# Nominal columns (categorical without order)
nominal_columns = ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 
                   'Outlet_Identifier', 'Outlet_Location_Type', 
                   'Outlet_Type'] 

# Ordinal columns (categorical with order)
ordinal_columns = ['Outlet_Size']  

# Continuous (Numerical columns)
numerical_columns = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 
                    'Item_Outlet_Sales', 'Outlet_Establishment_Year']  

# printing them seperatly
print("Nominal Columns: ", nominal_columns)
print("Ordinal Columns: ", ordinal_columns)
print("Numerical Columns: ", numerical_columns)

In [None]:
# mapping the ordinal encoding
Outlet_Size_mapping = ['Small', 'Medium', 'High'] 

In [None]:
# Creating encoders and scaler
ohe = OneHotEncoder(sparse_output=False)  # For nominal columns
ode = OrdinalEncoder(categories=[Outlet_Size_mapping])  # For ordinal column
scaler = StandardScaler()  # For numeric columns

In [None]:
# Creating the column transformer
ct = make_column_transformer(
    (ohe, ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 
            'Outlet_Identifier', 'Outlet_Location_Type', 'Outlet_Type']),  # Nominal columns
    (ode, ['Outlet_Size']),  # Ordinal column
    (scaler, ['Item_Weight', 'Item_Visibility', 'Item_MRP', 
              'Outlet_Establishment_Year', 'Item_Outlet_Sales']),  # Numeric columns
    remainder='passthrough'
)

ct.set_output(transform= 'pandas')

In [None]:
df_encoded = ct.fit_transform(data)
df_encoded

### **3. FEATURE DERIVATION**

In [None]:
# Feature Derivation: Creating a new feature for the store's age
data['Outlet_Age'] = 2024 - data['Outlet_Establishment_Year']

data[['Outlet_Establishment_Year', 'Outlet_Age']].head()

### **4. OUTLIER TREATMENT**

In [None]:
from scipy import stats

In [None]:
data.shape

Using Z-Score Method

In [None]:
# continuous numerical features
continuous_columns = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Item_Outlet_Sales']

# Z-score threshold
z_threshold = 2.5  # as 3 results with 0 outliers & 2 with many so fixed T as 2.5

# outliers based on Z-score for each column
outliers_dict = {}
for col in continuous_columns:
    
    # Z-Score calculation
    data[col + '_zscore'] = np.abs((data[col] - data[col].mean()) / data[col].std())
    
    # outliers where Z-score is greater than threshold
    outliers_dict[col] = data[data[col + '_zscore'] > z_threshold]
    
    # Printing outlier count
    print(f"{col}: {outliers_dict[col].shape[0]} outliers detected.")

plt.figure(figsize=(15, 10))

for i, col in enumerate(continuous_columns, 1):
    plt.subplot(2, 2, i)  # Creating a 2x2 grid of plots
    sns.boxplot(x=data[col])
    plt.title(f'Boxplot of {col} with Outliers (Z-Threshold: {z_threshold})')

plt.tight_layout()
plt.show()


In [None]:
# Removing rows where Z-score for 'Item_Outlet_Sales' is greater than the threshold
z_threshold = 2.5
outliers_removed = data[data['Item_Outlet_Sales_zscore'] <= z_threshold]

print(f"Original dataset size: {data.shape}")
print(f"Dataset size after removing outliers: {outliers_removed.shape}")

Using IQR Method

In [11]:
'''
outliers_dict_iqr = {}
for col in continuous_columns:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    
    # Defining bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Outliers where values are below lower_bound or above upper_bound
    outliers_dict_iqr[col] = data[(data[col] < lower_bound) | (data[col] > upper_bound)]
    
    # Printing outlier count
    print(f"{col}: {outliers_dict_iqr[col].shape[0]} outliers detected using IQR.")

# Removing outliers using IQR for 'Item_Outlet_Sales'
outliers_removed_iqr = data[~((data['Item_Outlet_Sales'] < (data['Item_Outlet_Sales'].quantile(0.25) - 1.5 * (data['Item_Outlet_Sales'].quantile(0.75) - data['Item_Outlet_Sales'].quantile(0.25)))) | 
                             (data['Item_Outlet_Sales'] > (data['Item_Outlet_Sales'].quantile(0.75) + 1.5 * (data['Item_Outlet_Sales'].quantile(0.75) - data['Item_Outlet_Sales'].quantile(0.25)))))]
print(f"Dataset size after removing outliers (IQR): {outliers_removed_iqr.shape}")

'''

NameError: name 'continuous_columns' is not defined

In [8]:
# Assuming 'outliers_removed' is cleaned dataset

scaled_columns = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Item_Outlet_Sales']

# standardizing the columns
scaler = StandardScaler()
outliers_removed[scaled_columns] = scaler.fit_transform(outliers_removed[scaled_columns])

outliers_removed.head()


NameError: name 'outliers_removed' is not defined

In [9]:
# Replacing 'data' with the updated dataset (outliers_removed)
data = outliers_removed
data.head(4)

NameError: name 'outliers_removed' is not defined

### **5. FEATURE SCALING**

In [6]:
from sklearn.preprocessing import MinMaxScaler

# Apply Min-Max Scaling to certain numerical features
min_max_scaler = MinMaxScaler()
data[['Item_MRP', 'Item_Visibility']] = min_max_scaler.fit_transform(data[['Item_MRP', 'Item_Visibility']])

# Check the scaled data
data[['Item_MRP', 'Item_Visibility']].head()

NameError: name 'data' is not defined

### **6. TARGET VARIABLE TRANSFORMATION**

In [7]:
# Investigate the distribution of 'Item_Outlet_Sales'
sns.histplot(data['Item_Outlet_Sales'], kde=True)

# Apply log transformation to address skewness
data['Item_Outlet_Sales_Log'] = np.log1p(data['Item_Outlet_Sales'])

# Check distribution after transformation
sns.histplot(data['Item_Outlet_Sales_Log'], kde=True)


NameError: name 'data' is not defined