In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import io

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

import pickle

In [4]:
# Load the test split from the working directory.
# A forward-slash relative path is used because '.\Test.csv' contains the
# invalid escape sequence "\T" and only resolves on Windows.
test_dataset = pd.read_csv('./Test.csv')

In [7]:
# Show the loaded frame as the cell output.
test_dataset

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.750,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.300,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.600,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.0340,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.2300,OUT027,1985,Medium,Tier 3,Supermarket Type3
...,...,...,...,...,...,...,...,...,...,...,...
5676,FDB58,10.500,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1
5677,FDD47,7.600,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2
5678,NCO17,10.000,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,,Tier 2,Supermarket Type1
5679,FDJ26,15.300,Regular,0.000000,Canned,214.6218,OUT017,2007,,Tier 2,Supermarket Type1


In [11]:
# Print summary statistics (count, mean, std, quartiles) of the numeric columns.
print(test_dataset.describe())


       Item_Weight  Item_Visibility     Item_MRP  Outlet_Establishment_Year
count  4705.000000      5681.000000  5681.000000                5681.000000
mean     12.695633         0.065684   141.023273                1997.828903
std       4.664849         0.051252    61.809091                   8.372256
min       4.555000         0.000000    31.990000                1985.000000
25%       8.645000         0.027047    94.412000                1987.000000
50%      12.500000         0.054154   141.415400                1999.000000
75%      16.700000         0.093463   186.026600                2004.000000
max      21.350000         0.323637   266.588400                2009.000000


In [19]:
# Median of the visibility column, computed before any value is overwritten
median_visibility = test_dataset['Item_Visibility'].median()

# Outlier correction: every visibility above 0.193 is replaced by the median
test_dataset['Item_Visibility'] = test_dataset['Item_Visibility'].mask(
    test_dataset['Item_Visibility'] > 0.193,
    median_visibility,
)

# Check after correction: the summary shows whether the extreme values are gone
print(test_dataset.describe())


       Item_Weight  Item_Visibility     Item_MRP  Outlet_Establishment_Year
count  4705.000000      5681.000000  5681.000000                5681.000000
mean     12.695633         0.061917   141.023273                1997.828903
std       4.664849         0.044305    61.809091                   8.372256
min       4.555000         0.000000    31.990000                1985.000000
25%       8.645000         0.027047    94.412000                1987.000000
50%      12.500000         0.054154   141.415400                1999.000000
75%      16.700000         0.089466   186.026600                2004.000000
max      21.350000         0.192767   266.588400                2009.000000


In [23]:
# Helper that detects outliers with the IQR method
def detect_outliers_iqr(data, column, factor=1.5):
    """Return the rows of ``data`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - factor * IQR`` or
    strictly above ``Q3 + factor * IQR``, where Q1/Q3 are the 25th/75th
    percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scan; it is not modified.
    column : str
        Name of the numeric column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR to build the fences. The default keeps
        the behaviour of the original hard-coded 1.5 rule.

    Returns
    -------
    pandas.DataFrame
        The subset of rows flagged as outliers (empty if there are none).
        Rows where the column is NaN are never flagged, because both
        comparisons evaluate to False for NaN.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    return data[(values < lower_bound) | (values > upper_bound)]
# Outlier correction for each numeric column
columns_to_check = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']

for col in columns_to_check:
    print(f"\n🔍 Détection des outliers dans {col}:")
    outliers = detect_outliers_iqr(test_dataset, col)
    print(outliers)

    if col in ['Item_Weight', 'Item_MRP']:
        # Replace the detected outliers with the column median.
        # The rows found by detect_outliers_iqr are reused so that both tails
        # are judged against ONE set of IQR fences. Recomputing the quartiles
        # after the low tail has been overwritten would shift the upper fence.
        # NOTE(review): label-based assignment assumes unique row labels
        # (true for the default index produced by read_csv) - confirm.
        median_value = test_dataset[col].median()
        if not outliers.empty:
            test_dataset.loc[outliers.index, col] = median_value
    elif col == 'Item_Visibility':
        # Per the original author's note, Item_Visibility was already corrected.
        pass
    elif col == 'Outlet_Establishment_Year':
        # Clamp any year that falls outside the expected 1985-2009 range.
        test_dataset.loc[test_dataset[col] < 1985, col] = 1985
        test_dataset.loc[test_dataset[col] > 2009, col] = 2009

# Check the result after correction
print("\n🔧 Statistiques après correction :")
print(test_dataset.describe())



🔍 Détection des outliers dans Item_Weight:
Empty DataFrame
Columns: [Item_Identifier, Item_Weight, Item_Fat_Content, Item_Visibility, Item_Type, Item_MRP, Outlet_Identifier, Outlet_Establishment_Year, Outlet_Size, Outlet_Location_Type, Outlet_Type]
Index: []

🔍 Détection des outliers dans Item_Visibility:
     Item_Identifier  Item_Weight Item_Fat_Content  Item_Visibility  \
134            DRI49       14.150          Low Fat         0.183793   
172            NCP14        8.275          Low Fat         0.184602   
403            FDT50          NaN          Regular         0.189512   
473            FDH28       15.850          Regular         0.184169   
1042           FDP08       20.500          Regular         0.188151   
1286           FDT01       13.650          Regular         0.184133   
1507           FDJ56        8.985               LF         0.183368   
1718           FDA31        7.100          Low Fat         0.184137   
1835           FDJ56        8.985          Low Fat   

In [28]:
# Select the rows whose visibility is below 0.05
# (the original note describes these as "close to zero")
low_visibility = test_dataset.loc[test_dataset['Item_Visibility'] < 0.05]
print(low_visibility)


     Item_Identifier  Item_Weight Item_Fat_Content  Item_Visibility  \
0              FDW58       20.750          Low Fat         0.007565   
1              FDW14        8.300              reg         0.038428   
3              FDQ58        7.315          Low Fat         0.015388   
7              FDC48          NaN          Low Fat         0.015782   
9              FDA36        5.985          Low Fat         0.005698   
...              ...          ...              ...              ...   
5670           FDO03       10.395          Regular         0.037092   
5672           NCH42        6.860          Low Fat         0.036594   
5674           DRL35       15.700          Low Fat         0.030704   
5676           FDB58       10.500          Regular         0.013496   
5679           FDJ26       15.300          Regular         0.000000   

         Item_Type  Item_MRP Outlet_Identifier  Outlet_Establishment_Year  \
0      Snack Foods  107.8622            OUT049                       1

In [35]:
# Number of missing values per column (vectorised; same result as summing
# each column's null mask with a Python-level lambda).
test_dataset.isnull().sum()

Item_Identifier                 0
Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [38]:
# Count the rows that are exact duplicates of an earlier row.
print(test_dataset.duplicated().sum())

0


In [41]:
# Fill the missing weights with the mean of the observed weights
mean_item_weight = test_dataset['Item_Weight'].mean()
test_dataset['Item_Weight'] = test_dataset['Item_Weight'].fillna(mean_item_weight)

In [46]:
# Count the remaining missing values in Item_Weight.
test_dataset['Item_Weight'].isnull().sum()

0

In [49]:

# Frequency of each Outlet_Size category (value_counts skips missing values by default).
test_dataset['Outlet_Size'].value_counts()

Outlet_Size
Medium    1862
Small     1592
High       621
Name: count, dtype: int64

In [52]:
# Fill the missing outlet sizes with the most frequent category,
# then show how many missing values remain.
most_common_size = test_dataset['Outlet_Size'].mode()[0]
test_dataset['Outlet_Size'] = test_dataset['Outlet_Size'].fillna(most_common_size)
test_dataset['Outlet_Size'].isna().sum()

0

In [55]:
# Pairwise correlation matrix restricted to the numeric columns.
test_dataset.corr(numeric_only=True)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year
Item_Weight,1.0,-0.01691,0.045257,0.013548
Item_Visibility,-0.01691,1.0,-0.005132,-0.048016
Item_MRP,0.045257,-0.005132,1.0,-0.007233
Outlet_Establishment_Year,0.013548,-0.048016,-0.007233,1.0
