In [263]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [264]:
df = pd.read_csv('named_train.csv')
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Y
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,8.23
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,6.09
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,7.65
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,6.6
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,6.9


In [265]:
df['Y'].isnull().sum()

0

In [266]:
df.isnull().sum()

Item_Identifier                 0
Item_Weight                  1006
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1711
Outlet_Location_Type            0
Outlet_Type                     0
Y                               0
dtype: int64

In [267]:
# Calculate the mean of Item_Weight for each Item_Identifier
item_weight_mean = df.groupby('Item_Identifier')['Item_Weight'].transform('mean')
# Impte missing values with the mean of the corresponding Item_Identifier
df['Item_Weight'].fillna(item_weight_mean, inplace=True)
# If there are still missing values, impute with the overall mean
df['Item_Weight'].fillna(df['Item_Weight'].mean(), inplace=True)

df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Item_Weight'].fillna(item_weight_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Item_Weight'].fillna(df['Item_Weight'].mean(), inplace=True)


Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1711
Outlet_Location_Type            0
Outlet_Type                     0
Y                               0
dtype: int64

In [268]:
outlet_size_summary = df.pivot_table(values='Item_Identifier', index='Outlet_Identifier', columns='Outlet_Size', aggfunc='count', fill_value=0)
print(outlet_size_summary)

Outlet_Size        High  Medium  Small
Outlet_Identifier                     
OUT013              672       0      0
OUT018                0     637      0
OUT019                0       0    347
OUT027                0     659      0
OUT035                0       0    671
OUT046                0       0    664
OUT049                0     639      0


In [269]:
indices = df['Outlet_Size'].isnull()
print(df.loc[indices, 'Outlet_Identifier'].value_counts())

Outlet_Identifier
OUT045    677
OUT017    644
OUT010    390
Name: count, dtype: int64


In [270]:
print(indices.sum())

1711


In [271]:
print(indices)

0       False
1       False
2       False
3        True
4       False
        ...  
5995     True
5996    False
5997     True
5998    False
5999    False
Name: Outlet_Size, Length: 6000, dtype: bool


In [272]:
# Create a new Series with the same index as df and set the desired indices to True
indices.iloc[[0, 4]] = True

In [273]:
df_test = df[indices]

df = df[~indices]
df.drop(['Item_Type'], axis=1, inplace=True)

In [274]:
df_test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Y
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,8.23
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,6.6
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,6.9
8,FDH17,16.2,Regular,0.016687,Frozen Foods,96.9726,OUT045,2002,,Tier 2,Supermarket Type1,6.98
9,FDU28,19.2,Regular,0.09445,Frozen Foods,187.8214,OUT017,2007,,Tier 2,Supermarket Type1,8.46


In [275]:
df['Item_Identifier'] = df['Item_Identifier'].str.slice(0, 2)
df['Item_Visibility'] = np.sqrt(df['Item_Visibility'])
df.drop('Outlet_Identifier', axis = 1, inplace = True)   
df_test.drop('Outlet_Identifier', axis = 1, inplace = True)
df["Item_Fat_Content"] = df["Item_Fat_Content"].replace({ "low fat": "LF", "Low Fat": "LF", "Regular": "REG", "reg": "REG" })
df_test["Item_Fat_Content"] = df_test["Item_Fat_Content"].replace({ "low fat": "LF", "Low Fat": "LF", "Regular": "REG", "reg": "REG" })
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Y
1,DR,5.92,REG,0.138846,48.2692,2009,Medium,Tier 3,Supermarket Type2,6.09
2,FD,17.5,LF,0.129461,141.618,1999,Medium,Tier 1,Supermarket Type1,7.65
5,FD,10.395,REG,0.0,51.4008,2009,Medium,Tier 3,Supermarket Type2,6.32
6,FD,13.65,REG,0.112876,57.6588,1987,High,Tier 3,Supermarket Type1,5.84
7,FD,19.0,LF,0.357029,107.7622,1985,Medium,Tier 3,Supermarket Type3,8.3


In [276]:
df_test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Y
0,FDA15,9.3,LF,0.016047,Dairy,249.8092,1999,Medium,Tier 1,Supermarket Type1,8.23
3,FDX07,19.2,REG,0.0,Fruits and Vegetables,182.095,1998,,Tier 3,Grocery Store,6.6
4,NCD19,8.93,LF,0.0,Household,53.8614,1987,High,Tier 3,Supermarket Type1,6.9
8,FDH17,16.2,REG,0.016687,Frozen Foods,96.9726,2002,,Tier 2,Supermarket Type1,6.98
9,FDU28,19.2,REG,0.09445,Frozen Foods,187.8214,2007,,Tier 2,Supermarket Type1,8.46


In [277]:
df['Item_Identifier'].value_counts()

Item_Identifier
FD    3107
NC     782
DR     398
Name: count, dtype: int64

In [278]:
df.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_MRP                     0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Y                            0
dtype: int64

In [279]:
# Define the categories for each column
categories = [
    ['LF', 'REG'],  # Item_Fat_Content
    ['Tier 1', 'Tier 3', 'Tier 2'],  # Outlet_Location_Type
    ['Grocery Store', 'Supermarket Type1', 'Supermarket Type2', 'Supermarket Type3'],  # Outlet_Type
    ['Small', 'Medium', 'High']  # Outlet_Size
]

# Initialize the OrdinalEncoder with the specified categories
ordinal_encoder = OrdinalEncoder(categories=categories)

# Ensure the columns contain only the specified categories
# df['Item_Fat_Content'] = df['Item_Fat_Content'].astype('category').cat.set_categories(categories[0])
# df['Outlet_Location_Type'] = df['Outlet_Location_Type'].astype('category').cat.set_categories(categories[1])
# df['Outlet_Type'] = df['Outlet_Type'].astype('category').cat.set_categories(categories[2])
# df['Outlet_Size'] = df['Outlet_Size'].astype('category').cat.set_categories(categories[3])

# Fit and transform the specified columns
df.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_MRP                     0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Y                            0
dtype: int64

In [280]:
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Y
1,DR,5.92,REG,0.138846,48.2692,2009,Medium,Tier 3,Supermarket Type2,6.09
2,FD,17.5,LF,0.129461,141.618,1999,Medium,Tier 1,Supermarket Type1,7.65
5,FD,10.395,REG,0.0,51.4008,2009,Medium,Tier 3,Supermarket Type2,6.32
6,FD,13.65,REG,0.112876,57.6588,1987,High,Tier 3,Supermarket Type1,5.84
7,FD,19.0,LF,0.357029,107.7622,1985,Medium,Tier 3,Supermarket Type3,8.3


In [281]:

trnasformed = ordinal_encoder.fit_transform(
    df[['Item_Fat_Content', 'Outlet_Location_Type', 'Outlet_Type', 'Outlet_Size']]
)


In [282]:
df[['Item_Fat_Content', 'Outlet_Location_Type', 'Outlet_Type', 'Outlet_Size']] = trnasformed
df.head()


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Y
1,DR,5.92,1.0,0.138846,48.2692,2009,1.0,1.0,2.0,6.09
2,FD,17.5,0.0,0.129461,141.618,1999,1.0,0.0,1.0,7.65
5,FD,10.395,1.0,0.0,51.4008,2009,1.0,1.0,2.0,6.32
6,FD,13.65,1.0,0.112876,57.6588,1987,2.0,1.0,1.0,5.84
7,FD,19.0,0.0,0.357029,107.7622,1985,1.0,1.0,3.0,8.3


In [283]:
df.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_MRP                     0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Y                            0
dtype: int64

In [284]:
df['Item_Identifier'].value_counts()

Item_Identifier
FD    3107
NC     782
DR     398
Name: count, dtype: int64

In [285]:
df.shape

(4287, 10)

In [286]:
# Initialize the OneHotEncoder
one_hot_encoder = OneHotEncoder()

# Fit and transform the Item_Identifier column
item_identifier_encoded = one_hot_encoder.fit_transform(df[['Item_Identifier']])

# Create a DataFrame with the encoded columns
item_identifier_encoded_df = pd.DataFrame(item_identifier_encoded.toarray(), columns=one_hot_encoder.get_feature_names_out(['Item_Identifier']))

# Reset index to ensure alignment
df.reset_index(drop=True, inplace=True)
item_identifier_encoded_df.reset_index(drop=True, inplace=True)

# Concatenate the encoded DataFrame with the original DataFrame
df = pd.concat([df, item_identifier_encoded_df], axis=1)



In [287]:
df.shape

(4287, 13)

In [288]:
item_identifier_encoded.shape

(4287, 3)

In [289]:
# Convert the sparse matrix to a DataFrame


# Concatenate the encoded DataFrame with the original DataFra

df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Y,Item_Identifier_DR,Item_Identifier_FD,Item_Identifier_NC
0,DR,5.92,1.0,0.138846,48.2692,2009,1.0,1.0,2.0,6.09,1.0,0.0,0.0
1,FD,17.5,0.0,0.129461,141.618,1999,1.0,0.0,1.0,7.65,0.0,1.0,0.0
2,FD,10.395,1.0,0.0,51.4008,2009,1.0,1.0,2.0,6.32,0.0,1.0,0.0
3,FD,13.65,1.0,0.112876,57.6588,1987,2.0,1.0,1.0,5.84,0.0,1.0,0.0
4,FD,19.0,0.0,0.357029,107.7622,1985,1.0,1.0,3.0,8.3,0.0,1.0,0.0


In [290]:
df.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_MRP                     0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Y                            0
Item_Identifier_DR           0
Item_Identifier_FD           0
Item_Identifier_NC           0
dtype: int64

In [291]:
df_features = df.drop(['Outlet_Size'], axis=1)
df_Target = df['Outlet_Size']


In [292]:
# Fill missing Item_Weight values in df_test
df_test['Item_Identifier'] = df_test['Item_Identifier'].str.slice(0, 2)
df_test['Item_Visibility'] = np.sqrt(df_test['Item_Visibility'])
df_test["Item_Fat_Content"] = df_test["Item_Fat_Content"].replace({"low fat": "LF", "Low Fat": "LF", "Regular": "REG", "reg": "REG"})
df_test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Y
0,FD,9.3,LF,0.126678,Dairy,249.8092,1999,Medium,Tier 1,Supermarket Type1,8.23
3,FD,19.2,REG,0.0,Fruits and Vegetables,182.095,1998,,Tier 3,Grocery Store,6.6
4,NC,8.93,LF,0.0,Household,53.8614,1987,High,Tier 3,Supermarket Type1,6.9
8,FD,16.2,REG,0.129179,Frozen Foods,96.9726,2002,,Tier 2,Supermarket Type1,6.98
9,FD,19.2,REG,0.307327,Frozen Foods,187.8214,2007,,Tier 2,Supermarket Type1,8.46


In [293]:
df_test.shape

(1713, 11)

In [294]:

# Apply ordinal encoding to df_test
# Define the categories for each column
categories_test = [
    ['LF', 'REG'],  # Item_Fat_Content
    ['Tier 1', 'Tier 3', 'Tier 2'],  # Outlet_Location_Type
    ['Grocery Store', 'Supermarket Type1', 'Supermarket Type2', 'Supermarket Type3']  # Outlet_Type
]

# Initialize the OrdinalEncoder with the specified categories
ordinal_encoder_test = OrdinalEncoder(categories=categories_test)

# Fit and transform the specified columns
df_test[['Item_Fat_Content', 'Outlet_Location_Type', 'Outlet_Type']] = ordinal_encoder_test.fit_transform(
    df_test[['Item_Fat_Content', 'Outlet_Location_Type', 'Outlet_Type']]
)

# Apply one-hot encoding to Item_Identifier in df_test
item_identifier_encoded_test = one_hot_encoder.transform(df_test[['Item_Identifier']])
item_identifier_encoded_df_test = pd.DataFrame(item_identifier_encoded_test.toarray(), columns=one_hot_encoder.get_feature_names_out(['Item_Identifier']))

# Reset index to ensure alignment
df_test.reset_index(drop=True, inplace=True)
item_identifier_encoded_df_test.reset_index(drop=True, inplace=True)

# Concatenate the encoded DataFrame with df_test
df_test = pd.concat([df_test, item_identifier_encoded_df_test], axis=1)

# Drop the original Item_Identifier column from df_test
df_test.drop('Item_Identifier', axis=1, inplace=True)

df_test.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Y,Item_Identifier_DR,Item_Identifier_FD,Item_Identifier_NC
0,9.3,0.0,0.126678,Dairy,249.8092,1999,Medium,0.0,1.0,8.23,0.0,1.0,0.0
1,19.2,1.0,0.0,Fruits and Vegetables,182.095,1998,,1.0,0.0,6.6,0.0,1.0,0.0
2,8.93,0.0,0.0,Household,53.8614,1987,High,1.0,1.0,6.9,0.0,0.0,1.0
3,16.2,1.0,0.129179,Frozen Foods,96.9726,2002,,2.0,1.0,6.98,0.0,1.0,0.0
4,19.2,1.0,0.307327,Frozen Foods,187.8214,2007,,2.0,1.0,8.46,0.0,1.0,0.0


In [295]:
df_test.shape

(1713, 13)

In [296]:
df_test.drop(['Outlet_Size','Item_Type'], axis=1, inplace=True)
df_features.drop(['Item_Identifier'], axis=1, inplace=True)

In [297]:
df_test.columns

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_MRP',
       'Outlet_Establishment_Year', 'Outlet_Location_Type', 'Outlet_Type', 'Y',
       'Item_Identifier_DR', 'Item_Identifier_FD', 'Item_Identifier_NC'],
      dtype='object')

In [298]:
df_features.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Type,Y,Item_Identifier_DR,Item_Identifier_FD,Item_Identifier_NC
0,5.92,1.0,0.138846,48.2692,2009,1.0,2.0,6.09,1.0,0.0,0.0
1,17.5,0.0,0.129461,141.618,1999,0.0,1.0,7.65,0.0,1.0,0.0
2,10.395,1.0,0.0,51.4008,2009,1.0,2.0,6.32,0.0,1.0,0.0
3,13.65,1.0,0.112876,57.6588,1987,1.0,1.0,5.84,0.0,1.0,0.0
4,19.0,0.0,0.357029,107.7622,1985,1.0,3.0,8.3,0.0,1.0,0.0


In [299]:
print(df_test.shape)
print(df_features.shape)

(1713, 11)
(4287, 11)


In [300]:
df_Target.isnull().sum()

0

In [301]:
df_Target.value_counts()

Outlet_Size
1.0    1934
0.0    1682
2.0     671
Name: count, dtype: int64

In [302]:
import optuna
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# Define the objective function for Optuna
def objective(trial):
    # Define the hyperparameters to tune
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5)
    }

    # Initialize the XGBoost classifier with the hyperparameters
    model = XGBClassifier(**param, use_label_encoder=False, eval_metric='mlogloss')

    # Perform cross-validation and return the mean accuracy
    accuracy = cross_val_score(model, df_features, df_Target, cv=5, scoring='accuracy').mean()
    return accuracy

# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

# Print the best hyperparameters and the best accuracy
print('Best hyperparameters: ', study.best_params)
print('Best accuracy: ', study.best_value)

[I 2024-12-18 21:27:05,252] A new study created in memory with name: no-name-92bc6bc6-bfa9-425d-a9d3-b267831f3c0e
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2024-12-18 21:27:05,766] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 91, 'max_depth': 3, 'learning_rate': 0.010960112224494604, 'subsample': 0.5869726884181026, 'colsample_bytree': 0.6089523620456533, 'gamma': 1.1543536524490872, 'reg_alpha': 1.2050734865235657, 'reg_lambda': 0.2680719124936337}. Best is trial 0 with value: 1.0.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2024-12-18 21:27:

Best hyperparameters:  {'n_estimators': 91, 'max_depth': 3, 'learning_rate': 0.010960112224494604, 'subsample': 0.5869726884181026, 'colsample_bytree': 0.6089523620456533, 'gamma': 1.1543536524490872, 'reg_alpha': 1.2050734865235657, 'reg_lambda': 0.2680719124936337}
Best accuracy:  1.0


In [303]:
# Train the model with the best hyperparameters
best_model = XGBClassifier(**study.best_params, use_label_encoder=False, eval_metric='mlogloss')
best_model.fit(df_features, df_Target)

# Get feature importances
feature_importances = best_model.feature_importances_

# Create a DataFrame for better visualization
feature_importances_df = pd.DataFrame({
    'Feature': df_features.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print(feature_importances_df)

Parameters: { "use_label_encoder" } are not used.



                      Feature  Importance
4   Outlet_Establishment_Year    0.332257
6                 Outlet_Type    0.326358
5        Outlet_Location_Type    0.261088
7                           Y    0.055553
2             Item_Visibility    0.011235
3                    Item_MRP    0.004403
8          Item_Identifier_DR    0.003906
0                 Item_Weight    0.003225
9          Item_Identifier_FD    0.001976
1            Item_Fat_Content    0.000000
10         Item_Identifier_NC    0.000000


In [304]:
# Apply the best parameters to the model
best_model = XGBClassifier(**study.best_params, use_label_encoder=False, eval_metric='mlogloss')

# Train the model on the training data
best_model.fit(df_features, df_Target)

# Predict the target for df_test
df_test_predictions = best_model.predict(df_test)

# Print the predictions
df_test['Size_predicted'] = df_test_predictions
df_test['Size_predicted'].value_counts()

Parameters: { "use_label_encoder" } are not used.



Size_predicted
0    1711
1       1
2       1
Name: count, dtype: int64

In [305]:
df_test.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Type,Y,Item_Identifier_DR,Item_Identifier_FD,Item_Identifier_NC,Size_predicted
0,9.3,0.0,0.126678,249.8092,1999,0.0,1.0,8.23,0.0,1.0,0.0,1
1,19.2,1.0,0.0,182.095,1998,1.0,0.0,6.6,0.0,1.0,0.0,0
2,8.93,0.0,0.0,53.8614,1987,1.0,1.0,6.9,0.0,0.0,1.0,2
3,16.2,1.0,0.129179,96.9726,2002,2.0,1.0,6.98,0.0,1.0,0.0,0
4,19.2,1.0,0.307327,187.8214,2007,2.0,1.0,8.46,0.0,1.0,0.0,0


In [306]:
df_test.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Type,Y,Item_Identifier_DR,Item_Identifier_FD,Item_Identifier_NC,Size_predicted
0,9.3,0.0,0.126678,249.8092,1999,0.0,1.0,8.23,0.0,1.0,0.0,1
1,19.2,1.0,0.0,182.095,1998,1.0,0.0,6.6,0.0,1.0,0.0,0
2,8.93,0.0,0.0,53.8614,1987,1.0,1.0,6.9,0.0,0.0,1.0,2
3,16.2,1.0,0.129179,96.9726,2002,2.0,1.0,6.98,0.0,1.0,0.0,0
4,19.2,1.0,0.307327,187.8214,2007,2.0,1.0,8.46,0.0,1.0,0.0,0


# all missing is small
