In [7]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [8]:
# Loading dataset
df = pd.read_csv('Data/train.csv')

In [9]:
# Handle missing values
df.fillna(df.median(numeric_only=True), inplace=True)

In [10]:
label_encoders = {}

# Encoding categorical features
for col in df.select_dtypes(include=['object']).columns:
    df[col].fillna("Unknown", inplace=True)  # Handle missing values
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna("Unknown", inplace=True)  # Handle missing values
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna("Unknown", inplace=True)  # Handle missing values
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate o

In [11]:
# Split features and target
X = df.drop(columns=['Item_Outlet_Sales'])
y = df['Item_Outlet_Sales']

In [12]:
# Standardize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [13]:
# Store column names for consistency
feature_columns = X.columns

In [14]:
# Split data into training and validation sets
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [15]:
# Define deep learning model
model = keras.Sequential([
    keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1)  # Regression output layer
])

# Compile model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=32)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 6988760.5000 - mae: 2043.9088 - val_loss: 1451206.6250 - val_mae: 937.8271
Epoch 2/50
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1522846.8750 - mae: 936.1843 - val_loss: 1216241.0000 - val_mae: 812.7904
Epoch 3/50
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1355098.3750 - mae: 847.3034 - val_loss: 1170092.1250 - val_mae: 789.6332
Epoch 4/50
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1381259.2500 - mae: 854.6055 - val_loss: 1149876.6250 - val_mae: 773.8671
Epoch 5/50
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1318124.0000 - mae: 829.4391 - val_loss: 1138799.7500 - val_mae: 772.1984
Epoch 6/50
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1309885.5000 - mae: 819.4849 - val_loss: 1124789.1250 - val_mae: 

In [16]:
# Evaluate model
loss, mae = model.evaluate(X_val, y_val)
print(f'Validation MAE: {mae}')

[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1058954.8750 - mae: 728.8605
Validation MAE: 730.7075805664062


In [17]:
# Loading test data
test_df = pd.read_csv('Data/test.csv')

In [18]:
# Selecting Item_Identifier and Outlet_Identifier for submission
submission_df = test_df[['Item_Identifier', 'Outlet_Identifier']]

In [19]:
# Preprocess test data
test_df.fillna(test_df.median(numeric_only=True), inplace=True)

label_encoders = {}

# Encoding categorical features
for col in test_df.select_dtypes(include=['object']).columns:
    test_df[col].fillna("Unknown", inplace=True)  # Handle missing values
    label_encoders[col] = LabelEncoder()
    test_df[col] = label_encoders[col].fit_transform(test_df[col])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna("Unknown", inplace=True)  # Handle missing values
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna("Unknown", inplace=True)  # Handle missing values
The behavior will change in pandas 3.0. This inplace method will never work because the inte

In [20]:
# Ensure test features match training features
test_X = test_df[feature_columns]

In [21]:
# Standardize test data
test_X_scaled = scaler.transform(test_X)

In [22]:
# Make predictions
test_predictions = model.predict(test_X_scaled)

[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [23]:
# Saving predictions outcome
submission_df['Item_Outlet_Sales'] = test_predictions.flatten()
submission_df.to_csv('DL_submission_v6.csv', index=False)
print("Predictions saved")


Predictions saved


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_df['Item_Outlet_Sales'] = test_predictions.flatten()
