In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [94]:
train_df = pd.read_csv('train.csv')
train_df_extra = pd.read_csv('training_extra.csv')
test_df = pd.read_csv('test.csv')
test_df['Price'] = 100

In [95]:
train_len = len(train_df)
train_extra_len = len(train_df_extra)
test_len = len(test_df)
final_df_len = train_len + train_extra_len

Merge all files to apply the preprocessing steps

In [96]:
df_merged = pd.concat([train_df, train_df_extra, test_df], axis=0, ignore_index=True)

In [97]:
df_merged.shape

(4194318, 11)

In [98]:
df_merged.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [99]:
df_merged.isnull().sum()

id                           0
Brand                   132985
Material                116575
Size                     92166
Compartments                 0
Laptop Compartment      103495
Waterproof               99135
Style                   109333
Color                   140402
Weight Capacity (kg)      1885
Price                        0
dtype: int64

Value counts for all categorical cols

In [100]:
categ_cols = ['Brand', 'Material', 'Size','Color']

for i in range(len(categ_cols)):
    print(df_merged[categ_cols[i]].value_counts())

Brand
Under Armour    841174
Adidas          837173
Nike            802680
Puma            793635
Jansport        786671
Name: count, dtype: int64
Material
Polyester    1113909
Leather      1025175
Nylon         990149
Canvas        948510
Name: count, dtype: int64
Size
Medium    1422262
Large     1377979
Small     1301911
Name: count, dtype: int64
Color
Pink     723018
Gray     699344
Blue     670497
Red      661616
Black    651603
Green    647838
Name: count, dtype: int64


In [101]:
df_merged['Weight Capacity (kg)'].nunique()


2011562

Filling up all NaN vals 

In [102]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy = 'most_frequent')

imputer_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']

df_merged[imputer_cols] = imputer.fit_transform(df_merged[imputer_cols])

In [103]:
imputer_num = SimpleImputer(strategy = 'median')

imputer_num_cols = ['Weight Capacity (kg)']

df_merged[imputer_num_cols] = imputer_num.fit_transform(df_merged[imputer_num_cols])

In [104]:
yes_no_cols = ['Laptop Compartment', 'Waterproof']

df_merged[yes_no_cols] = df_merged[yes_no_cols].replace({'Yes': 1, 'No': 0})

  df_merged[yes_no_cols] = df_merged[yes_no_cols].replace({'Yes': 1, 'No': 0})


In [105]:
df_merged.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,1,0,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,1,1,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,1,0,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,1,0,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,1,1,Messenger,Green,17.749338,86.02312


One Hot Encoding all categorical cols

In [106]:
from sklearn.preprocessing import OneHotEncoder

onehot_cols = ['Brand', 'Material', 'Size', 'Style', 'Color']

onehot = OneHotEncoder()

onehot_df = pd.DataFrame(onehot.fit_transform(df_merged[onehot_cols]).toarray())

onehot_df.columns = onehot.get_feature_names_out(onehot_cols)

df_merged = df_merged.drop(onehot_cols, axis=1)

df_merged = pd.concat([df_merged, onehot_df], axis=1)

df_merged.head()

Unnamed: 0,id,Compartments,Laptop Compartment,Waterproof,Weight Capacity (kg),Price,Brand_Adidas,Brand_Jansport,Brand_Nike,Brand_Puma,...,Size_Small,Style_Backpack,Style_Messenger,Style_Tote,Color_Black,Color_Blue,Color_Gray,Color_Green,Color_Pink,Color_Red
0,0,7.0,1,0,11.611723,112.15875,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1,10.0,1,1,27.078537,68.88056,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2,2.0,1,0,16.64376,39.1732,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,8.0,1,0,12.93722,80.60793,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,1.0,1,1,17.749338,86.02312,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


Split dataset into train and test (original)

In [107]:
final_df = df_merged[:final_df_len]
test_df = df_merged[final_df_len:]

In [108]:
'''corr = final_df.corr()
plt.figure(figsize=(50,50))
sns.heatmap(corr, annot=True)
plt.imsave('corr.png', corr)'''

"corr = final_df.corr()\nplt.figure(figsize=(50,50))\nsns.heatmap(corr, annot=True)\nplt.imsave('corr.png', corr)"

Check Skewness

In [109]:
'''skewed_cols = final_df.skew().sort_values(ascending=False)
skewed_cols'''

'skewed_cols = final_df.skew().sort_values(ascending=False)\nskewed_cols'

In [110]:
test_df = test_df.drop('Price', axis=1)


Split the data into train test split & 
Model training

In [111]:
'''xgboost works but lightgbm is just slightly better than xgboost'''

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

X = final_df.drop(columns = ['Price', 'id'], axis=1)
y = final_df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

'''model = XGBRegressor()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))'''

"model = XGBRegressor()\n\nmodel.fit(X_train, y_train)\n\ny_pred = model.predict(X_test)\n\nprint('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))"

In [112]:
from lightgbm import LGBMRegressor

model2 = LGBMRegressor()

model2.fit(X_train, y_train)

y_pred2 = model2.predict(X_test)

print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred2)))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019546 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 312
[LightGBM] [Info] Number of data points in the train set: 3195454, number of used features: 25
[LightGBM] [Info] Start training from score 81.361311
RMSE: 38.8763339303312


In [113]:
'''Random forest doesnt usually perform well for me. I have included the code so anyone can try'''

'''from sklearn.ensemble import RandomForestRegressor

model3 = RandomForestRegressor()

model3.fit(X_train, y_train)

y_pred3 = model3.predict(X_test)

print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred3)))'''

"from sklearn.ensemble import RandomForestRegressor\n\nmodel3 = RandomForestRegressor()\n\nmodel3.fit(X_train, y_train)\n\ny_pred3 = model3.predict(X_test)\n\nprint('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred3)))"

In [114]:
'''NN is the last resort with proper optimizing it can produce the best results'''

'''import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from tqdm import tqdm  # Import tqdm for progress bar

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Convert data to PyTorch tensors and move to device
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).to(device)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1).to(device)

# Create DataLoader
batch_size = 512
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Define the Neural Network
class PricePredictionNN(nn.Module):
    def __init__(self, input_size):
        super(PricePredictionNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialize model and move to device
input_size = X_train.shape[1]
model = PricePredictionNN(input_size).to(device)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with progress bar
epochs = 50

for epoch in range(epochs):
    epoch_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)

    for batch_X, batch_y in progress_bar:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)  # Move batch to GPU

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    print(f"Epoch [{epoch+1}/{epochs}] - Avg Loss: {epoch_loss/len(train_loader):.4f}")

# Evaluate on test set
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)
    test_loss = criterion(y_pred, y_test_tensor)
    rmse = torch.sqrt(test_loss)

print(f"\nTest Loss (MSE): {test_loss.item():.4f}")
print(f"Test RMSE: {rmse.item():.4f}")'''


'import torch\nimport torch.nn as nn\nimport torch.optim as optim\nfrom torch.utils.data import DataLoader, TensorDataset\nimport numpy as np\nfrom tqdm import tqdm  # Import tqdm for progress bar\n\n# Check for GPU\ndevice = torch.device("cuda" if torch.cuda.is_available() else "cpu")\nprint(f"Using device: {device}")\n\n# Convert data to PyTorch tensors and move to device\nX_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device)\ny_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).to(device)\nX_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(device)\ny_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1).to(device)\n\n# Create DataLoader\nbatch_size = 512\ntrain_dataset = TensorDataset(X_train_tensor, y_train_tensor)\ntrain_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n\n# Define the Neural Network\nclass PricePredictionNN(nn.Module):\n    def __init__(self, input_size):

Predicting vals and saving them in csv files 

In [117]:
test_df = test_df.drop(columns = 'id', axis = 1)

In [118]:
predictions = model2.predict(test_df)

df = pd.read_csv("test.csv")

submission = pd.DataFrame({'id': df['id'],
                           'Price': predictions})
submission.to_csv('submission_2.csv', index = False)

In [67]:
'''test_df_tensor = torch.tensor(test_df.values, dtype=torch.float32).to(device)

# Make predictions
model.eval()
with torch.no_grad():
    predictions = model(test_df_tensor).cpu().numpy()  # Move to CPU & convert to NumPy

# Create submission DataFrame
submission = pd.DataFrame({
    'id': test_df['id'],
    'Price': predictions.flatten()  # Flatten to avoid extra dimensions
})

# Save to CSV
submission.to_csv('submission_1.csv', index=False)'''


"test_df_tensor = torch.tensor(test_df.values, dtype=torch.float32).to(device)\n\n# Make predictions\nmodel.eval()\nwith torch.no_grad():\n    predictions = model(test_df_tensor).cpu().numpy()  # Move to CPU & convert to NumPy\n\n# Create submission DataFrame\nsubmission = pd.DataFrame({\n    'id': test_df['id'],\n    'Price': predictions.flatten()  # Flatten to avoid extra dimensions\n})\n\n# Save to CSV\nsubmission.to_csv('submission_1.csv', index=False)"