# Extract the ZIP File

In [1]:
import zipfile

# Source archive and the directory its contents are unpacked into
zip_path = "/content/playground-series-s5e2.zip"  # Update if needed
extract_path = "/content/extracted_data"

# Open the archive read-only and unpack every member; the `with` block
# closes the file handle once extraction finishes (or fails).
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print("Extraction complete. Files are available in:", extract_path)


Extraction complete. Files are available in: /content/extracted_data


# Load the Data

In [2]:
import pandas as pd

# Load the training CSV into a DataFrame
file_path = "/content/extracted_data/train.csv"  # Update if needed
df = pd.read_csv(file_path)

# Report column dtypes / non-null counts, then preview the first 5 rows.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() emits a stray "None" line.
df.info()
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    300000 non-null  int64  
 1   Brand                 290295 non-null  object 
 2   Material              291653 non-null  object 
 3   Size                  293405 non-null  object 
 4   Compartments          300000 non-null  float64
 5   Laptop Compartment    292556 non-null  object 
 6   Waterproof            292950 non-null  object 
 7   Style                 292030 non-null  object 
 8   Color                 290050 non-null  object 
 9   Weight Capacity (kg)  299862 non-null  float64
 10  Price                 300000 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 25.2+ MB
None
   id         Brand Material    Size  Compartments Laptop Compartment  \
0   0      Jansport  Leather  Medium           7.0                Yes   

# Handle Missing Values

In [3]:
# Text-valued columns: a missing entry becomes the explicit level "Unknown"
categorical_cols = [
    "Brand",
    "Material",
    "Size",
    "Laptop Compartment",
    "Waterproof",
    "Style",
    "Color",
]
df[categorical_cols] = df[categorical_cols].fillna(value="Unknown")

# Numeric columns: a missing entry becomes that column's median
numerical_cols = ["Weight Capacity (kg)"]
column_medians = df[numerical_cols].median()
df[numerical_cols] = df[numerical_cols].fillna(value=column_medians)

print("Missing values handled.")


Missing values handled.


# Encode Categorical Variables

In [4]:
# Replace each categorical column with indicator (dummy) columns.
# drop_first=True omits the first level of every column, so that level is
# represented by all of the column's remaining indicators being zero.
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)

encoded_shape = df_encoded.shape
print("Categorical encoding complete. New shape:", encoded_shape)


Categorical encoding complete. New shape: (300000, 29)


# Split Data for Training & Testing

In [5]:
from sklearn.model_selection import train_test_split

# Target is the price; features are every encoded column except the price
# itself and the row identifier.
y = df_encoded["Price"]
X = df_encoded.drop(columns=["id", "Price"])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Data split complete. Train shape:", X_train.shape, "Test shape:", X_test.shape)


Data split complete. Train shape: (240000, 27) Test shape: (60000, 27)


# Train a Machine Learning Model

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Fit a 100-tree random forest on the training split.
# n_jobs=-1 builds (and later evaluates) the trees on all available CPU cores
# instead of one at a time; random_state still pins the randomness, so the
# fitted forest stays reproducible while training takes far less wall time.
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Predict prices for the held-out rows
y_pred = model.predict(X_test)

# Score the hold-out predictions with mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print(f"Model Training Complete! Mean Absolute Error: {mae:.2f}")


Model Training Complete! Mean Absolute Error: 34.34


# Save the Model

In [7]:
import joblib

# Serialise the fitted model to a file in the current working directory
model_filename = "backpack_price_predictor.pkl"
joblib.dump(model, model_filename)

print("Model saved successfully.")


Model saved successfully.


# Load the Test Data (Kaggle Test Set)

In [8]:
# Load the test dataset (Replace with the actual test file path)
test_file_path = "/content/extracted_data/test.csv"  # Update if needed
test_df = pd.read_csv(test_file_path)

# Report column dtypes / non-null counts, then preview the first rows.
# DataFrame.info() prints its own report and returns None, so it is called
# directly: wrapping it in print() emits a stray "None" line.
test_df.info()
print(test_df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    200000 non-null  int64  
 1   Brand                 193773 non-null  object 
 2   Material              194387 non-null  object 
 3   Size                  195619 non-null  object 
 4   Compartments          200000 non-null  float64
 5   Laptop Compartment    195038 non-null  object 
 6   Waterproof            195189 non-null  object 
 7   Style                 194847 non-null  object 
 8   Color                 193215 non-null  object 
 9   Weight Capacity (kg)  199923 non-null  float64
dtypes: float64(2), int64(1), object(7)
memory usage: 15.3+ MB
None
       id   Brand Material    Size  Compartments Laptop Compartment  \
0  300000    Puma  Leather   Small           2.0                 No   
1  300001    Nike   Canvas  Medium           7.0       

# Preprocess the Test Data

In [9]:
# Fill missing values in the test set the same way as for training:
# "Unknown" for categoricals, and the median taken from the *training* frame
# for numericals, so no statistic is derived from the test data.
test_df[categorical_cols] = test_df[categorical_cols].fillna("Unknown")
test_df[numerical_cols] = test_df[numerical_cols].fillna(df[numerical_cols].median())

# One-hot encode WITHOUT drop_first. drop_first removes whichever level sorts
# first in *this* frame; if the test set lacks (or adds) a level relative to
# training, a different baseline would be dropped here and rows would be
# mis-encoded. Encoding every level and then aligning to the training columns
# (below) discards exactly the levels training dropped or never saw.
test_df_encoded = pd.get_dummies(test_df, columns=categorical_cols)

# Training features with no counterpart in the test encoding (a level that
# never occurs in the test set); kept for inspection, zero-filled below.
missing_cols = set(X_train.columns) - set(test_df_encoded.columns)

# Align to the training feature matrix in one step: same columns, same order,
# absent indicators filled with 0, and columns the model never saw removed.
test_df_encoded = test_df_encoded.reindex(columns=X_train.columns, fill_value=0)

print("Test data preprocessing complete.")


Test data preprocessing complete.


# Make Predictions

In [10]:
# Run the fitted model over the aligned test features and attach the
# predictions to the test frame as a new "Price" column.
predicted_prices = model.predict(test_df_encoded)
test_df["Price"] = predicted_prices

# The submission keeps only the row id and its predicted price
submission_columns = ["id", "Price"]
submission = test_df[submission_columns]

print("Predictions complete. Preview:")
print(submission.head(5))


Predictions complete. Preview:
       id      Price
0  300000  77.731506
1  300001  82.134377
2  300002  79.730793
3  300003  68.417504
4  300004  72.907070


# Save Predictions as submission.csv

In [11]:
# Write the id/price pairs to CSV; index=False leaves out the DataFrame's
# row index so the file contains only the two submission columns.
submission.to_csv(path_or_buf="submission.csv", index=False)

print("submission.csv saved successfully.")


submission.csv saved successfully.
