In [1]:
# Upgrade scikit-learn inside the environment of the running kernel.
# The explicit %pip magic is used instead of a bare `pip ...` line, which only
# works through IPython's automagic. Restart the kernel after an upgrade.
# TODO(review): pin an exact version (scikit-learn==X.Y.Z) for reproducibility.
%pip install --upgrade scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
from sklearn.metrics import mean_squared_error


In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# 1. Load the data
# Both splits are read from CSV files located in the current working directory.
train_data, test_data = map(pd.read_csv, ("train.csv", "test.csv"))

# The sample submission is loaded for reference; nothing in this section uses it.
# NOTE(review): the file name is spelled "Samle_Submission.csv" — presumably this
# matches the file on disk; confirm before renaming it.
sample_submission = pd.read_csv("Samle_Submission.csv")

# 2. Basic data exploration & cleaning
# List the columns available in each split.
print("Train columns:", train_data.columns)
print("Test columns:", test_data.columns)

# Count the missing entries per column in each split.
train_missing = train_data.isna().sum()
test_missing = test_data.isna().sum()
print("Missing values in train:\n", train_missing)
print("Missing values in test:\n", test_missing)

# 3. Feature Engineering

# (a) Stack train and test into a single frame so that both splits receive
#     exactly the same transformations. "is_train" records which split a row
#     came from; test rows get a dummy Purchase of 0 purely so that both frames
#     expose the same set of columns.
train_data["is_train"] = 1
test_data["is_train"] = 0
test_data["Purchase"] = 0

full_data = pd.concat((train_data, test_data))

# (b) + (d) Integer-encode the categorical columns and the two ID columns.
#     User_ID and Product_ID are kept as features (rather than dropped) and are
#     encoded the same way as the categoricals. Every value is cast to str
#     first, and each column is fitted with its own fresh LabelEncoder.
cat_cols = ["Gender", "Age", "City_Category", "Stay_In_Current_City_Years"]
id_cols = ["User_ID", "Product_ID"]

for col in cat_cols + id_cols:
    le = LabelEncoder()
    full_data[col] = le.fit_transform(full_data[col].astype(str))

# (c) Replace missing secondary product categories with the placeholder 0.
for category_col in ("Product_Category_2", "Product_Category_3"):
    full_data[category_col] = full_data[category_col].fillna(0)

# Undo the stacking: recover independent train / test frames via the marker.
train_rows = full_data["is_train"] == 1
test_rows = full_data["is_train"] == 0
train_cleaned = full_data[train_rows].copy()
test_cleaned = full_data[test_rows].copy()

# 4. Model Training
# Features are every column except the target ("Purchase") and the split marker.
X = train_cleaned.drop(["Purchase", "is_train"], axis=1)
y = train_cleaned["Purchase"].values

# Hold out 20% of the labelled rows to estimate performance locally;
# random_state fixes the shuffle so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest with 100 trees. n_jobs=-1 builds (and predicts with) the trees
# in parallel on all available cores instead of a single one; with a fixed
# random_state the fitted trees are the same as in the single-threaded run.
# NOTE: the model is fitted on the 80% training portion only.
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Score the held-out rows: RMSE is the square root of the mean squared error.
val_preds = model.predict(X_val)
mse_val = mean_squared_error(y_val, val_preds)
rmse_val = mse_val ** 0.5

print("Validation RMSE:", rmse_val)

# 5. Inference on test set
# Strip the placeholder target and the split marker, leaving the same feature
# columns that were used to build X for training.
X_test = test_cleaned.drop(columns=["Purchase", "is_train"])
test_preds = model.predict(X_test)

# 6. Create a submission matching the Sample_Submission format
#    The sample file is presumably laid out as:
#    User_ID,Product_ID,Purchase
#    1000001,P000001,9655
#    ...
#    The IDs are taken from test_data, which still holds the original
#    (un-encoded) values because the label encoding was applied to full_data.
final_submission = test_data[["User_ID", "Product_ID"]].assign(Purchase=test_preds)

# Write the predictions to disk without the DataFrame index column.
final_submission.to_csv("my_submission.csv", index=False)

print("Submission file 'my_submission.csv' created successfully.")


Train columns: Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase'],
      dtype='object')
Test columns: Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3'],
      dtype='object')
Missing values in train:
 User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64
Missing values