In [1]:
import copy
import datetime
import os
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
warnings.filterwarnings('ignore')


In [2]:

def impute_missing_categories(df, target_column, features=None):
    """
    Impute missing categories in one column by predicting them from other columns.

    Rows where ``target_column`` is present are used to train several
    classifiers (random forest, SVM, KNN). Each model is scored with 5-fold
    cross-validation, refit on all labelled rows, and used to predict the rows
    where ``target_column`` is missing. The predictions of the model with the
    highest mean cross-validation score are written into a copy of ``df``.

    Parameters:
    df: DataFrame containing data with missing values
    target_column: Name of the column with missing categories
    features: List of feature columns to use for prediction
        (defaults to every column except ``target_column``)

    Returns:
    DataFrame: Copy of the original data with imputed values
    dict: Per-model results keyed by model name, each holding ``cv_mean``,
        ``cv_std`` and ``predictions``; or
        ``{"message": "No missing values found"}`` when nothing is missing

    Raises:
    ValueError: If ``target_column`` has no non-missing value to learn from
    """
    # Work on a copy so the caller's frame is never modified
    df_copy = df.copy()

    # If features not specified, use all columns except target
    if features is None:
        features = [col for col in df.columns if col != target_column]

    # Rows with a known target are the training set; the rest get imputed
    train_mask = df[target_column].notna()

    # If no missing values, return the (copied) original data
    if train_mask.all():
        return df_copy, {"message": "No missing values found"}

    # Without a single labelled row there is nothing to fit; fail with a
    # clear message instead of an opaque error from inside scikit-learn.
    if not train_mask.any():
        raise ValueError(
            f"Column '{target_column}' has no non-missing values to learn from"
        )

    X_train = df.loc[train_mask, features].copy()
    X_missing = df.loc[~train_mask, features].copy()

    # Encode categorical features. The encoder is fitted on the values of BOTH
    # subsets: fitting on the labelled rows only makes transform() raise on any
    # category that occurs solely in the rows being imputed.
    # NOTE(review): missing values inside the feature columns are not imputed
    # here; SVC/KNN are expected to reject NaN - confirm features are complete.
    categorical_features = X_train.select_dtypes(include=['object']).columns
    for column in categorical_features:
        encoder = LabelEncoder()
        encoder.fit(pd.concat([X_train[column], X_missing[column]]))
        X_train[column] = encoder.transform(X_train[column])
        X_missing[column] = encoder.transform(X_missing[column])

    # Encode target variable (kept so predictions can be mapped back to labels)
    target_encoder = LabelEncoder()
    y_train = target_encoder.fit_transform(df.loc[train_mask, target_column])

    # Candidate models
    models = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        #'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'SVM': SVC(probability=True, random_state=42),
        'KNN': KNeighborsClassifier(n_neighbors=5)
    }

    # Train and evaluate every candidate
    results = {}
    for name, model in models.items():
        # Cross-validation score on the labelled rows
        cv_scores = cross_val_score(model, X_train, y_train, cv=5)

        # Refit on all labelled rows, then predict the missing ones
        model.fit(X_train, y_train)
        y_pred = model.predict(X_missing)

        results[name] = {
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'predictions': target_encoder.inverse_transform(y_pred)
        }

    # Pick the model with the highest mean CV score (first one wins ties)
    best_name = max(results, key=lambda model_name: results[model_name]['cv_mean'])

    # Fill missing values with predictions from the best model
    df_copy.loc[~train_mask, target_column] = results[best_name]['predictions']

    return df_copy, results

def process_both_datasets(train_df, test_df, target_column, features=None):
    """
    Run missing-category imputation on the training and test frames independently.

    Each frame is passed on its own to ``impute_missing_categories``; the
    training frame is processed first, then the test frame.

    Parameters:
    train_df: Training DataFrame
    test_df: Test DataFrame
    target_column: Name of the column with missing categories
    features: List of feature columns to use for prediction

    Returns:
    tuple: (Imputed train DataFrame, Imputed test DataFrame, Train results, Test results)
    """
    jobs = (
        ("Processing training dataset...", train_df),
        ("\nProcessing test dataset...", test_df),
    )

    outcomes = []
    for banner, frame in jobs:
        # Announce the dataset, then impute it before moving to the next one
        print(banner)
        outcomes.append(impute_missing_categories(frame, target_column, features))

    (train_imputed, train_results), (test_imputed, test_results) = outcomes
    return train_imputed, test_imputed, train_results, test_results

def print_imputation_results(train_results, test_results):
    """
    Print a per-model comparison report for the training and test imputation runs.

    For each dataset a header and separator are printed. If the results dict
    carries a "message" key only that message is shown; otherwise every
    model's cross-validation score (mean, +/- two standard deviations) and
    its number of predictions are listed.
    """
    separator = "-" * 50
    report_sections = (
        ("Training Dataset", train_results),
        ("Test Dataset", test_results),
    )

    for dataset_name, outcome in report_sections:
        print(f"\n{dataset_name} Results:")
        print(separator)

        # A "message" entry means imputation was skipped for this dataset
        has_message = isinstance(outcome, dict) and "message" in outcome
        if has_message:
            print(outcome["message"])
        else:
            for model_name, stats in outcome.items():
                print(f"\n{model_name}:")
                print(f"Cross-validation Score: {stats['cv_mean']:.4f} (+/- {stats['cv_std']*2:.4f})")
                print(f"Number of predictions made: {len(stats['predictions'])}")

In [3]:
class StickerDataset(Dataset):
    """Dataset of float32 feature rows with optional regression targets.

    With targets, indexing yields a ``(features, target)`` pair; without
    targets (e.g. at prediction time) it yields the feature tensor alone.
    """

    def __init__(self, features, targets=None):
        # Both inputs are converted to float32 tensors up front
        self.features = torch.FloatTensor(features)
        if targets is None:
            self.targets = None
        else:
            self.targets = torch.FloatTensor(targets)

    def __len__(self):
        # Number of samples = length of the leading feature dimension
        return len(self.features)

    def __getitem__(self, idx):
        sample = self.features[idx]
        if self.targets is None:
            return sample
        return sample, self.targets[idx]

class DeepStickerNet(nn.Module):
    """Fully connected regression network producing one value per sample.

    Architecture: four hidden blocks of widths 256, 128, 64 and 32, each
    ``Linear -> ReLU -> BatchNorm1d``; the first three are followed by
    dropout (0.3, 0.2, 0.1). A final ``Linear(32, 1)`` produces the output.

    Parameters:
    input_dim: Number of input features per sample
    """

    def __init__(self, input_dim):
        super().__init__()

        self.network = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.2),

            nn.Linear(128, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.1),

            nn.Linear(64, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),

            nn.Linear(32, 1)
        )

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to predictions of shape ``(batch,)``."""
        # Drop only the trailing size-1 output dimension. A bare squeeze()
        # would also remove the batch dimension when the batch holds a single
        # sample, yielding a 0-d tensor that no longer lines up with the
        # targets and cannot be iterated when collecting predictions.
        return self.network(x).squeeze(-1)

In [4]:
def prepare_data_for_nn(df, label_encoders=None, scaler=None, is_training=True):
    """Turn a raw frame into a scaled numeric feature matrix for the network.

    Categorical columns are label-encoded and the selected feature columns
    are then standardised. When ``label_encoders`` / ``scaler`` are omitted,
    new ones are fitted on ``df``; when supplied, they are only applied.

    Parameters:
    df: Input DataFrame (left unmodified; a copy is transformed)
    label_encoders: Optional dict mapping categorical column -> fitted encoder
    scaler: Optional fitted scaler
    is_training: If True, the 'Price' column is also returned as the target

    Returns:
    (X, y, label_encoders, scaler) when ``is_training`` is True,
    otherwise (X, label_encoders, scaler)
    """
    categorical_columns = ['Brand', 'Material', 'Size','Laptop Compartment', 'Waterproof', 'Style', 'Color']
    model_columns = ['Brand', 'Material', 'Size', 'Compartments', 'Laptop Compartment',
       'Waterproof', 'Style', 'Color', 'Weight Capacity (kg)']

    frame = df.copy()

    # Encode categoricals: fit new encoders only when none were handed in
    fit_new_encoders = label_encoders is None
    if fit_new_encoders:
        label_encoders = {}
    for name in categorical_columns:
        if fit_new_encoders:
            label_encoders[name] = LabelEncoder()
            frame[name] = label_encoders[name].fit_transform(frame[name])
        else:
            frame[name] = label_encoders[name].transform(frame[name])

    # Assemble the feature matrix and standardise it
    X = frame[model_columns].values
    if scaler is not None:
        X = scaler.transform(X)
    else:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)

    if not is_training:
        return X, label_encoders, scaler

    y = frame['Price'].values
    return X, y, label_encoders, scaler

In [5]:
def train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs=50):
    """Train ``model`` and return the weights of its best validation epoch.

    Every epoch runs one optimisation pass over ``train_loader`` followed by
    an evaluation pass over ``val_loader``; the mean per-batch losses are
    printed. Whenever the validation loss improves, a snapshot of the model
    state is stored.

    Parameters:
    model: Network to train (updated in place)
    train_loader: Iterable of (features, targets) training batches
    val_loader: Iterable of (features, targets) validation batches
    criterion: Loss function called as ``criterion(outputs, targets)``
    optimizer: Optimizer over ``model``'s parameters
    device: Device the batches are moved to
    epochs: Number of training epochs

    Returns:
    The ``state_dict`` snapshot taken at the epoch with the lowest validation
    loss, or None if no epoch improved on the initial ``inf`` (e.g. ``epochs=0``).
    """
    best_val_loss = float('inf')
    best_model = None

    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0
        for X_batch, y_batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}'):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation phase (no gradients, eval-mode batch norm / dropout)
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                val_loss += criterion(outputs, y_batch).item()

        # Average the accumulated per-batch losses
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)

        print(f'Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() returns references to the live parameter tensors,
            # so a shallow dict .copy() keeps changing as training continues
            # and ends up holding the final weights. Deep-copy to freeze the
            # snapshot at this epoch.
            best_model = copy.deepcopy(model.state_dict())

    return best_model

In [9]:
# Read the competition train and test splits from disk
train_data = pd.read_csv(filepath_or_buffer='playground-series-s5e2/train.csv')
test_data = pd.read_csv(filepath_or_buffer='playground-series-s5e2/test.csv')

In [10]:
# Summarise column dtypes and non-null counts to see which columns have gaps
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    300000 non-null  int64  
 1   Brand                 290295 non-null  object 
 2   Material              291653 non-null  object 
 3   Size                  293405 non-null  object 
 4   Compartments          300000 non-null  float64
 5   Laptop Compartment    292556 non-null  object 
 6   Waterproof            292950 non-null  object 
 7   Style                 292030 non-null  object 
 8   Color                 290050 non-null  object 
 9   Weight Capacity (kg)  299862 non-null  float64
 10  Price                 300000 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 25.2+ MB


In [6]:
# End-to-end run: load data, train the price network, predict the test set
# and assemble the submission frame.

# Seed the RNGs so the train/validation split, weight initialisation,
# batch shuffling and dropout are repeatable across Restart & Run All.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load data
train_data = pd.read_csv('playground-series-s5e2/train.csv')
test_data = pd.read_csv('playground-series-s5e2/test.csv')

# Fill gaps from the previous row. ffill() cannot fill a gap in the leading
# row(s) (there is no previous value), so back-fill whatever is left.
train_data = train_data.ffill().bfill()
test_data = test_data.ffill().bfill()

# Prepare data: encoders and scaler are fitted on train, then reused on test
X_train, y_train, label_encoders, scaler = prepare_data_for_nn(train_data, is_training=True)
X_test, _, _ = prepare_data_for_nn(test_data, label_encoders, scaler, is_training=False)

# Create datasets (85% train / 15% validation)
train_dataset = StickerDataset(X_train, y_train)
train_size = int(0.85 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# Initialize model
model = DeepStickerNet(input_dim=X_train.shape[1]).to(device)

# Define loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-6)

# Train model
best_model_state = train_model(model, train_loader, val_loader, criterion, optimizer, device)

# Load best model
model.load_state_dict(best_model_state)

# Make predictions
model.eval()
test_dataset = StickerDataset(X_test)
test_loader = DataLoader(test_dataset, batch_size=64)

predictions = []
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        # Flatten to 1-D so a final batch holding a single sample (which may
        # come back as a 0-d array) can still be iterated by extend().
        predictions.extend(outputs.cpu().numpy().reshape(-1))

# Create submission
submission = pd.DataFrame({
    'id': test_data['id'],
    'Price': predictions
})

Epoch 1/50: 100%|█████████████████████████████████████████████████████████████████| 3985/3985 [00:22<00:00, 176.14it/s]


Epoch 1: Train Loss = 2533.9770, Val Loss = 1521.9274


Epoch 2/50: 100%|█████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 154.28it/s]


Epoch 2: Train Loss = 1525.4779, Val Loss = 1521.5919


Epoch 3/50: 100%|█████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 155.45it/s]


Epoch 3: Train Loss = 1524.8511, Val Loss = 1523.9452


Epoch 4/50: 100%|█████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 156.46it/s]


Epoch 4: Train Loss = 1524.5991, Val Loss = 1520.6549


Epoch 5/50: 100%|█████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 154.43it/s]


Epoch 5: Train Loss = 1524.3745, Val Loss = 1522.5002


Epoch 6/50: 100%|█████████████████████████████████████████████████████████████████| 3985/3985 [00:26<00:00, 150.68it/s]


Epoch 6: Train Loss = 1524.3612, Val Loss = 1520.8489


Epoch 7/50: 100%|█████████████████████████████████████████████████████████████████| 3985/3985 [00:27<00:00, 147.16it/s]


Epoch 7: Train Loss = 1524.2212, Val Loss = 1520.8471


Epoch 8/50: 100%|█████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 157.45it/s]


Epoch 8: Train Loss = 1524.1230, Val Loss = 1522.2078


Epoch 9/50: 100%|█████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 156.34it/s]


Epoch 9: Train Loss = 1524.1096, Val Loss = 1521.0246


Epoch 10/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 153.54it/s]


Epoch 10: Train Loss = 1523.8657, Val Loss = 1520.8008


Epoch 11/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 154.46it/s]


Epoch 11: Train Loss = 1523.7621, Val Loss = 1520.7031


Epoch 12/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:26<00:00, 151.72it/s]


Epoch 12: Train Loss = 1523.5219, Val Loss = 1521.6785


Epoch 13/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 155.00it/s]


Epoch 13: Train Loss = 1523.4547, Val Loss = 1520.6493


Epoch 14/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 154.94it/s]


Epoch 14: Train Loss = 1523.4755, Val Loss = 1521.7363


Epoch 15/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:26<00:00, 152.65it/s]


Epoch 15: Train Loss = 1523.5928, Val Loss = 1520.8234


Epoch 16/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:26<00:00, 152.58it/s]


Epoch 16: Train Loss = 1523.3260, Val Loss = 1521.0946


Epoch 17/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 154.76it/s]


Epoch 17: Train Loss = 1523.4017, Val Loss = 1521.9218


Epoch 18/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 154.28it/s]


Epoch 18: Train Loss = 1523.3547, Val Loss = 1521.4220


Epoch 19/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 155.80it/s]


Epoch 19: Train Loss = 1523.0527, Val Loss = 1521.7923


Epoch 20/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:26<00:00, 151.97it/s]


Epoch 20: Train Loss = 1522.8407, Val Loss = 1521.0594


Epoch 21/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 155.73it/s]


Epoch 21: Train Loss = 1523.0605, Val Loss = 1522.0773


Epoch 22/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 153.48it/s]


Epoch 22: Train Loss = 1522.8686, Val Loss = 1521.2741


Epoch 23/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 156.34it/s]


Epoch 23: Train Loss = 1522.8911, Val Loss = 1521.2392


Epoch 24/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 155.22it/s]


Epoch 24: Train Loss = 1522.7195, Val Loss = 1521.2861


Epoch 25/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:24<00:00, 161.40it/s]


Epoch 25: Train Loss = 1522.6258, Val Loss = 1521.8224


Epoch 26/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:26<00:00, 148.74it/s]


Epoch 26: Train Loss = 1522.6420, Val Loss = 1521.1355


Epoch 27/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:25<00:00, 155.66it/s]


Epoch 27: Train Loss = 1522.4709, Val Loss = 1521.0460


Epoch 28/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:24<00:00, 164.01it/s]


Epoch 28: Train Loss = 1522.3338, Val Loss = 1521.0362


Epoch 29/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:23<00:00, 168.51it/s]


Epoch 29: Train Loss = 1522.1768, Val Loss = 1524.0080


Epoch 30/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:23<00:00, 167.12it/s]


Epoch 30: Train Loss = 1522.1996, Val Loss = 1521.6145


Epoch 31/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:24<00:00, 166.02it/s]


Epoch 31: Train Loss = 1522.2035, Val Loss = 1521.0774


Epoch 32/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:23<00:00, 168.19it/s]


Epoch 32: Train Loss = 1521.9228, Val Loss = 1522.1757


Epoch 33/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:20<00:00, 197.06it/s]


Epoch 33: Train Loss = 1522.0653, Val Loss = 1522.0417


Epoch 34/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:17<00:00, 223.55it/s]


Epoch 34: Train Loss = 1521.9598, Val Loss = 1521.0318


Epoch 35/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:18<00:00, 210.15it/s]


Epoch 35: Train Loss = 1521.9407, Val Loss = 1522.5051


Epoch 36/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:18<00:00, 218.23it/s]


Epoch 36: Train Loss = 1521.7858, Val Loss = 1522.9361


Epoch 37/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:18<00:00, 218.10it/s]


Epoch 37: Train Loss = 1521.7423, Val Loss = 1521.4333


Epoch 38/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:19<00:00, 204.19it/s]


Epoch 38: Train Loss = 1521.7751, Val Loss = 1521.9909


Epoch 39/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:18<00:00, 211.08it/s]


Epoch 39: Train Loss = 1521.5380, Val Loss = 1522.8265


Epoch 40/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:18<00:00, 216.50it/s]


Epoch 40: Train Loss = 1521.5414, Val Loss = 1522.0468


Epoch 41/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:17<00:00, 222.91it/s]


Epoch 41: Train Loss = 1520.9495, Val Loss = 1523.8479


Epoch 42/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:17<00:00, 232.63it/s]


Epoch 42: Train Loss = 1521.1682, Val Loss = 1522.3474


Epoch 43/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:17<00:00, 234.35it/s]


Epoch 43: Train Loss = 1521.1082, Val Loss = 1522.1764


Epoch 44/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:18<00:00, 217.11it/s]


Epoch 44: Train Loss = 1521.0764, Val Loss = 1521.9599


Epoch 45/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:17<00:00, 229.37it/s]


Epoch 45: Train Loss = 1521.0380, Val Loss = 1522.2785


Epoch 46/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:19<00:00, 204.08it/s]


Epoch 46: Train Loss = 1520.9861, Val Loss = 1524.1158


Epoch 47/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:21<00:00, 184.45it/s]


Epoch 47: Train Loss = 1520.7073, Val Loss = 1521.9525


Epoch 48/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:21<00:00, 185.91it/s]


Epoch 48: Train Loss = 1520.6640, Val Loss = 1522.4135


Epoch 49/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:21<00:00, 183.38it/s]


Epoch 49: Train Loss = 1520.4549, Val Loss = 1522.4629


Epoch 50/50: 100%|████████████████████████████████████████████████████████████████| 3985/3985 [00:22<00:00, 180.85it/s]


Epoch 50: Train Loss = 1520.7796, Val Loss = 1522.4254


In [8]:
# Write the submission to a timestamped CSV.
# This relies on the `datetime` module imported in the first cell; importing
# the `datetime` class here would rebind that name and shadow the module.
SUBMISSION_DIR = 'submission'
os.makedirs(SUBMISSION_DIR, exist_ok=True)  # to_csv does not create missing directories
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
file_name = f"{SUBMISSION_DIR}/deep_learning_predictions_{timestamp}.csv"
submission.to_csv(file_name, index=False)
print("Predictions saved to {}".format(file_name))

Predictions saved to submission/deep_learning_predictions_20250209_012403.csv
