In [None]:
# Kaggle API Dataset Download
# This section handles downloading and accessing the dataset for the Playground Series S5E5 competition

import os
import sys
!{sys.executable} -m pip install --upgrade pip --user
!{sys.executable} -m pip install -r requirements.txt
!{sys.executable} -m pip install cudf
!{sys.executable} -m pip install cuml

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time
import warnings
import torch
import torch.optim as optim
import cudf  # RAPIDS - GPU-accelerated pandas
import cuml  # RAPIDS - GPU-accelerated scikit-learn

from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')



# This cell detects whether we're running on Kaggle or locally
IN_KAGGLE = os.path.exists('/kaggle/input')

if IN_KAGGLE:
    # If running on Kaggle, the data is already available in the /kaggle/input directory
    print("Running on Kaggle - dataset already available")

    # Competition data paths
    BASE_DIR = '/kaggle/input/playground-series-s5e5'

else:
    # If running locally, we need to download the data via the Kaggle API
    print("Running locally - downloading data via Kaggle API")

    # First, check if kaggle module is installed
    try:
        import kaggle
    except ImportError:
        print("Kaggle API not found. Installing...")
        !pip install kaggle
        import kaggle

    # Create directory for data if it doesn't exist
    os.makedirs('kaggle_data', exist_ok=True)

    # Download competition data
    # Note: You need to have your Kaggle API credentials in ~/.kaggle/kaggle.json
    # If not already set up, run the following commands in a cell:
    """
    # Run this if you haven't set up Kaggle API credentials:
    !mkdir -p ~/.kaggle
    !echo '{"username":"YOUR_USERNAME","key":"YOUR_KEY"}' > ~/.kaggle/kaggle.json
    !chmod 600 ~/.kaggle/kaggle.json
    """

    # Download all competition files
    !kaggle competitions download -c playground-series-s5e5 -p kaggle_data

    # Unzip the downloaded files
    import zipfile
    with zipfile.ZipFile('kaggle_data/playground-series-s5e5.zip', 'r') as zip_ref:
        zip_ref.extractall('kaggle_data')

    print("Dataset downloaded successfully!")

    # Set the base directory for data access
    BASE_DIR = 'kaggle_data'

# Now let's define paths to access the files in a consistent way
# This will work both on Kaggle and locally

train_path = os.path.join(BASE_DIR, 'train.csv')
test_path = os.path.join(BASE_DIR, 'test.csv')
sample_submission_path = os.path.join(BASE_DIR, 'sample_submission.csv')

# Load the datasets
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_submission = pd.read_csv(sample_submission_path)

# Display basic information about the datasets
print("\n--- Dataset Information ---")
print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"Sample submission shape: {sample_submission.shape}")

# Display a few rows of the training data
print("\n--- First few rows of training data ---")
train_df.head()

In [None]:
# GPU-Accelerated Calorie Prediction Model Training

## 1. Select the compute device

# torch and the other libraries used below are imported in the first cell.

# Report CUDA availability, then bind `device` to the first GPU when there is
# one and to the CPU otherwise.
print(f"CUDA available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    print("GPU not available, using CPU instead")
    device = torch.device("cpu")
else:
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda:0")

## 2. Create the dataset from the provided data
# Work on a copy: the 'Sex' encoding below would otherwise mutate `train_df`
# itself, and re-running this cell would then map the already-encoded 0/1
# values to NaN.
df = train_df.copy()

## 3. Exploratory Data Analysis (EDA)
print("Dataset shape:", df.shape)
print("\nBasic statistics:")
print(df.describe())

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

# Visualize the target variable distribution
plt.figure(figsize=(10, 6))
plt.hist(df['Calories'], bins=10, edgecolor='black')
plt.title('Distribution of Calories')
plt.xlabel('Calories')
plt.ylabel('Frequency')
plt.show()

# Convert categorical to numerical first (needed for the correlation matrix
# and for model training later on).
df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})

# `.map` turns every unmapped label into NaN without complaint; fail loudly
# instead of letting NaNs propagate into scaling and training.
if df['Sex'].isnull().any():
    raise ValueError(
        "Column 'Sex' contains values other than 'male'/'female' (or missing values); "
        "cannot encode it numerically"
    )

# Correlation matrix over all columns except the row identifier
plt.figure(figsize=(12, 8))
correlation_matrix = df.drop('id', axis=1).corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.tight_layout()
plt.show()

## 4. GPU-accelerated Feature Engineering and Preprocessing
# Try the RAPIDS (cuDF/cuML) path first and fall back to pandas/scikit-learn.
# Both paths produce the same host-side outputs used by the rest of the
# notebook: X_train_np, X_test_np, y_train_np, y_test_np, scaler, feature_names.
try:
    # Imported here so that a missing/partial RAPIDS install triggers the fallback.
    from cuml.model_selection import train_test_split as gpu_train_test_split
    from cuml.preprocessing import StandardScaler as GpuStandardScaler

    # Convert pandas DataFrame to cuDF DataFrame for GPU acceleration
    gpu_df = cudf.DataFrame.from_pandas(df)
    print("Using RAPIDS cuDF for GPU acceleration")

    # Separate features and target
    X_gpu = gpu_df.drop(['id', 'Calories'], axis=1)
    y_gpu = gpu_df['Calories']

    # Split the data with cuML's own splitter, which is written for cuDF inputs
    # (scikit-learn's splitter is not guaranteed to accept device data).
    X_train_gpu, X_test_gpu, y_train_gpu, y_test_gpu = gpu_train_test_split(
        X_gpu, y_gpu, test_size=0.2, random_state=42
    )

    # Feature scaling with cuML; fit on the training split only
    scaler = GpuStandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_gpu)
    X_test_scaled = scaler.transform(X_test_gpu)

    # Convert back to numpy for the PyTorch dataset wrappers
    X_train_np = X_train_scaled.to_numpy()
    X_test_np = X_test_scaled.to_numpy()
    y_train_np = y_train_gpu.to_numpy()
    y_test_np = y_test_gpu.to_numpy()

    # Store column names for later use (same as the CPU path does)
    feature_names = X_gpu.columns

except Exception as e:
    # Deliberately broad: RAPIDS can fail in many ways besides ImportError
    # (no GPU, driver/runtime mismatch, unsupported input types). Any such
    # failure should lead to the CPU path, not abort the notebook.
    print(f"RAPIDS not available or error: {e}")
    print("Falling back to CPU preprocessing")

    # Separate features and target using regular pandas
    X = df.drop(['id', 'Calories'], axis=1)
    y = df['Calories']

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Feature scaling; fit on the training split only
    scaler = StandardScaler()
    X_train_np = scaler.fit_transform(X_train)
    X_test_np = scaler.transform(X_test)
    y_train_np = y_train.values
    y_test_np = y_test.values

    # Store column names for later use
    feature_names = X.columns

## 5. PyTorch Neural Network Model (GPU-Accelerated)

# Custom dataset class for PyTorch
class CalorieDataset(Dataset):
    """Map-style dataset that pairs a feature row with its target value.

    Both inputs are copied into float32 tensors; targets are stored as a
    column of shape (N, 1).
    """

    def __init__(self, features, targets):
        as_float32 = dict(dtype=torch.float32)
        self.features = torch.tensor(features, **as_float32)
        target_tensor = torch.tensor(targets, **as_float32)
        self.targets = target_tensor.reshape(-1, 1)

    def __len__(self):
        # Number of samples equals the number of feature rows.
        return len(self.features)

    def __getitem__(self, idx):
        sample = self.features[idx]
        label = self.targets[idx]
        return sample, label

# Wrap the scaled train/test splits as PyTorch datasets
train_dataset = CalorieDataset(features=X_train_np, targets=y_train_np)
test_dataset = CalorieDataset(features=X_test_np, targets=y_test_np)

# Mini-batch loaders: training batches are reshuffled every epoch, the
# held-out split is iterated in its stored order.
batch_size = 32
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Define neural network model
class CaloriePredictor(nn.Module):
    """Feed-forward regressor: input_dim -> 64 -> 32 -> 1.

    ReLU follows each hidden layer; dropout (p=0.2) is applied after the
    first hidden layer only. The output layer is linear.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layers are registered in this order; state_dict keys depend on the names.
        self.layer1 = nn.Linear(in_features=input_dim, out_features=64)
        self.layer2 = nn.Linear(in_features=64, out_features=32)
        self.layer3 = nn.Linear(in_features=32, out_features=1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        hidden = self.dropout(self.relu(self.layer1(x)))
        hidden = self.relu(self.layer2(hidden))
        return self.layer3(hidden)

# Initialize model, loss function, and optimizer
# Seed PyTorch's global RNG first: weight initialisation, DataLoader shuffling
# and dropout all draw from it, and the train/test split above is already
# seeded with 42. Without this every run trains a different model.
# NOTE(review): some GPU kernels may still be non-deterministic.
torch.manual_seed(42)

input_dim = X_train_np.shape[1]  # number of feature columns after preprocessing
model = CaloriePredictor(input_dim).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training function
def train_model(model, train_loader, criterion, optimizer, epochs=100):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Parameters:
    model: PyTorch module to optimise (put into training mode here)
    train_loader: Iterable yielding ``(inputs, targets)`` tensor batches
    criterion: Loss callable taking ``(outputs, targets)``
    optimizer: Optimizer bound to ``model``'s parameters
    epochs (int): Number of passes over ``train_loader``

    Returns:
    list[float]: One entry per epoch, the mean of that epoch's per-batch losses

    Raises:
    ValueError: If ``train_loader`` yields no batches during an epoch
    """
    start_time = time.time()
    model.train()
    train_losses = []

    # Send batches to wherever the model's parameters live instead of relying
    # on a notebook-global `device`, which may be stale or undefined.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        # Parameter-free model: nothing dictates a device, stay on the CPU.
        model_device = torch.device("cpu")

    for epoch in range(epochs):
        running_loss = 0.0
        batches_seen = 0  # counted manually so loaders without __len__ work too
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(model_device), targets.to(model_device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            batches_seen += 1

        if batches_seen == 0:
            raise ValueError("train_loader produced no batches; cannot train on empty data")

        epoch_loss = running_loss / batches_seen
        train_losses.append(epoch_loss)

        # Progress report every 10th epoch
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}')

    training_time = time.time() - start_time
    print(f'Training completed in {training_time:.2f} seconds')
    return train_losses

# Evaluation function
def evaluate_model(model, data_loader, criterion):
    """Run ``model`` over ``data_loader`` without gradients and compute metrics.

    The model is switched to evaluation mode and is left in that mode.

    Parameters:
    model: PyTorch module to evaluate
    data_loader: Iterable yielding ``(inputs, targets)`` tensor batches
    criterion: Loss callable taking ``(outputs, targets)``

    Returns:
    dict: ``predictions`` and ``actuals`` (flat lists of floats), ``mse``,
    ``rmse``, ``mae``, ``r2`` computed over all samples, and ``avg_loss``,
    the mean of the per-batch criterion values.

    Raises:
    ValueError: If ``data_loader`` yields no batches
    """
    model.eval()
    predictions = []
    actuals = []
    running_loss = 0.0
    batches_seen = 0  # counted manually so loaders without __len__ work too

    # Send batches to wherever the model's parameters live instead of relying
    # on a notebook-global `device`.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device("cpu")

    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(model_device), targets.to(model_device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            running_loss += loss.item()
            batches_seen += 1

            # Move predictions and actuals to CPU for numpy conversion
            pred = outputs.cpu().numpy()
            actual = targets.cpu().numpy()

            predictions.extend(pred.flatten().tolist())
            actuals.extend(actual.flatten().tolist())

    if batches_seen == 0:
        raise ValueError("data_loader produced no batches; nothing to evaluate")

    # Calculate metrics over every collected sample
    mse = mean_squared_error(actuals, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actuals, predictions)
    r2 = r2_score(actuals, predictions)

    return {
        'predictions': predictions,
        'actuals': actuals,
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'r2': r2,
        'avg_loss': running_loss / batches_seen
    }

## 6. Train the PyTorch model
print("\nTraining PyTorch Neural Network on GPU...")
train_losses = train_model(model, train_loader, criterion, optimizer, epochs=100)

# Learning curve: mean training loss per epoch
plt.figure(figsize=(10, 6))
plt.plot(train_losses)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss Over Time')
plt.grid(True)
plt.show()

## 7. Evaluate the PyTorch model
print("\nEvaluating model performance...")
train_results = evaluate_model(model, train_loader, criterion)
test_results = evaluate_model(model, test_loader, criterion)

# Metrics on the data the model was fitted on
print(
    "\nTraining Results:",
    f"MSE: {train_results['mse']:.2f}",
    f"RMSE: {train_results['rmse']:.2f}",
    f"MAE: {train_results['mae']:.2f}",
    f"R2 Score: {train_results['r2']:.2f}",
    sep="\n",
)

# Metrics on the held-out split
print(
    "\nTesting Results:",
    f"MSE: {test_results['mse']:.2f}",
    f"RMSE: {test_results['rmse']:.2f}",
    f"MAE: {test_results['mae']:.2f}",
    f"R2 Score: {test_results['r2']:.2f}",
    sep="\n",
)

## 8. Visualize predictions
def plot_actual_vs_predicted(actuals, predictions, title):
    """Scatter predictions against actual values with a dashed y = x reference line.

    Parameters:
    actuals: Sequence of true target values (x axis)
    predictions: Sequence of predicted values (y axis)
    title (str): Figure title
    """
    plt.figure(figsize=(10, 6))
    plt.scatter(actuals, predictions)

    # Perfect-prediction diagonal spanning the range of the actual values
    lowest, highest = min(actuals), max(actuals)
    diagonal = [lowest, highest]
    plt.plot(diagonal, diagonal, 'r--')

    plt.title(title)
    plt.xlabel('Actual Calories')
    plt.ylabel('Predicted Calories')
    plt.grid(True)
    plt.show()

# Held-out split: how close do predictions fall to the diagonal?
plot_actual_vs_predicted(
    actuals=test_results['actuals'],
    predictions=test_results['predictions'],
    title='Actual vs Predicted Calories (GPU PyTorch Model)',
)

## 9. Feature Importance Analysis with GPU-Accelerated RAPIDS Random Forest (optional)
# Best-effort section: it needs the GPU-path variables (X_train_scaled,
# y_train_gpu, X_gpu) and a cuML build whose random forest exposes
# `feature_importances_`; otherwise it is skipped.
try:
    print("\nCalculating feature importance with RAPIDS Random Forest...")
    from cuml.ensemble import RandomForestRegressor as cuRFR

    # Train a GPU-accelerated Random Forest
    rf_model = cuRFR(n_estimators=100, max_depth=10, random_state=42)
    rf_model.fit(X_train_scaled, y_train_gpu)

    # Get feature importance
    importances = rf_model.feature_importances_
    feature_names = X_gpu.columns

    # Bring the importances to a host NumPy array whatever container cuML
    # returned: cuDF/pandas objects offer `to_numpy()`, CuPy arrays offer
    # `get()`, and NumPy arrays pass straight through.
    if hasattr(importances, 'to_numpy'):
        importances = importances.to_numpy()
    elif hasattr(importances, 'get'):
        importances = importances.get()
    importances = np.asarray(importances, dtype=float)

    # Sort features by importance (descending)
    indices = np.argsort(importances)[::-1]

    plt.figure(figsize=(10, 6))
    plt.title('Feature Importance - RAPIDS Random Forest')
    plt.bar(range(X_train_scaled.shape[1]), importances[indices], align='center')
    plt.xticks(range(X_train_scaled.shape[1]), [feature_names[i] for i in indices], rotation=90)
    plt.tight_layout()
    plt.show()

    print("\nFeature Importance:")
    for i in indices:
        print(f"{feature_names[i]}: {importances[i]:.4f}")

except (ImportError, AttributeError, NameError) as e:
    # ImportError: cuML missing; AttributeError: estimator lacks importances
    # (or cuml is None); NameError: the CPU preprocessing path was taken.
    print(f"Could not run RAPIDS Random Forest: {e}")
    print("Skipping feature importance analysis")

## 10. Function to predict calories for new data using the PyTorch model
def predict_calories(data_dict, model, scaler, device):
    """
    Predict calories for a single new record using the PyTorch model.

    Parameters:
    data_dict (dict): One record; keys are feature names. ``'Sex'`` may be the
        string ``'male'``/``'female'`` or an already-encoded number.
    model: Trained PyTorch model (switched to evaluation mode here)
    scaler: Fitted scaler exposing ``transform``
    device: Device to run inference on

    Returns:
    float: Predicted calories

    Raises:
    ValueError: If ``'Sex'`` is a string other than ``'male'``/``'female'``,
        or if a feature the scaler was fitted on is missing from ``data_dict``
    """
    # Convert to a one-row DataFrame
    new_data = pd.DataFrame([data_dict])

    # Convert categorical to numerical. A plain `.map` would silently turn an
    # unknown label into NaN and yield a NaN prediction, so reject it instead.
    if 'Sex' in new_data.columns:
        sex_codes = {'male': 1, 'female': 0}
        raw_sex = new_data['Sex'].iloc[0]
        if isinstance(raw_sex, str):
            if raw_sex not in sex_codes:
                raise ValueError(
                    f"Unknown value for 'Sex': {raw_sex!r} (expected 'male' or 'female')"
                )
            new_data['Sex'] = sex_codes[raw_sex]
        # Non-string values are left untouched (treated as already encoded).

    # Scaling is positional, so line the columns up with the order the scaler
    # was fitted on when it recorded one; dict key order is not a safe proxy.
    fitted_columns = getattr(scaler, 'feature_names_in_', None)
    if fitted_columns is not None:
        fitted_columns = list(fitted_columns)
        missing = [name for name in fitted_columns if name not in new_data.columns]
        if missing:
            raise ValueError(f"Missing feature(s) required by the scaler: {missing}")
        new_data = new_data[fitted_columns]

    try:
        # Try to use cuDF for GPU acceleration
        new_data_gpu = cudf.DataFrame.from_pandas(new_data)
        new_data_scaled = scaler.transform(new_data_gpu)
        new_data_np = new_data_scaled.to_numpy()
    except (ImportError, AttributeError, NameError, TypeError):
        # Fall back to CPU. TypeError covers a CPU scaler being handed a
        # device frame it cannot convert implicitly.
        new_data_scaled = scaler.transform(new_data)
        new_data_np = new_data_scaled

    # Normalise to a float32 ndarray regardless of what the scaler returned
    new_data_np = np.asarray(new_data_np, dtype=np.float32)

    # Convert to PyTorch tensor
    new_data_tensor = torch.tensor(new_data_np, dtype=torch.float32).to(device)

    # Set model to evaluation mode (disables dropout)
    model.eval()

    # Make prediction; `.item()` requires exactly one output value
    with torch.no_grad():
        prediction = model(new_data_tensor).item()

    return prediction

# Example usage: one hand-written record, scored with the trained model
example_data = dict(
    Sex='male',
    Age=30,
    Height=175,
    Weight=70,
    Duration=20,
    Heart_Rate=95,
    Body_Temp=40,
)

predicted_calories = predict_calories(
    data_dict=example_data, model=model, scaler=scaler, device=device
)
print(f"\nPredicted calories for example data: {predicted_calories:.2f}")

## 11. Save the model (optional)
# Checkpoint holds the network weights together with the optimizer state
torch.save(
    dict(
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
    ),
    'calorie_prediction_model_gpu.pt',
)

print("\nModel saved successfully as 'calorie_prediction_model_gpu.pt'")

# Persist the fitted scaler as well. This is best-effort: a RAPIDS scaler may
# not serialise, in which case the reason is reported and the notebook continues.
try:
    import joblib
    joblib.dump(scaler, 'scaler_gpu.pkl')
except Exception as save_error:
    print(f"Could not save scaler: {save_error}")
else:
    print("Scaler saved successfully!")

## 12. Conclusion
# Run summary: model type, data size, hardware and held-out R2
print(
    "\nGPU-Accelerated Model Training Summary:",
    "Model: PyTorch Neural Network",
    f"Sample size: {len(df)} records",
    f"GPU used: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None (CPU used)'}",
    f"Model performance (R2 Score): {test_results['r2']:.2f}",
    sep="\n",
)

# Static discussion points
print(
    "\nAdvantages of GPU implementation:",
    "- Faster training, especially with larger datasets",
    "- Potential for more complex models",
    "- Efficient batch processing",
    sep="\n",
)
print(
    "\nLimitations:",
    "- Requires compatible hardware",
    "- Some libraries may need specific versions/configurations",
    "- Small datasets may not show significant speedup",
    sep="\n",
)