In [None]:
# Kaggle API Dataset Download
# This section handles downloading and accessing the dataset for the Playground Series S5E5 competition

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# This cell detects whether we're running on Kaggle or locally
IN_KAGGLE = os.path.exists('/kaggle/input')

if IN_KAGGLE:
    # If running on Kaggle, the data is already available in the /kaggle/input directory
    print("Running on Kaggle - dataset already available")

    # Competition data paths
    BASE_DIR = '/kaggle/input/playground-series-s5e5'

else:
    # If running locally, we need to download the data via the Kaggle API
    print("Running locally - downloading data via Kaggle API")

    # First, check if kaggle module is installed
    try:
        import kaggle
    except ImportError:
        print("Kaggle API not found. Installing...")
        !pip install kaggle
        import kaggle

    # Create directory for data if it doesn't exist
    os.makedirs('kaggle_data', exist_ok=True)

    # Download competition data
    # Note: You need to have your Kaggle API credentials in ~/.kaggle/kaggle.json
    # If not already set up, run the following commands in a cell:
    """
    # Run this if you haven't set up Kaggle API credentials:
    !mkdir -p ~/.kaggle
    !echo '{"username":"YOUR_USERNAME","key":"YOUR_KEY"}' > ~/.kaggle/kaggle.json
    !chmod 600 ~/.kaggle/kaggle.json
    """

    # Download all competition files
    !kaggle competitions download -c playground-series-s5e5 -p kaggle_data

    # Unzip the downloaded files
    import zipfile
    with zipfile.ZipFile('kaggle_data/playground-series-s5e5.zip', 'r') as zip_ref:
        zip_ref.extractall('kaggle_data')

    print("Dataset downloaded successfully!")

    # Set the base directory for data access
    BASE_DIR = 'kaggle_data'

# Build the three CSV paths from BASE_DIR so the rest of the cell is identical
# on Kaggle and on a local machine.
train_path, test_path, sample_submission_path = (
    os.path.join(BASE_DIR, csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Read each CSV into its own DataFrame
train_df, test_df, sample_submission = map(
    pd.read_csv, (train_path, test_path, sample_submission_path)
)

# Report the size of every frame
print("\n--- Dataset Information ---")
print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"Sample submission shape: {sample_submission.shape}")

# Preview the training data (last expression, so the notebook renders it)
print("\n--- First few rows of training data ---")
train_df.head()

In [None]:
# Calorie Prediction Model Training

## 1. Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

## 2. Create the dataset from the provided data

# Work on a copy so the raw `train_df` is never mutated. This also keeps the
# cell idempotent: re-running it re-derives `df` from the raw frame instead of
# re-mapping an already-encoded 'Sex' column (which would turn it into NaN).
df = train_df.copy()

## 3. Exploratory Data Analysis (EDA)
print("Dataset shape:", df.shape)
print("\nDataset information:")
# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that would emit a stray "None" line).
df.info()
print("\nBasic statistics:")
print(df.describe())

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

# Visualize the target variable distribution
plt.figure(figsize=(10, 6))
plt.hist(df['Calories'], bins=10, edgecolor='black')
plt.title('Distribution of Calories')
plt.xlabel('Calories')
plt.ylabel('Frequency')
plt.show()

# Convert categorical to numerical first (values other than 'male'/'female' become NaN)
df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})

# Correlation matrix of every column except the row identifier
plt.figure(figsize=(12, 8))
correlation_matrix = df.drop('id', axis=1).corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.tight_layout()
plt.show()

# Pairplot to visualize relationships between features.
# sns.pairplot builds its own figure, so no plt.figure() call precedes it
# (one would only leave an empty extra figure in the output).
# NOTE(review): pairplot draws every row; if the training set is large this
# can be slow - consider plotting a random sample. Confirm against data size.
sns.pairplot(df[['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'Calories']])
plt.suptitle('Pairplot of Numerical Features', y=1.02)
plt.show()

## 4. Feature Engineering and Preprocessing
# Predictors are every column except the row identifier and the target
X = df.drop(columns=['id', 'Calories'])
y = df['Calories']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: statistics are learned from the training split
# only and then applied unchanged to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 5. Build and Evaluate Different Models
# Candidate regressors, registered in the order they are trained and reported
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42)),
])

# Hold-out metrics per candidate, keyed by the same display names
results = {}

for name, model in models.items():
    # fit() hands back the estimator, so training and prediction are chained
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    # Score the predictions on the held-out split
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Keep the numbers for the model selection step
    results[name] = dict(MSE=mse, RMSE=rmse, MAE=mae, R2=r2)

    print(f"\n{name} Results:")
    print(f"MSE: {mse:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R2 Score: {r2:.2f}")

## 6. Hyperparameter Tuning for the Best Model
# The winner is the candidate with the highest hold-out R2
best_model_name = max(results, key=lambda candidate: results[candidate]['R2'])
print(f"\nThe best model based on R2 score is: {best_model_name}")

# Per tunable candidate: estimator class, constructor arguments, search grid
tuning_setups = {
    'Random Forest': (
        RandomForestRegressor,
        {'random_state': 42},
        {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        },
    ),
    'Gradient Boosting': (
        GradientBoostingRegressor,
        {'random_state': 42},
        {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'subsample': [0.8, 0.9, 1.0]
        },
    ),
    'Ridge Regression': (
        Ridge,
        {},
        {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]},
    ),
    'Lasso Regression': (
        Lasso,
        {},
        {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]},
    ),
}

if best_model_name in tuning_setups:
    estimator_cls, init_kwargs, param_grid = tuning_setups[best_model_name]
    # A fresh, unfitted estimator for the grid search to clone
    model = estimator_cls(**init_kwargs)
else:  # Linear Regression has no hyperparameters to tune
    print("Linear Regression has no hyperparameters to tune.")

if best_model_name != 'Linear Regression':
    # 3-fold cross-validated exhaustive search on the training split, scored by R2
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=3,
        scoring='r2',
        n_jobs=-1,
    )
    grid_search.fit(X_train_scaled, y_train)

    print(f"\nBest hyperparameters for {best_model_name}:")
    print(grid_search.best_params_)

    # Score the refitted best estimator on the held-out split
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test_scaled)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\nTuned {best_model_name} Results:")
    print(f"MSE: {mse:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R2 Score: {r2:.2f}")

## 7. Feature Importance Analysis
# Only the tree ensembles expose feature_importances_, so other winners are skipped.
if best_model_name in ['Random Forest', 'Gradient Boosting']:
    # Both ensembles always go through the grid search above, so the tuned
    # estimator is the one to inspect (the former Linear Regression fallback
    # here could never be reached inside this guard).
    best_model = grid_search.best_estimator_

    # Get feature importance
    importances = best_model.feature_importances_
    feature_names = X.columns

    # Sort features by importance, most important first
    indices = np.argsort(importances)[::-1]

    plt.figure(figsize=(10, 6))
    plt.title(f'Feature Importance - {best_model_name}')
    plt.bar(range(X.shape[1]), importances[indices], align='center')
    plt.xticks(range(X.shape[1]), [feature_names[i] for i in indices], rotation=90)
    plt.tight_layout()
    plt.show()

    print("\nFeature Importance:")
    for i in indices:
        print(f"{feature_names[i]}: {importances[i]:.4f}")

## 8. Final Model and Predictions
# Pick the estimator to ship: the tuned grid-search winner, or a plain
# LinearRegression when that was the best candidate (it is never tuned)
final_model = (
    LinearRegression()
    if best_model_name == 'Linear Regression'
    else grid_search.best_estimator_
)

# Re-fit the shared scaler on every row, then train the final model on the
# fully scaled dataset
X_scaled = scaler.fit(X).transform(X)
final_model.fit(X_scaled, y)

# Function to predict calories for new data
def predict_calories(data_dict):
    """
    Predict calories for a single new observation.

    Parameters:
    data_dict (dict): Dictionary with keys as feature names and values as feature values.
                     Every column of the training feature frame ``X`` must be present;
                     keys may be given in any order and extra keys are ignored.
                     'Sex' must be 'male' or 'female'.
                     Example: {'Sex': 'male', 'Age': 30, 'Height': 175, 'Weight': 70,
                              'Duration': 20, 'Heart_Rate': 95, 'Body_Temp': 40}

    Returns:
    float: Predicted calories

    Raises:
    ValueError: If a required feature is missing or 'Sex' holds an unrecognised value.
    """
    # Convert to a single-row DataFrame
    new_data = pd.DataFrame([data_dict])

    # The scaler and the final model were fitted on the columns of X in that
    # exact order, so the input must be validated against it and aligned to it.
    feature_order = list(X.columns)
    missing = [name for name in feature_order if name not in new_data.columns]
    if missing:
        raise ValueError(f"Missing feature(s) for prediction: {', '.join(missing)}")

    # Convert categorical to numerical, refusing values the mapping does not
    # know instead of silently feeding NaN to the model
    if 'Sex' in new_data.columns:
        encoded_sex = new_data['Sex'].map({'male': 1, 'female': 0})
        if encoded_sex.isna().any():
            raise ValueError(
                f"Unrecognised 'Sex' value {data_dict['Sex']!r}; expected 'male' or 'female'"
            )
        new_data['Sex'] = encoded_sex

    # Align to the training column order (this also drops any extra keys)
    new_data = new_data.reindex(columns=feature_order)

    # Scale the features
    new_data_scaled = scaler.transform(new_data)

    # Make prediction
    prediction = final_model.predict(new_data_scaled)[0]

    return prediction

# Example usage: one hand-written observation run through the final model
example_data = dict(
    Sex='male',
    Age=30,
    Height=175,
    Weight=70,
    Duration=20,
    Heart_Rate=95,
    Body_Temp=40,
)

predicted_calories = predict_calories(example_data)
print(f"\nPredicted calories for example data: {predicted_calories:.2f}")

## 9. Model Evaluation on Training Data
# Create a function to visualize actual vs predicted values
def plot_actual_vs_predicted(y_true, y_pred, title):
    """
    Scatter predictions against actual values with a dashed red identity line.

    Parameters:
    y_true: Observed target values (x-axis); also defines the span of the identity line.
    y_pred: Predicted values paired with y_true (y-axis).
    title (str): Title placed on the figure.

    The figure is shown immediately via plt.show(); nothing is returned.
    """
    plt.figure(figsize=(10, 6))
    plt.scatter(y_true, y_pred)

    # Points on the dashed diagonal are perfect predictions
    lowest, highest = min(y_true), max(y_true)
    plt.plot([lowest, highest], [lowest, highest], 'r--')

    for apply_text, text in (
        (plt.xlabel, 'Actual'),
        (plt.ylabel, 'Predicted'),
        (plt.title, title),
    ):
        apply_text(text)

    plt.show()

# Predict every row the final model was trained on
y_pred_all = final_model.predict(X_scaled)

# Compare predictions with the true targets visually
plot_actual_vs_predicted(y, y_pred_all, 'Actual vs Predicted Calories')

# Same four metrics as before, this time over the entire dataset
mse_all = mean_squared_error(y_true=y, y_pred=y_pred_all)
rmse_all = np.sqrt(mse_all)
mae_all = mean_absolute_error(y_true=y, y_pred=y_pred_all)
r2_all = r2_score(y_true=y, y_pred=y_pred_all)

print("\nFinal Model Evaluation on All Data:")
print(f"MSE: {mse_all:.2f}")
print(f"RMSE: {rmse_all:.2f}")
print(f"MAE: {mae_all:.2f}")
print(f"R2 Score: {r2_all:.2f}")

## 10. Save the model (optional)
# The snippet below is wrapped in a triple-quoted string, so it does nothing
# when the cell runs. To save the fitted model and scaler for future use,
# remove the surrounding ''' quotes; the snippet then writes
# calorie_prediction_model.pkl and scaler.pkl via joblib.dump.
'''
import joblib
joblib.dump(final_model, 'calorie_prediction_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
print("Model and scaler saved successfully!")
'''

## 11. Conclusion
print("\nModel Training Summary:")
print(f"Best model: {best_model_name}")
print(f"Sample size: {len(df)} records")
print(f"Features used: {', '.join(X.columns)}")
print("Target variable: Calories")
# r2_all is computed on the same rows the final model was fitted on, so it is
# an in-sample score and is labelled as such; the hold-out R2 recorded during
# model comparison is printed next to it as the more honest estimate.
print(f"Model performance (in-sample R2 Score on all training data): {r2_all:.2f}")
print(f"Hold-out R2 Score (untuned {best_model_name}, 20% split): {results[best_model_name]['R2']:.2f}")
print("\nLimitations:")
print("- Small dataset size may limit model generalization")
print("- More features might be needed for better accuracy")
print("- Additional data collection recommended for improved predictions")