In [None]:
# =============================================================================
# KAGGLE PLAYGROUND SERIES S3E9 - CONCRETE STRENGTH PREDICTION
# =============================================================================
# This notebook implements a machine learning pipeline to predict concrete 
# compressive strength based on various material properties and mix ratios.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Global look-and-feel for every figure produced later in the notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Read the three competition CSVs from the local data/ directory in one pass:
# the labelled training rows, the unlabelled test rows, and the submission
# template.
print("Loading datasets...")
train, test, submission = [
    pd.read_csv(csv_path)
    for csv_path in (
        "data/train.csv",
        "data/test.csv",
        "data/sample_submission.csv",
    )
]

# Quick sanity check on (rows, columns) of each frame.
print(
    f"Training set shape: {train.shape}",
    f"Test set shape: {test.shape}",
    f"Submission template shape: {submission.shape}",
    sep="\n",
)

In [None]:
# =============================================================================
# EXPLORATORY DATA ANALYSIS (EDA)
# =============================================================================

banner = "=" * 50
print(banner, "DATASET OVERVIEW", banner, sep="\n")

# --- 1-4: structure, sample rows, dtypes and summary statistics --------------
print("\n1. TRAINING DATA INFO:")
print(f"Shape: {train.shape}")
print(f"Columns: {train.columns.tolist()}")

print("\n2. FIRST FEW ROWS:")
display(train.head())

print("\n3. DATA TYPES:")
print(train.dtypes)

print("\n4. STATISTICAL SUMMARY:")
display(train.describe())

# --- 5: per-column null counts and percentages, worst offenders first --------
print("\n5. MISSING VALUES ANALYSIS:")
missing = train.isnull().sum()
missing_pct = missing.div(len(train)).mul(100)
missing_df = pd.DataFrame(
    {
        'Missing Count': missing,
        'Missing Percentage': missing_pct,
    }
).sort_values(by='Missing Count', ascending=False)

# Only columns that actually contain nulls are reported.
has_missing = missing_df['Missing Count'] > 0
print(missing_df.loc[has_missing])

# --- 6: range and spread of the regression target ----------------------------
print("\n6. TARGET VARIABLE ANALYSIS:")
target_values = train['target']
print("Target column: 'target'")
print(f"Target range: {target_values.min():.2f} - {target_values.max():.2f}")
print(f"Target mean: {target_values.mean():.2f}")
print(f"Target std: {target_values.std():.2f}")

# Side-by-side view of the target: histogram with KDE (left), box plot (right).
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

sns.histplot(target_values, kde=True, bins=50, ax=hist_ax)
hist_ax.set_title("Target Distribution", fontsize=14, fontweight='bold')
hist_ax.set_xlabel("Target Value")
hist_ax.set_ylabel("Frequency")

sns.boxplot(y=target_values, ax=box_ax)
box_ax.set_title("Target Box Plot", fontsize=14, fontweight='bold')
box_ax.set_ylabel("Target Value")

fig.tight_layout()
plt.show()

In [None]:
# =============================================================================
# BASELINE MODEL TRAINING
# =============================================================================

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

print("=" * 50)
print("BASELINE MODEL TRAINING")
print("=" * 50)

# Baseline configuration, kept in one place so the printed summary below can
# never drift from the values actually passed to scikit-learn.
BASELINE_SEED = 42
BASELINE_N_ESTIMATORS = 100
BASELINE_VAL_FRACTION = 0.2
# Largest tolerated amount (in target units) by which validation RMSE may
# exceed training RMSE before the overfitting warning is printed.
OVERFIT_TOLERANCE = 0.1


def _regression_metrics(y_true, y_pred):
    """Return ``(rmse, mae, r2)`` for one set of predictions.

    RMSE is computed as ``sqrt(MSE)`` instead of passing ``squared=False`` to
    ``mean_squared_error``: that keyword was deprecated in scikit-learn 1.4
    and removed in 1.6, where it raises ``TypeError``. Taking the square root
    explicitly works on every scikit-learn version.
    """
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return rmse, mae, r2


# Prepare features and target
print("1. PREPARING DATA...")
# NOTE(review): the public S3E9 files are believed to name the target column
# 'Strength' rather than 'target' -- confirm that data/train.csv really has a
# 'target' column, otherwise the next two lines raise KeyError.
X = train.drop(columns=["target", "id"])  # Remove target and ID columns
y = train["target"]

print(f"Feature columns: {list(X.columns)}")
print(f"Number of features: {X.shape[1]}")

# Select only numeric features for baseline
X_numeric = X.select_dtypes(include=[np.number])
print(f"Numeric features selected: {X_numeric.shape[1]}")

# Train-validation split (plain random split; stratification does not apply
# to a continuous target, so the default stratify=None is used).
print("\n2. SPLITTING DATA...")
X_train, X_val, y_train, y_val = train_test_split(
    X_numeric, y,
    test_size=BASELINE_VAL_FRACTION,
    random_state=BASELINE_SEED,
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")

# Initialize and train baseline model
print("\n3. TRAINING BASELINE MODEL...")
print("Model: Random Forest Regressor")
print(f"Parameters: n_estimators={BASELINE_N_ESTIMATORS}, random_state={BASELINE_SEED}")

model = RandomForestRegressor(
    n_estimators=BASELINE_N_ESTIMATORS,
    random_state=BASELINE_SEED,
    n_jobs=-1  # Use all available cores
)

# Train the model
model.fit(X_train, y_train)

# Make predictions
print("\n4. MAKING PREDICTIONS...")
train_preds = model.predict(X_train)
val_preds = model.predict(X_val)

# Calculate metrics
print("\n5. EVALUATION METRICS:")
print("-" * 30)

# Training metrics
train_rmse, train_mae, train_r2 = _regression_metrics(y_train, train_preds)

print("TRAINING SET:")
print(f"  RMSE: {train_rmse:.4f}")
print(f"  MAE:  {train_mae:.4f}")
print(f"  R²:   {train_r2:.4f}")

# Validation metrics
val_rmse, val_mae, val_r2 = _regression_metrics(y_val, val_preds)

print("\nVALIDATION SET:")
print(f"  RMSE: {val_rmse:.4f}")
print(f"  MAE:  {val_mae:.4f}")
print(f"  R²:   {val_r2:.4f}")

# Check for overfitting.
# `overfitting` keeps its original definition (train minus validation RMSE) so
# the printed label stays accurate. An overfit model has a LOWER training
# error than validation error, which makes this difference NEGATIVE, so the
# warning must fire when it drops below -OVERFIT_TOLERANCE. (The previous
# `> 0.1` test could never trigger in the overfitting case.)
overfitting = train_rmse - val_rmse
print(f"\nOverfitting check (Train RMSE - Val RMSE): {overfitting:.4f}")
if overfitting < -OVERFIT_TOLERANCE:
    print("⚠️  Warning: Potential overfitting detected!")
else:
    print("✅ Model appears to generalize well")

# Feature importance (impurity-based, as exposed by the fitted forest)
print("\n6. FEATURE IMPORTANCE (Top 10):")
feature_importance = pd.DataFrame({
    'feature': X_numeric.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance.head(10))

In [None]:
# =============================================================================
# NEXT STEPS FOR IMPROVEMENT
# =============================================================================

# Follow-up ideas, grouped by theme. Kept as data (heading -> bullet items) so
# the list can be edited without touching the printing logic below.
NEXT_STEPS = {
    "Feature Engineering": [
        "Create interaction features",
        "Polynomial features",
        "Domain-specific ratios",
    ],
    "Model Improvements": [
        "Try XGBoost, LightGBM, or CatBoost",  # fixed typo: was "CattBoost"
        "Hyperparameter tuning",
        "Ensemble methods",
    ],
    "Advanced Techniques": [
        "Cross-validation",
        "Feature selection",
        "Outlier detection and handling",
    ],
    "Data Analysis": [
        "Correlation analysis",
        "Feature distributions",
        "Target vs feature relationships",
    ],
}

print("POTENTIAL IMPROVEMENTS TO TRY:")
print("=" * 40)
for section_number, (heading, ideas) in enumerate(NEXT_STEPS.items(), start=1):
    # Blank line between sections, but not before the first one.
    if section_number > 1:
        print()
    print(f"{section_number}. {heading}:")
    for idea in ideas:
        print(f"   - {idea}")