In [1]:
# =============================================================================
# KAGGLE PLAYGROUND SERIES S3E9 - CONCRETE STRENGTH PREDICTION
# =============================================================================
# This notebook implements a machine learning pipeline to predict concrete 
# compressive strength based on various material properties and mix ratios.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Load datasets
print("Loading datasets...")
train = pd.read_csv("data/train.csv")  # Changed to local path
test = pd.read_csv("data/test.csv")    # Changed to local path
submission = pd.read_csv("data/sample_submission.csv")

print(f"Training set shape: {train.shape}")
print(f"Test set shape: {test.shape}")
print(f"Submission template shape: {submission.shape}")

Loading datasets...
Training set shape: (250000, 102)
Test set shape: (150000, 101)
Submission template shape: (150000, 2)


In [2]:
# =============================================================================
# EXPLORATORY DATA ANALYSIS (EDA)
# =============================================================================

print("=" * 50)
print("DATASET OVERVIEW")
print("=" * 50)

# Basic dataset information
print("\n1. TRAINING DATA INFO:")
print(f"Shape: {train.shape}")
print(f"Columns: {list(train.columns)}")

print("\n2. FIRST FEW ROWS:")
display(train.head())

print("\n3. DATA TYPES:")
print(train.dtypes)

print("\n4. STATISTICAL SUMMARY:")
display(train.describe())

# Missing values analysis
print("\n5. MISSING VALUES ANALYSIS:")
missing = train.isnull().sum()
missing_pct = (missing / len(train)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Missing Percentage': missing_pct
}).sort_values('Missing Count', ascending=False)

print(missing_df[missing_df['Missing Count'] > 0])

# Target variable analysis
print("\n6. TARGET VARIABLE ANALYSIS:")
print(f"Target column: 'target'")
print(f"Target range: {train['target'].min():.2f} - {train['target'].max():.2f}")
print(f"Target mean: {train['target'].mean():.2f}")
print(f"Target std: {train['target'].std():.2f}")

# Visualize target distribution
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
sns.histplot(train['target'], kde=True, bins=50)
plt.title("Target Distribution", fontsize=14, fontweight='bold')
plt.xlabel("Target Value")
plt.ylabel("Frequency")

plt.subplot(1, 2, 2)
sns.boxplot(y=train['target'])
plt.title("Target Box Plot", fontsize=14, fontweight='bold')
plt.ylabel("Target Value")

plt.tight_layout()
plt.show()

DATASET OVERVIEW

1. TRAINING DATA INFO:
Shape: (250000, 102)
Columns: ['id', 'f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90', 'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'loss']

2. FIRST FEW ROWS:


Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,loss
0,0,-0.00235,59,0.766739,-1.35046,42.2727,16.6857,30.3599,1.2673,0.392007,...,-42.4399,26.854,1.45751,0.696161,0.941764,1.82847,0.92409,2.29658,10.4898,15
1,1,0.784462,145,-0.463845,-0.530421,27324.9,3.47545,160.498,0.828007,3.73586,...,-184.132,7.90137,1.70644,-0.494699,-2.0583,0.819184,0.439152,2.3647,1.14383,3
2,2,0.317816,19,-0.432571,-0.382644,1383.26,19.7129,31.1026,-0.515354,34.4308,...,7.43721,37.2181,3.25339,0.337934,0.615037,2.21676,0.745268,1.69679,12.3055,6
3,3,0.210753,17,-0.616454,0.946362,-119.253,4.08235,185.257,1.38331,-47.5214,...,9.66778,0.626942,1.49425,0.517513,-10.2221,2.62731,0.61727,1.45645,10.0288,2
4,4,0.439671,20,0.968126,-0.092546,74.302,12.3065,72.186,-0.233964,24.3991,...,290.657,15.6043,1.73557,-0.476668,1.39019,2.19574,0.826987,1.78485,7.07197,1



3. DATA TYPES:
id        int64
f0      float64
f1        int64
f2      float64
f3      float64
         ...   
f96     float64
f97     float64
f98     float64
f99     float64
loss      int64
Length: 102, dtype: object

4. STATISTICAL SUMMARY:


Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,loss
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,...,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,124999.5,0.511213,51.378476,0.107155,0.05001,3595.133426,8.205953,164.508753,0.375533,16.669745,...,4856.812768,22.5791,2.030554,0.079692,1.555097,2.417556,0.537484,1.5769,8.048805,6.81392
std,72168.927986,0.307884,42.396636,1.3222,0.792368,6072.401061,5.475723,183.335563,0.813597,99.758709,...,8501.609009,14.84939,0.900211,0.58778,9.253785,0.892563,0.226589,0.646306,5.647368,7.940179
min,0.0,-0.069273,-17.0,-7.89558,-1.47556,-7589.28,-3.29105,-40.9672,-4.14308,-502.813,...,-12695.7,-4.05917,0.0578,-1.9988,-24.6863,-1.13198,0.005249,-0.646967,-0.842397,0.0
25%,62499.75,0.251287,18.0,-0.611172,-0.719418,163.86475,4.110127,27.8949,-0.026245,-17.392025,...,73.2031,11.52545,1.47165,-0.408975,-4.004925,1.906718,0.359646,1.21581,3.7328,1.0
50%,124999.5,0.514962,41.0,0.253815,0.004099,943.0005,7.472445,91.00525,0.619862,8.714945,...,1060.025,19.9932,1.66083,0.21571,0.759942,2.34043,0.531348,1.451285,7.182205,4.0
75%,187499.25,0.777322,75.0,0.759249,0.765456,4115.355,11.03095,240.84375,0.933855,55.407625,...,5572.9825,32.271625,2.320085,0.503134,6.202503,2.91002,0.709807,1.901632,10.99855,10.0
max,249999.0,1.07207,273.0,9.76859,1.68019,37847.5,35.078,947.143,4.01038,465.956,...,54334.6,79.9124,5.40302,1.94419,42.8904,5.57604,1.1054,4.49262,34.0192,42.0



5. MISSING VALUES ANALYSIS:
Empty DataFrame
Columns: [Missing Count, Missing Percentage]
Index: []

6. TARGET VARIABLE ANALYSIS:
Target column: 'target'


KeyError: 'target'

In [None]:
# =============================================================================
# BASELINE MODEL TRAINING
# =============================================================================

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

print("=" * 50)
print("BASELINE MODEL TRAINING")
print("=" * 50)

# Prepare features and target
print("1. PREPARING DATA...")
X = train.drop(columns=["target", "id"])  # Remove target and ID columns
y = train["target"]

print(f"Feature columns: {list(X.columns)}")
print(f"Number of features: {X.shape[1]}")

# Select only numeric features for baseline
X_numeric = X.select_dtypes(include=[np.number])
print(f"Numeric features selected: {X_numeric.shape[1]}")

# Train-validation split
print("\n2. SPLITTING DATA...")
X_train, X_val, y_train, y_val = train_test_split(
    X_numeric, y, 
    test_size=0.2, 
    random_state=42,
    stratify=None  # No stratification for regression
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")

# Initialize and train baseline model
print("\n3. TRAINING BASELINE MODEL...")
print("Model: Random Forest Regressor")
print("Parameters: n_estimators=100, random_state=42")

model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

# Train the model
model.fit(X_train, y_train)

# Make predictions
print("\n4. MAKING PREDICTIONS...")
train_preds = model.predict(X_train)
val_preds = model.predict(X_val)

# Calculate metrics
print("\n5. EVALUATION METRICS:")
print("-" * 30)

# Training metrics
train_rmse = mean_squared_error(y_train, train_preds, squared=False)
train_mae = mean_absolute_error(y_train, train_preds)
train_r2 = r2_score(y_train, train_preds)

print(f"TRAINING SET:")
print(f"  RMSE: {train_rmse:.4f}")
print(f"  MAE:  {train_mae:.4f}")
print(f"  R²:   {train_r2:.4f}")

# Validation metrics
val_rmse = mean_squared_error(y_val, val_preds, squared=False)
val_mae = mean_absolute_error(y_val, val_preds)
val_r2 = r2_score(y_val, val_preds)

print(f"\nVALIDATION SET:")
print(f"  RMSE: {val_rmse:.4f}")
print(f"  MAE:  {val_mae:.4f}")
print(f"  R²:   {val_r2:.4f}")

# Check for overfitting
overfitting = train_rmse - val_rmse
print(f"\nOverfitting check (Train RMSE - Val RMSE): {overfitting:.4f}")
if overfitting > 0.1:
    print("⚠️  Warning: Potential overfitting detected!")
else:
    print("✅ Model appears to generalize well")

# Feature importance
print("\n6. FEATURE IMPORTANCE (Top 10):")
feature_importance = pd.DataFrame({
    'feature': X_numeric.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance.head(10))

In [None]:
# =============================================================================
# NEXT STEPS FOR IMPROVEMENT
# =============================================================================

print("POTENTIAL IMPROVEMENTS TO TRY:")
print("=" * 40)
print("1. Feature Engineering:")
print("   - Create interaction features")
print("   - Polynomial features")
print("   - Domain-specific ratios")
print()
print("2. Model Improvements:")
print("   - Try XGBoost, LightGBM, or CattBoost")
print("   - Hyperparameter tuning")
print("   - Ensemble methods")
print()
print("3. Advanced Techniques:")
print("   - Cross-validation")
print("   - Feature selection")
print("   - Outlier detection and handling")
print()
print("4. Data Analysis:")
print("   - Correlation analysis")
print("   - Feature distributions")
print("   - Target vs feature relationships")

In [None]:
# =============================================================================
# TEST PREDICTIONS AND SUBMISSION
# =============================================================================

print("=" * 50)
print("TEST PREDICTIONS")
print("=" * 50)

# Prepare test data
print("1. PREPARING TEST DATA...")
test_data = test[X_numeric.columns]  # Use same features as training
print(f"Test data shape: {test_data.shape}")

# Check for missing values in test set
test_missing = test_data.isnull().sum().sum()
if test_missing > 0:
    print(f"⚠️  Warning: {test_missing} missing values found in test set")
    # Fill missing values with training set means
    test_data = test_data.fillna(X_numeric.mean())
    print("✅ Missing values filled with training set means")
else:
    print("✅ No missing values in test set")

# Make predictions
print("\n2. MAKING TEST PREDICTIONS...")
test_predictions = model.predict(test_data)
print(f"Predictions shape: {test_predictions.shape}")
print(f"Prediction range: {test_predictions.min():.4f} - {test_predictions.max():.4f}")

# Prepare submission file
print("\n3. PREPARING SUBMISSION...")
submission["target"] = test_predictions

# Save submission file
submission_filename = "submission_baseline_rf.csv"
submission.to_csv(submission_filename, index=False)
print(f"✅ Submission saved as: {submission_filename}")

# Display submission preview
print("\n4. SUBMISSION PREVIEW:")
print(submission.head(10))

print(f"\n5. SUBMISSION STATISTICS:")
print(f"  Mean prediction: {submission['target'].mean():.4f}")
print(f"  Std prediction:  {submission['target'].std():.4f}")
print(f"  Min prediction:  {submission['target'].min():.4f}")
print(f"  Max prediction:  {submission['target'].max():.4f}")

print("\n" + "=" * 50)
print("BASELINE MODEL COMPLETE!")
print("=" * 50)

In [None]:
# Add this as a new cell at the beginning of your notebook
# =============================================================================
# DATA INSPECTION - CHECK ACTUAL COLUMN NAMES
# =============================================================================

print("=" * 50)
print("CHECKING ACTUAL DATA STRUCTURE")
print("=" * 50)

# Load data
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
submission = pd.read_csv("data/sample_submission.csv")

print("1. TRAINING DATA COLUMNS:")
print(f"Columns: {list(train.columns)}")
print(f"Shape: {train.shape}")

print("\n2. TEST DATA COLUMNS:")
print(f"Columns: {list(test.columns)}")
print(f"Shape: {test.shape}")

print("\n3. SUBMISSION TEMPLATE COLUMNS:")
print(f"Columns: {list(submission.columns)}")
print(f"Shape: {submission.shape}")

print("\n4. FIRST FEW ROWS OF TRAINING DATA:")
print(train.head())

print("\n5. DATA TYPES:")
print(train.dtypes)