# Concrete Strength Prediction

Predicting the strength of concrete's compressive strength using Random Forest and XGBoost and taking the weighted average of the predictions.

## 1. Import the Libraries

Import the required libraries.

In [9]:
# Import libraries like scikit-learn and xgboost
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

import xgboost as xgb

## 2. Prepare the Data

Load the dataset from Kaggle and convert into numpy array, further splitting for training and testing.

In [10]:
# Load the dataset
data = pd.read_csv('D:\PROJECTS\Concrete-compressive-strength\data\raw\concrete_data.csv')
data.head()

  data = pd.read_csv('D:\PROJECTS\Concrete-compressive-strength\data\raw\concrete_data.csv')


OSError: [Errno 22] Invalid argument: 'D:\\PROJECTS\\Concrete-compressive-strength\\data\raw\\concrete_data.csv'

In [None]:
# Split into training and testing
x_data = data.drop(columns = ['concrete_compressive_strength'])
y_data = data['concrete_compressive_strength']

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

In [None]:
# Check the shape and length of the data
shape = x_train.shape
length = len(y_test)

print(f"Shape of the training data: {shape}")
print(f"Length of the test data: {length}")

Shape of the training data: (824, 8)
Length of the test data: 206


## 3. XGBoost

* **XGBRegressor()**: XGBoost's implementation for regression using gradient boosting.

* **n_estimators = 100**: Number of boosting rounds or trees to build.

* **learning_rate = 0.1**: Controls the step size for weight updates.

In [None]:
xgb_regressor = xgb.XGBRegressor(n_estimators = 10000, learning_rate = 0.1, max_depth = 5, random_state = 2)
xgb_regressor.fit(x_train, y_train)

y_predicted_xgb = xgb_regressor.predict(x_test)


mse_xgb = mean_squared_error(y_test, y_predicted_xgb)
r2_xgb = r2_score(y_test, y_predicted_xgb)
print(f"Mean Squared Error for XGBoost: {mse_xgb}")
print(f"R2 Score for XGBoost: {r2_xgb}")

Mean Squared Error for XGBoost: 18.90292315579263
R2 Score for XGBoost: 0.9326625197968461


# 4. Comprehensive Model Training Pipeline

Training multiple machine learning models for concrete strength prediction and comparing their performance.

In [16]:
# Import additional libraries for comprehensive model training
import os
import warnings
warnings.filterwarnings('ignore')

# Additional ML algorithms
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor

# Metrics for comprehensive evaluation
from sklearn.metrics import mean_absolute_error, explained_variance_score
from sklearn.model_selection import cross_val_score
import datetime

In [17]:
# Load train/test data from pipeline
train_dir = 'D:/PROJECTS/Concrete-compressive-strength/data/3_train_test_data/train'
test_dir = 'D:/PROJECTS/Concrete-compressive-strength/data/3_train_test_data/test'

# Load training data
X_train = pd.read_csv(f'{train_dir}/X_train.csv')
y_train = pd.read_csv(f'{train_dir}/y_train.csv').values.ravel()

# Load test data
X_test = pd.read_csv(f'{test_dir}/X_test.csv')
y_test = pd.read_csv(f'{test_dir}/y_test.csv').values.ravel()

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Test target shape: {y_test.shape}")
print(f"\nFeatures: {list(X_train.columns)}")

Training data shape: (804, 18)
Test data shape: (201, 18)
Training target shape: (804,)
Test target shape: (201,)

Features: ['log_age', 'cement', 'age_category_longterm', 'coarse_aggregate', 'cement_quality_index', 'total_material', 'age_category_early', 'water_powder_ratio', 'total_cement', 'water', 'total_powder', 'scm_cement_ratio', 'age', 'sqrt_age', 'superplasticizer', 'water_cement_ratio', 'age_category_extended', 'cement_material_ratio']


In [18]:
# Define comprehensive evaluation function
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """
    Comprehensive model evaluation function
    """
    # Fit the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Calculate metrics
    metrics = {
        'Model': model_name,
        'Train_R2': r2_score(y_train, y_pred_train),
        'Test_R2': r2_score(y_test, y_pred_test),
        'Train_MSE': mean_squared_error(y_train, y_pred_train),
        'Test_MSE': mean_squared_error(y_test, y_pred_test),
        'Train_MAE': mean_absolute_error(y_train, y_pred_train),
        'Test_MAE': mean_absolute_error(y_test, y_pred_test),
        'Train_ExplainedVar': explained_variance_score(y_train, y_pred_train),
        'Test_ExplainedVar': explained_variance_score(y_test, y_pred_test),
        'RMSE_Train': np.sqrt(mean_squared_error(y_train, y_pred_train)),
        'RMSE_Test': np.sqrt(mean_squared_error(y_test, y_pred_test))
    }
    
    return metrics, model

In [19]:
# Initialize models dictionary
models = {
    'Linear Regression': LinearRegression(),
    'Polynomial Regression': None,  # Will be handled separately
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'AdaBoost': AdaBoostRegressor(n_estimators=100, random_state=42),
    'K-Nearest Neighbors': KNeighborsRegressor(n_neighbors=5),
    'Extra Trees': ExtraTreesRegressor(n_estimators=100, random_state=42)
}

# Store results
results = []
trained_models = {}

print("🚀 Starting Comprehensive Model Training...")
print("=" * 60)

# Train each model
for model_name, model in models.items():
    if model_name == 'Polynomial Regression':
        continue  # Handle separately
    
    print(f"Training {model_name}...")
    
    try:
        metrics, trained_model = evaluate_model(model, X_train, X_test, y_train, y_test, model_name)
        results.append(metrics)
        trained_models[model_name] = trained_model
        
        print(f"✅ {model_name} - Test R²: {metrics['Test_R2']:.4f}, Test RMSE: {metrics['RMSE_Test']:.4f}")
        
    except Exception as e:
        print(f"❌ Error training {model_name}: {str(e)}")
        
print("\n🔄 Training Polynomial Regression...")

# Handle Polynomial Regression separately
try:
    poly_features = PolynomialFeatures(degree=2, include_bias=False)
    X_train_poly = poly_features.fit_transform(X_train)
    X_test_poly = poly_features.transform(X_test)
    
    poly_model = LinearRegression()
    poly_model.fit(X_train_poly, y_train)
    
    y_pred_train_poly = poly_model.predict(X_train_poly)
    y_pred_test_poly = poly_model.predict(X_test_poly)
    
    poly_metrics = {
        'Model': 'Polynomial Regression',
        'Train_R2': r2_score(y_train, y_pred_train_poly),
        'Test_R2': r2_score(y_test, y_pred_test_poly),
        'Train_MSE': mean_squared_error(y_train, y_pred_train_poly),
        'Test_MSE': mean_squared_error(y_test, y_pred_test_poly),
        'Train_MAE': mean_absolute_error(y_train, y_pred_train_poly),
        'Test_MAE': mean_absolute_error(y_test, y_pred_test_poly),
        'Train_ExplainedVar': explained_variance_score(y_train, y_pred_train_poly),
        'Test_ExplainedVar': explained_variance_score(y_test, y_pred_test_poly),
        'RMSE_Train': np.sqrt(mean_squared_error(y_train, y_pred_train_poly)),
        'RMSE_Test': np.sqrt(mean_squared_error(y_test, y_pred_test_poly))
    }
    
    results.append(poly_metrics)
    trained_models['Polynomial Regression'] = (poly_model, poly_features)
    
    print(f"✅ Polynomial Regression - Test R²: {poly_metrics['Test_R2']:.4f}, Test RMSE: {poly_metrics['RMSE_Test']:.4f}")
    
except Exception as e:
    print(f"❌ Error training Polynomial Regression: {str(e)}")

print(f"\n🎉 Model training completed! {len(results)} models trained successfully.")

🚀 Starting Comprehensive Model Training...
Training Linear Regression...
✅ Linear Regression - Test R²: 0.8345, Test RMSE: 6.9539
Training Decision Tree...
✅ Decision Tree - Test R²: 0.8907, Test RMSE: 5.6502
Training Random Forest...
✅ Random Forest - Test R²: 0.9131, Test RMSE: 5.0384
Training XGBoost...
✅ XGBoost - Test R²: 0.9294, Test RMSE: 4.5427
Training AdaBoost...
✅ AdaBoost - Test R²: 0.8153, Test RMSE: 7.3453
Training K-Nearest Neighbors...
✅ K-Nearest Neighbors - Test R²: 0.8475, Test RMSE: 6.6754
Training Extra Trees...
✅ Extra Trees - Test R²: 0.9210, Test RMSE: 4.8044

🔄 Training Polynomial Regression...
✅ Polynomial Regression - Test R²: 0.8911, Test RMSE: 5.6397

🎉 Model training completed! 8 models trained successfully.


In [20]:
# Create results DataFrame
results_df = pd.DataFrame(results)

# Sort by Test R² score (descending)
results_df = results_df.sort_values('Test_R2', ascending=False).reset_index(drop=True)

print("📊 MODEL PERFORMANCE COMPARISON")
print("=" * 80)
print(f"{'Rank':<4} {'Model':<20} {'Test R²':<10} {'Test RMSE':<12} {'Test MAE':<10}")
print("-" * 80)

for idx, row in results_df.iterrows():
    print(f"{idx+1:<4} {row['Model']:<20} {row['Test_R2']:<10.4f} {row['RMSE_Test']:<12.2f} {row['Test_MAE']:<10.2f}")

print("\n🏆 Best performing model:", results_df.iloc[0]['Model'])
print(f"Best Test R² Score: {results_df.iloc[0]['Test_R2']:.4f}")

# Display full results
print("\n📋 DETAILED RESULTS:")
print(results_df.round(4))

📊 MODEL PERFORMANCE COMPARISON
Rank Model                Test R²    Test RMSE    Test MAE  
--------------------------------------------------------------------------------
1    XGBoost              0.9294     4.54         3.18      
2    Extra Trees          0.9210     4.80         3.09      
3    Random Forest        0.9131     5.04         3.52      
4    Polynomial Regression 0.8911     5.64         4.13      
5    Decision Tree        0.8907     5.65         3.54      
6    K-Nearest Neighbors  0.8475     6.68         4.82      
7    Linear Regression    0.8345     6.95         5.34      
8    AdaBoost             0.8153     7.35         5.92      

🏆 Best performing model: XGBoost
Best Test R² Score: 0.9294

📋 DETAILED RESULTS:
                   Model  Train_R2  Test_R2  Train_MSE  Test_MSE  Train_MAE  \
0                XGBoost    0.9842   0.9294     4.0391   20.6363     1.3574   
1            Extra Trees    0.9960   0.9210     1.0124   23.0825     0.1424   
2          Random F