# Concrete Strength Prediction

Predicting the strength of concrete's compressive strength using Random Forest and XGBoost and taking the weighted average of the predictions.

## 1. Import the Libraries

Import the required libraries.

In [1]:
# Import libraries like scikit-learn and xgboost
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

import xgboost as xgb

## 2. Prepare the Data

Load the dataset from Kaggle and convert into numpy array, further splitting for training and testing.

In [2]:
# Load the dataset
data = pd.read_csv('D:\PROJECTS\Concrete-compressive-strength\data\concrete_data.csv')
data.head()

  data = pd.read_csv('D:\PROJECTS\Concrete-compressive-strength\data\concrete_data.csv')


Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
# Split into training and testing
x_data = data.drop(columns = ['concrete_compressive_strength'])
y_data = data['concrete_compressive_strength']

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

In [4]:
# Check the shape and length of the data
shape = x_train.shape
length = len(y_test)

print(f"Shape of the training data: {shape}")
print(f"Length of the test data: {length}")

Shape of the training data: (824, 8)
Length of the test data: 206


## 3. XGBoost

* **XGBRegressor()**: XGBoost's implementation for regression using gradient boosting.

* **n_estimators = 100**: Number of boosting rounds or trees to build.

* **learning_rate = 0.1**: Controls the step size for weight updates.

In [5]:
xgb_regressor = xgb.XGBRegressor(n_estimators = 10000, learning_rate = 0.1, max_depth = 5, random_state = 2)
xgb_regressor.fit(x_train, y_train)

y_predicted_xgb = xgb_regressor.predict(x_test)


mse_xgb = mean_squared_error(y_test, y_predicted_xgb)
r2_xgb = r2_score(y_test, y_predicted_xgb)
print(f"Mean Squared Error for XGBoost: {mse_xgb}")
print(f"R2 Score for XGBoost: {r2_xgb}")

Mean Squared Error for XGBoost: 18.90292315579263
R2 Score for XGBoost: 0.9326625197968461


# 4. Comprehensive Model Training Pipeline

Training multiple machine learning models for concrete strength prediction and comparing their performance.

In [None]:
# Import additional libraries for comprehensive model training
import os
import warnings
warnings.filterwarnings('ignore')

# Additional ML algorithms
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor

# Metrics for comprehensive evaluation
from sklearn.metrics import mean_absolute_error, explained_variance_score
from sklearn.model_selection import cross_val_score
import datetime

In [None]:
# Load train/test data from pipeline
train_dir = 'D:/PROJECTS/Concrete-compressive-strength/data/3_train_test_data/train'
test_dir = 'D:/PROJECTS/Concrete-compressive-strength/data/3_train_test_data/test'

# Load training data
X_train = pd.read_csv(f'{train_dir}/X_train.csv')
y_train = pd.read_csv(f'{train_dir}/y_train.csv').values.ravel()

# Load test data
X_test = pd.read_csv(f'{test_dir}/X_test.csv')
y_test = pd.read_csv(f'{test_dir}/y_test.csv').values.ravel()

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Test target shape: {y_test.shape}")
print(f"\nFeatures: {list(X_train.columns)}")

In [None]:
# Define comprehensive evaluation function
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """
    Comprehensive model evaluation function
    """
    # Fit the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Calculate metrics
    metrics = {
        'Model': model_name,
        'Train_R2': r2_score(y_train, y_pred_train),
        'Test_R2': r2_score(y_test, y_pred_test),
        'Train_MSE': mean_squared_error(y_train, y_pred_train),
        'Test_MSE': mean_squared_error(y_test, y_pred_test),
        'Train_MAE': mean_absolute_error(y_train, y_pred_train),
        'Test_MAE': mean_absolute_error(y_test, y_pred_test),
        'Train_ExplainedVar': explained_variance_score(y_train, y_pred_train),
        'Test_ExplainedVar': explained_variance_score(y_test, y_pred_test),
        'RMSE_Train': np.sqrt(mean_squared_error(y_train, y_pred_train)),
        'RMSE_Test': np.sqrt(mean_squared_error(y_test, y_pred_test))
    }
    
    return metrics, model

In [None]:
# Initialize models dictionary
models = {
    'Linear Regression': LinearRegression(),
    'Polynomial Regression': None,  # Will be handled separately
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    'AdaBoost': AdaBoostRegressor(n_estimators=100, random_state=42),
    'K-Nearest Neighbors': KNeighborsRegressor(n_neighbors=5),
    'Extra Trees': ExtraTreesRegressor(n_estimators=100, random_state=42)
}

# Store results
results = []
trained_models = {}

print("🚀 Starting Comprehensive Model Training...")
print("=" * 60)

# Train each model
for model_name, model in models.items():
    if model_name == 'Polynomial Regression':
        continue  # Handle separately
    
    print(f"Training {model_name}...")
    
    try:
        metrics, trained_model = evaluate_model(model, X_train, X_test, y_train, y_test, model_name)
        results.append(metrics)
        trained_models[model_name] = trained_model
        
        print(f"✅ {model_name} - Test R²: {metrics['Test_R2']:.4f}, Test RMSE: {metrics['RMSE_Test']:.4f}")
        
    except Exception as e:
        print(f"❌ Error training {model_name}: {str(e)}")
        
print("\n🔄 Training Polynomial Regression...")

# Handle Polynomial Regression separately
try:
    poly_features = PolynomialFeatures(degree=2, include_bias=False)
    X_train_poly = poly_features.fit_transform(X_train)
    X_test_poly = poly_features.transform(X_test)
    
    poly_model = LinearRegression()
    poly_model.fit(X_train_poly, y_train)
    
    y_pred_train_poly = poly_model.predict(X_train_poly)
    y_pred_test_poly = poly_model.predict(X_test_poly)
    
    poly_metrics = {
        'Model': 'Polynomial Regression',
        'Train_R2': r2_score(y_train, y_pred_train_poly),
        'Test_R2': r2_score(y_test, y_pred_test_poly),
        'Train_MSE': mean_squared_error(y_train, y_pred_train_poly),
        'Test_MSE': mean_squared_error(y_test, y_pred_test_poly),
        'Train_MAE': mean_absolute_error(y_train, y_pred_train_poly),
        'Test_MAE': mean_absolute_error(y_test, y_pred_test_poly),
        'Train_ExplainedVar': explained_variance_score(y_train, y_pred_train_poly),
        'Test_ExplainedVar': explained_variance_score(y_test, y_pred_test_poly),
        'RMSE_Train': np.sqrt(mean_squared_error(y_train, y_pred_train_poly)),
        'RMSE_Test': np.sqrt(mean_squared_error(y_test, y_pred_test_poly))
    }
    
    results.append(poly_metrics)
    trained_models['Polynomial Regression'] = (poly_model, poly_features)
    
    print(f"✅ Polynomial Regression - Test R²: {poly_metrics['Test_R2']:.4f}, Test RMSE: {poly_metrics['RMSE_Test']:.4f}")
    
except Exception as e:
    print(f"❌ Error training Polynomial Regression: {str(e)}")

print(f"\n🎉 Model training completed! {len(results)} models trained successfully.")

In [None]:
# Create results DataFrame
results_df = pd.DataFrame(results)

# Sort by Test R² score (descending)
results_df = results_df.sort_values('Test_R2', ascending=False).reset_index(drop=True)

print("📊 MODEL PERFORMANCE COMPARISON")
print("=" * 80)
print(f"{'Rank':<4} {'Model':<20} {'Test R²':<10} {'Test RMSE':<12} {'Test MAE':<10}")
print("-" * 80)

for idx, row in results_df.iterrows():
    print(f"{idx+1:<4} {row['Model']:<20} {row['Test_R2']:<10.4f} {row['RMSE_Test']:<12.2f} {row['Test_MAE']:<10.2f}")

print("\n🏆 Best performing model:", results_df.iloc[0]['Model'])
print(f"Best Test R² Score: {results_df.iloc[0]['Test_R2']:.4f}")

# Display full results
print("\n📋 DETAILED RESULTS:")
print(results_df.round(4))

In [None]:
# Create results directory and save comprehensive results
results_dir = 'D:/PROJECTS/Concrete-compressive-strength/results'
os.makedirs(results_dir, exist_ok=True)

# Get current timestamp
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

print(f"💾 Saving results to: {results_dir}")

# 1. Save summary results
summary_file = f"{results_dir}/model_comparison_summary_{timestamp}.txt"
with open(summary_file, 'w') as f:
    f.write("CONCRETE STRENGTH PREDICTION - MODEL COMPARISON SUMMARY\n")
    f.write("=" * 70 + "\n\n")
    f.write(f"Training Date: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"Dataset: Concrete Compressive Strength\n")
    f.write(f"Training Samples: {len(X_train)}\n")
    f.write(f"Test Samples: {len(X_test)}\n")
    f.write(f"Features: {X_train.shape[1]}\n\n")
    
    f.write("MODEL RANKING (by Test R² Score):\n")
    f.write("-" * 50 + "\n")
    for idx, row in results_df.iterrows():
        f.write(f"{idx+1:2d}. {row['Model']:<20} R²: {row['Test_R2']:.4f} RMSE: {row['RMSE_Test']:.2f}\n")
    
    f.write(f"\nBEST MODEL: {results_df.iloc[0]['Model']}\n")
    f.write(f"Best R² Score: {results_df.iloc[0]['Test_R2']:.4f}\n")
    f.write(f"Best RMSE: {results_df.iloc[0]['RMSE_Test']:.2f}\n")

# 2. Save detailed results
detailed_file = f"{results_dir}/detailed_model_results_{timestamp}.txt"
with open(detailed_file, 'w') as f:
    f.write("CONCRETE STRENGTH PREDICTION - DETAILED MODEL RESULTS\n")
    f.write("=" * 80 + "\n\n")
    
    for _, row in results_df.iterrows():
        f.write(f"MODEL: {row['Model']}\n")
        f.write("-" * 40 + "\n")
        f.write(f"R² Score (Train): {row['Train_R2']:.6f}\n")
        f.write(f"R² Score (Test):  {row['Test_R2']:.6f}\n")
        f.write(f"MSE (Train):      {row['Train_MSE']:.6f}\n")
        f.write(f"MSE (Test):       {row['Test_MSE']:.6f}\n")
        f.write(f"MAE (Train):      {row['Train_MAE']:.6f}\n")
        f.write(f"MAE (Test):       {row['Test_MAE']:.6f}\n")
        f.write(f"RMSE (Train):     {row['RMSE_Train']:.6f}\n")
        f.write(f"RMSE (Test):      {row['RMSE_Test']:.6f}\n")
        f.write(f"Explained Var (Train): {row['Train_ExplainedVar']:.6f}\n")
        f.write(f"Explained Var (Test):  {row['Test_ExplainedVar']:.6f}\n")
        f.write("\n")

# 3. Save CSV format for easy analysis
csv_file = f"{results_dir}/model_results_{timestamp}.csv"
results_df.to_csv(csv_file, index=False)

# 4. Save feature importance (for tree-based models)
importance_file = f"{results_dir}/feature_importance_{timestamp}.txt"
with open(importance_file, 'w') as f:
    f.write("FEATURE IMPORTANCE ANALYSIS\n")
    f.write("=" * 50 + "\n\n")
    
    for model_name, model in trained_models.items():
        if hasattr(model, 'feature_importances_'):
            f.write(f"{model_name.upper()} FEATURE IMPORTANCE:\n")
            f.write("-" * 30 + "\n")
            
            feature_importance = pd.DataFrame({
                'feature': X_train.columns,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)
            
            for _, row in feature_importance.iterrows():
                f.write(f"{row['feature']:<25}: {row['importance']:.6f}\n")
            f.write("\n")

print("✅ Results saved successfully!")
print(f"📁 Files created:")
print(f"   • Summary: {summary_file}")
print(f"   • Detailed: {detailed_file}")
print(f"   • CSV: {csv_file}")
print(f"   • Feature Importance: {importance_file}")

# Display final summary
print(f"\n🎯 FINAL SUMMARY:")
print(f"Best Model: {results_df.iloc[0]['Model']}")
print(f"Test R² Score: {results_df.iloc[0]['Test_R2']:.4f}")
print(f"Test RMSE: {results_df.iloc[0]['RMSE_Test']:.2f}")
print(f"Test MAE: {results_df.iloc[0]['Test_MAE']:.2f}")