### Part 1: Load and Preprocess Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# --- Load -------------------------------------------------------------------
# Read the concrete-mix table from the Excel workbook sitting next to the notebook.
concrete_data = pd.read_excel('concrete_data.xls')

# --- Inspect ----------------------------------------------------------------
# Preview of the raw table, followed by its dimensions and per-column NaN counts.
print("First 5 rows:")
print(concrete_data.head())

print("\nData shape:", concrete_data.shape)
print("\nMissing values per column:")
missing_per_column = concrete_data.isna().sum()
print(missing_per_column)

# --- Clean ------------------------------------------------------------------
# Discard every row that still contains at least one missing value.
concrete_data = concrete_data.dropna()
print("\nData shape after handling missing values:", concrete_data.shape)

# --- Features / target ------------------------------------------------------
# The label is bound once so both selections below stay in sync.
# NOTE: the trailing space is part of the column label as used by this notebook.
target_column = 'Concrete compressive strength(MPa, megapascals) '
y = concrete_data[target_column]
X = concrete_data.drop(columns=[target_column])

# --- Train / test partition -------------------------------------------------
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)



First 5 rows:
   Cement (component 1)(kg in a m^3 mixture)  \
0                                      540.0   
1                                      540.0   
2                                      332.5   
3                                      332.5   
4                                      198.6   

   Blast Furnace Slag (component 2)(kg in a m^3 mixture)  \
0                                                0.0       
1                                                0.0       
2                                              142.5       
3                                              142.5       
4                                              132.4       

   Fly Ash (component 3)(kg in a m^3 mixture)  \
0                                         0.0   
1                                         0.0   
2                                         0.0   
3                                         0.0   
4                                         0.0   

   Water  (component 4)(kg in a m^3 mixtu

### Part 2: Build and Tune Random Forest Model

In [2]:
# Two-stage estimator: standardise the inputs, then fit a random forest.
# The step names ('scaler', 'rf') are what the 'rf__' prefixes in the grid refer to.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor(random_state=42)),
]
pipeline = Pipeline(pipeline_steps)

# Candidate hyperparameters for the forest step: 3 * 3 * 2 * 2 = 36 combinations.
param_grid = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5],
    'rf__max_features': ['sqrt', 'log2'],
}

# Exhaustive search with 5-fold cross-validation (36 candidates -> 180 fits),
# scored by negated mean squared error and run with n_jobs=-1.
search_settings = {
    'estimator': pipeline,
    'param_grid': param_grid,
    'scoring': 'neg_mean_squared_error',
    'cv': 5,
    'verbose': 1,
    'n_jobs': -1,
}
grid_search = GridSearchCV(**search_settings)

# Only the training partition is used for the search.
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters: {'rf__max_depth': None, 'rf__max_features': 'log2', 'rf__min_samples_split': 2, 'rf__n_estimators': 300}


### Part 3: Evaluate Model and View Results

In [5]:
# Pipeline carrying the winning hyperparameter combination from the grid search.
best_model = grid_search.best_estimator_

# Predict on both partitions so training and held-out error can be compared.
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Error on each partition, plus the coefficient of determination on the test set.
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

# Report the three metrics to four decimal places.
print(f"Train MSE: {train_mse:.4f}")
print(f"Test MSE: {test_mse:.4f}")
print(f"Test R^2 Score: {r2:.4f}")

# Importances come from the fitted forest step of the pipeline and are paired
# positionally with the feature columns of X.
fitted_forest = best_model.named_steps['rf']
importances = fitted_forest.feature_importances_
feature_names = X.columns
print("\nFeature Importances:")
for column_label, weight in zip(feature_names, importances):
    print(f"{column_label}: {weight:.4f}")

Train MSE: 3.8242
Test MSE: 28.9508
Test R^2 Score: 0.8876

Feature Importances:
Cement (component 1)(kg in a m^3 mixture): 0.2470
Blast Furnace Slag (component 2)(kg in a m^3 mixture): 0.0643
Fly Ash (component 3)(kg in a m^3 mixture): 0.0469
Water  (component 4)(kg in a m^3 mixture): 0.1260
Superplasticizer (component 5)(kg in a m^3 mixture): 0.0768
Coarse Aggregate  (component 6)(kg in a m^3 mixture): 0.0548
Fine Aggregate (component 7)(kg in a m^3 mixture): 0.0574
Age (day): 0.3269
