In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR as SVM
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import skew, kurtosis
import warnings
warnings.filterwarnings('ignore')

In [None]:
concrete_data = pd.read_csv('https://raw.githubusercontent.com/Zaid-N-Ansari/PG-Mini-Project/refs/heads/main/Data/concrete_data.csv?token=GHSAT0AAAAAADCE7OAHRDDLMPBRTQ423TRS2AMNPFA')

print(f'Total Observations / Records: {concrete_data.shape[0]}')
print(f'Total Features / Columns / Variables: {concrete_data.shape[1]}')
print('Features:', end=' ')
col_names = concrete_data.columns.tolist()
print(*col_names, sep=', ')

In [None]:
concrete_data.head()

In [None]:
# Check for missing values
print("\nMissing values:")
print(concrete_data.isnull().sum())

# Check data types
print("\nData types:")
print(concrete_data.dtypes)

# Check for duplicates
print("\nNumber of duplicate rows:", concrete_data.duplicated().sum())

# Remove duplicates if any
concrete_data.drop_duplicates(inplace=True)
print("Shape of the dataset after removing duplicates:", concrete_data.shape)

In [None]:
# i. Univariate Analysis
concrete_data.hist(bins=20, figsize=(15, 10))
plt.suptitle('Univariate Distribution of Numerical Features', y=1.02)
plt.show()

# ii. Bivariate Analysis
sns.pairplot(concrete_data)
plt.suptitle('Pairwise Relationships between Features', y=1.02)
plt.show()

# iii. Multivariate Analysis (Correlation Heatmap)
corr_mat = concrete_data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_mat, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

# Feature-target relationships
features = col_names[:-1]
target = 'strength'

plt.figure(figsize=(15, 10))
for i, feature in enumerate(features):
    plt.subplot(3, 3, i + 1)
    sns.scatterplot(x=concrete_data[feature], y=concrete_data[target])
    plt.title(f'{feature} vs {target}')
plt.tight_layout()
plt.show()

In [None]:
# Boxplots to visualize outliers
plt.figure(figsize=(15, 10))
for i, feature in enumerate(features):
    plt.subplot(3, 3, i + 1)
    sns.boxplot(x=concrete_data[feature])
    plt.title(f'Box Plot of {feature}')
plt.tight_layout()
plt.show()

# Function to remove outliers using IQR method
def remove_outliers_iqr(concrete_data, column):
    Q1 = concrete_data[column].quantile(0.25)
    Q3 = concrete_data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    concrete_data_filtered = concrete_data[(concrete_data[column] >= lower_bound) & (concrete_data[column] <= upper_bound)]
    return concrete_data_filtered

concrete_data_no_outliers = concrete_data.copy()
for col in features:
    concrete_data_no_outliers = remove_outliers_iqr(concrete_data_no_outliers, col)

print("Shape of the dataset before outlier removal:", concrete_data.shape)
print("Shape of the dataset after outlier removal:", concrete_data_no_outliers.shape)

# Let's proceed with the dataset after outlier removal for further steps
concrete_data = concrete_data_no_outliers

In [None]:
# Check Skewness and Kurtosis
print("\nSkewness of features:")
print(concrete_data[features].skew())
print("\nKurtosis of features:")
print(concrete_data[features].kurt())

# Visualize distributions after outlier removal
concrete_data[features].hist(bins=20, figsize=(15, 10))
plt.suptitle('Distribution of Numerical Features after Outlier Removal', y=1.02)
plt.show()

# Apply Log Transformation to features with high positive skewness (e.g., age)
# We'll identify highly skewed features based on the skewness values
highly_skewed_features = concrete_data[features].skew()[abs(concrete_data[features].skew()) > 0.5].index

for feature in highly_skewed_features:
    concrete_data[feature] = np.log1p(concrete_data[feature])

# Visualize distributions after transformation
concrete_data[highly_skewed_features].hist(bins=20, figsize=(10, 6))
plt.suptitle('Distribution of Highly Skewed Features after Log Transformation', y=1.02)
plt.show()

print("\nSkewness of features after transformation:")
print(concrete_data[features].skew())

In [None]:
X = concrete_data.drop(target, axis=1)
y = concrete_data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, )

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

In [None]:
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred_lr = linear_reg.predict(X_test)

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linear_reg_scaled = LinearRegression()
linear_reg_scaled.fit(X_train_scaled, y_train)
y_pred_lr_scaled = linear_reg_scaled.predict(X_test_scaled)

In [None]:
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred_lr_scaled, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Actual Strength')
plt.ylabel('Predicted Strength (Linear Regression - Scaled)')
plt.title('Actual vs. Predicted Concrete Strength (Linear Regression - Scaled)')
plt.grid(True)
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred_lr_scaled, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Actual Strength')
plt.ylabel('Predicted Strength (Linear Regression - Scaled)')
plt.title('Actual vs. Predicted Concrete Strength (Linear Regression - Scaled)')
plt.grid(True)
plt.show()

In [None]:
diff_lr_scaled = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_lr_scaled})
diff_lr_scaled['Difference'] = diff_lr_scaled['Actual'] - diff_lr_scaled['Predicted']
print("\nDifference between Actual and Predicted (Linear Regression - Scaled):")
print(diff_lr_scaled.head())

In [None]:
def evaluate_model(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    n = len(y_true)
    p = X_test.shape[1]  # Number of features
    adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

    print(f"\nPerformance Metrics for {model_name}:")
    print(f"  MAE: {mae:.2f}")
    print(f"  MSE: {mse:.2f}")
    print(f"  RMSE: {rmse:.2f}")
    print(f"  R2 Score: {r2:.2f}")
    print(f"  Adjusted R2 Score: {adjusted_r2:.2f}")
    return mae, mse, rmse, r2, adjusted_r2

lr_metrics = evaluate_model(y_test, y_pred_lr_scaled, 'Linear Regression (Scaled)')

In [None]:
# --- Decision Tree Regressor ---
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train_scaled, y_train)
y_pred_dt = dt_reg.predict(X_test_scaled)
dt_metrics = evaluate_model(y_test, y_pred_dt, 'Decision Tree Regressor')

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred_dt, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Actual Strength')
plt.ylabel('Predicted Strength (Decision Tree)')
plt.title('Actual vs. Predicted Concrete Strength (Decision Tree)')
plt.grid(True)
plt.show()

# --- Random Forest Regressor ---
rf_reg = RandomForestRegressor(n_estimators=100)
rf_reg.fit(X_train_scaled, y_train)
y_pred_rf = rf_reg.predict(X_test_scaled)
rf_metrics = evaluate_model(y_test, y_pred_rf, 'Random Forest Regressor')

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred_rf, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Actual Strength')
plt.ylabel('Predicted Strength (Random Forest)')
plt.title('Actual vs. Predicted Concrete Strength (Random Forest)')
plt.grid(True)
plt.show()

# --- Support Vector Machine ---
svm_reg = SVM()
svm_reg.fit(X_train_scaled, y_train)
y_pred_svm = svm_reg.predict(X_test_scaled)
svm_metrics = evaluate_model(y_test, y_pred_svm, 'Support Vector Machine')

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred_svm, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Actual Strength')
plt.ylabel('Predicted Strength (SVM)')
plt.title('Actual vs. Predicted Concrete Strength (SVM)')
plt.grid(True)
plt.show()

# --- K-Nearest Neighbor ---
knn_reg = KNeighborsRegressor(n_neighbors=5) # You can experiment with different values of k
knn_reg.fit(X_train_scaled, y_train)
y_pred_knn = knn_reg.predict(X_test_scaled)
knn_metrics = evaluate_model(y_test, y_pred_knn, 'K-Nearest Neighbor')

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred_knn, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Actual Strength')
plt.ylabel('Predicted Strength (KNN)')
plt.title('Actual vs. Predicted Concrete Strength (KNN)')
plt.grid(True)
plt.show()

In [30]:
cv = KFold(n_splits=5, shuffle=True)

def cross_validate_model(model, X, y, model_name):
    mae_scores = cross_val_score(model, X, y, cv=cv)
    mse_scores = cross_val_score(model, X, y, cv=cv)
    r2_scores = cross_val_score(model, X, y, cv=cv, scoring='r2')

    print(f"\nCross-Validation Results for {model_name}:")
    print(f"  Mean MAE: {-mae_scores.mean():.2f} (+/- {mae_scores.std():.2f})")
    print(f"  Mean MSE: {-mse_scores.mean():.2f} (+/- {mse_scores.std():.2f})")
    print(f"  Mean R2 Score: {r2_scores.mean():.2f} (+/- {r2_scores.std():.2f})")

# Cross-validate the models (using scaled data where applicable)
cross_validate_model(LinearRegression(), X_test, y_test, 'Linear Regression')
cross_validate_model(DecisionTreeRegressor(), X_test, y_test, 'Decision Tree Regressor')
cross_validate_model(RandomForestRegressor(n_estimators=100), X_test, y_test, 'Random Forest Regressor')
cross_validate_model(SVR(), X_test, y_test, 'Support Vector Machine')
cross_validate_model(KNeighborsRegressor(n_neighbors=5), X_test, y_test, 'K-Nearest Neighbor')


Cross-Validation Results for Linear Regression:
  Mean MAE: -0.78 (+/- 0.08)
  Mean MSE: -0.81 (+/- 0.04)
  Mean R2 Score: 0.80 (+/- 0.04)

Cross-Validation Results for Decision Tree Regressor:
  Mean MAE: -0.57 (+/- 0.13)
  Mean MSE: -0.49 (+/- 0.19)
  Mean R2 Score: 0.44 (+/- 0.15)

Cross-Validation Results for Random Forest Regressor:
  Mean MAE: -0.72 (+/- 0.08)
  Mean MSE: -0.75 (+/- 0.06)
  Mean R2 Score: 0.76 (+/- 0.07)

Cross-Validation Results for Support Vector Machine:
  Mean MAE: 0.08 (+/- 0.04)
  Mean MSE: 0.01 (+/- 0.03)
  Mean R2 Score: -0.04 (+/- 0.03)

Cross-Validation Results for K-Nearest Neighbor:
  Mean MAE: -0.03 (+/- 0.09)
  Mean MSE: -0.06 (+/- 0.12)
  Mean R2 Score: 0.07 (+/- 0.11)


In [33]:
import re


print("\n\nFinal Conclusion:")

results = {
    'Model': ['Linear Regression', 'Decision Tree Regressor', 'Random Forest Regressor', 'Support Vector Machine', 'K-Nearest Neighbor'],
    'MAE': [lr_metrics[0], dt_metrics[0], rf_metrics[0], svm_metrics[0], knn_metrics[0]],
    'MSE': [lr_metrics[1], dt_metrics[1], rf_metrics[1], svm_metrics[1], knn_metrics[1]],
    'RMSE': [lr_metrics[2], dt_metrics[2], rf_metrics[2], svm_metrics[2], knn_metrics[2]],
    'R2 Score': [lr_metrics[3], dt_metrics[3], rf_metrics[3], svm_metrics[3], knn_metrics[3]],
    'Adjusted R2': [lr_metrics[4], dt_metrics[4], rf_metrics[4], svm_metrics[4], knn_metrics[4]]
}

results_df = pd.DataFrame(results)
print("\nPerformance Comparison on Test Set:")

print("\nCross-Validation Summary:")
print("Based on the cross-validation results, the Random Forest Regressor and potentially Support Vector Machine tend to show promising performance in predicting concrete compressive strength. However, the best model might depend on the specific requirements of the application, such as interpretability versus prediction accuracy.")

print("\nFurther Considerations:")
print("- Hyperparameter tuning for each model could potentially improve their performance.")
print("- Exploring other feature engineering techniques based on domain knowledge might be beneficial.")
print("- Gathering more data could lead to more robust and accurate models.")

results_df.index = results_df.index + 1  # Start index from 1 for better readability
results_df



Final Conclusion:

Performance Comparison on Test Set:

Cross-Validation Summary:
Based on the cross-validation results, the Random Forest Regressor and potentially Support Vector Machine tend to show promising performance in predicting concrete compressive strength. However, the best model might depend on the specific requirements of the application, such as interpretability versus prediction accuracy.

Further Considerations:
- Hyperparameter tuning for each model could potentially improve their performance.
- Exploring other feature engineering techniques based on domain knowledge might be beneficial.
- Gathering more data could lead to more robust and accurate models.


Unnamed: 0,Model,MAE,MSE,RMSE,R2 Score,Adjusted R2
1,Linear Regression,5.478264,48.667816,6.976232,0.820655,0.812456
2,Decision Tree Regressor,5.01192,56.309438,7.503961,0.792495,0.783009
3,Random Forest Regressor,4.068554,31.512579,5.613607,0.883874,0.878565
4,Support Vector Machine,5.931762,63.633688,7.977073,0.765505,0.754785
5,K-Nearest Neighbor,5.26413,47.413533,6.885749,0.825277,0.81729
