In [1]:
# Importing all required libraries

import numpy as np
import pandas as pd

# Machine learning models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Evaluation metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


In [2]:
# Loading the dataset
df = pd.read_csv("corrosion_fuel.csv")

# Display first few rows
df.head()


Unnamed: 0,material,ethanol_pct,temperature_C,water_ppm,chloride_mg_L,dissolved_O2_ppm,TAN_mgKOH_g,conductivity_uS_cm,flow_velocity_m_s,exposure_hours,surface_roughness_Ra_um,inhibitor_present,pH,corrosion_rate_mm_per_y
0,Brass,15,55.09,255.6,0.67,7.09,0.177,77.7,1.049,87,1.314,0,6.09,0.01158
1,Al6061,0,58.24,369.7,4.75,7.46,0.005,27.35,0.215,274,0.771,0,6.59,0.00741
2,SS304,15,55.46,299.1,3.76,3.49,0.172,65.03,0.517,198,1.631,1,6.8,0.00109
3,LowCarbonSteel,5,33.69,514.9,6.12,0.78,0.095,50.44,1.054,60,1.135,1,6.73,0.03858
4,Brass,0,47.33,88.6,17.49,3.44,0.168,28.42,0.173,243,0.751,0,6.75,0.01547


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   material                 10000 non-null  object 
 1   ethanol_pct              10000 non-null  int64  
 2   temperature_C            10000 non-null  float64
 3   water_ppm                10000 non-null  float64
 4   chloride_mg_L            10000 non-null  float64
 5   dissolved_O2_ppm         10000 non-null  float64
 6   TAN_mgKOH_g              10000 non-null  float64
 7   conductivity_uS_cm       10000 non-null  float64
 8   flow_velocity_m_s        10000 non-null  float64
 9   exposure_hours           10000 non-null  int64  
 10  surface_roughness_Ra_um  10000 non-null  float64
 11  inhibitor_present        10000 non-null  int64  
 12  pH                       10000 non-null  float64
 13  corrosion_rate_mm_per_y  10000 non-null  float64
dtypes: float64(10), int64(3

In [4]:
# Label Encoding the 'material' column
le = LabelEncoder()
df['material'] = le.fit_transform(df['material'])


In [5]:
# Splitting into independent and dependent variables
X = df.iloc[:, :-1]     # all columns except last
y = df.iloc[:, -1]      # last column is target


In [6]:
# Splitting the dataset into training and testing sets (80–20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [7]:
# Linear Regression Model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions
lr_pred = lr.predict(X_test)

# Metrics
lr_r2 = r2_score(y_test, lr_pred)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_pred))

print("Linear Regression Results:")
print("R² Score:", lr_r2)
print("RMSE:", lr_rmse)


Linear Regression Results:
R² Score: 0.26410934460519864
RMSE: 0.015590564607538993


In [8]:
# Ridge Regression Model
ridge = Ridge()
ridge.fit(X_train, y_train)

# Predictions
ridge_pred = ridge.predict(X_test)

# Metrics
ridge_r2 = r2_score(y_test, ridge_pred)
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))

print("Ridge Regression Results:")
print("R² Score:", ridge_r2)
print("RMSE:", ridge_rmse)


Ridge Regression Results:
R² Score: 0.26407153384113835
RMSE: 0.015590965131437253


In [9]:
# Lasso Regression Model
lasso = Lasso()
lasso.fit(X_train, y_train)

# Predictions
lasso_pred = lasso.predict(X_test)

# Metrics
lasso_r2 = r2_score(y_test, lasso_pred)
lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))

print("Lasso Regression Results:")
print("R² Score:", lasso_r2)
print("RMSE:", lasso_rmse)


Lasso Regression Results:
R² Score: -0.0023445853654151527
RMSE: 0.018195488661998105


In [10]:
# Gradient Boosting Regressor (400-tree ensemble)
gbr = GradientBoostingRegressor(
    n_estimators=400,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

gbr.fit(X_train, y_train)

# Predictions
gbr_pred = gbr.predict(X_test)

# Metrics
gbr_r2 = r2_score(y_test, gbr_pred)
gbr_rmse = np.sqrt(mean_squared_error(y_test, gbr_pred))
gbr_mae = mean_absolute_error(y_test, gbr_pred)

print("Gradient Boosting Results (400 Trees):")
print("R² Score:", gbr_r2)
print("RMSE:", gbr_rmse)
print("MAE:", gbr_mae)


Gradient Boosting Results (400 Trees):
R² Score: 0.8732667196611449
RMSE: 0.006469944233341715
MAE: 0.0036021862338662326


In [11]:
# AdaBoost Regressor
ada = AdaBoostRegressor(random_state=42)
ada.fit(X_train, y_train)

# Predictions
ada_pred = ada.predict(X_test)

# Metrics
ada_r2 = r2_score(y_test, ada_pred)
ada_rmse = np.sqrt(mean_squared_error(y_test, ada_pred))
ada_mae = mean_absolute_error(y_test, ada_pred)

print("AdaBoost Results:")
print("R² Score:", ada_r2)
print("RMSE:", ada_rmse)
print("MAE:", ada_mae)


AdaBoost Results:
R² Score: 0.6197115655915131
RMSE: 0.011207577689579539
MAE: 0.009717084608891998


In [12]:
# Comparing all model performances
results = {
    "Model": ["Linear Regression", "Ridge Regression", "Lasso Regression",
              "Gradient Boosting (400 Trees)", "AdaBoost"],
    
    "R² Score": [lr_r2, ridge_r2, lasso_r2, gbr_r2, ada_r2],
    "RMSE":     [lr_rmse, ridge_rmse, lasso_rmse, gbr_rmse, ada_rmse],
    "MAE":      ["-", "-", "-", gbr_mae, ada_mae]   # MAE only for Boosting models
}

results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Model,R² Score,RMSE,MAE
0,Linear Regression,0.264109,0.015591,-
1,Ridge Regression,0.264072,0.015591,-
2,Lasso Regression,-0.002345,0.018195,-
3,Gradient Boosting (400 Trees),0.873267,0.00647,0.003602
4,AdaBoost,0.619712,0.011208,0.009717


In [13]:
# Final Interpretation Summary

print("INTERPRETATION SUMMARY:\n")

print("- Linear, Ridge, and Lasso regression models give basic baseline performance,")
print("  but they cannot fully capture the nonlinear relationships in the corrosion dataset.\n")

print("- Gradient Boosting (400 trees) performs the best overall with highest R² and")
print("  the lowest RMSE/MAE values. This indicates strong predictive power and the")
print("  ability to model complex interactions.\n")

print("- AdaBoost performs well but slightly below Gradient Boosting, making it a strong")
print("  secondary model.\n")

print("Conclusion: Gradient Boosting is the most effective model for predicting corrosion_rate_mm_per_y.")


INTERPRETATION SUMMARY:

- Linear, Ridge, and Lasso regression models give basic baseline performance,
  but they cannot fully capture the nonlinear relationships in the corrosion dataset.

- Gradient Boosting (400 trees) performs the best overall with highest R² and
  the lowest RMSE/MAE values. This indicates strong predictive power and the
  ability to model complex interactions.

- AdaBoost performs well but slightly below Gradient Boosting, making it a strong
  secondary model.

Conclusion: Gradient Boosting is the most effective model for predicting corrosion_rate_mm_per_y.
