In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [11]:
data_path = "Steel_industry_data.csv"
df = pd.read_csv(data_path)

In [23]:
df.head(10)

Unnamed: 0,date,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Day_of_week,Load_Type
0,01/01/2018 00:15,3.17,2.95,0.0,0.0,73.21,100.0,900,Weekday,Monday,Light_Load
1,01/01/2018 00:30,4.0,4.46,0.0,0.0,66.77,100.0,1800,Weekday,Monday,Light_Load
2,01/01/2018 00:45,3.24,3.28,0.0,0.0,70.28,100.0,2700,Weekday,Monday,Light_Load
3,01/01/2018 01:00,3.31,3.56,0.0,0.0,68.09,100.0,3600,Weekday,Monday,Light_Load
4,01/01/2018 01:15,3.82,4.5,0.0,0.0,64.72,100.0,4500,Weekday,Monday,Light_Load
5,01/01/2018 01:30,3.28,3.56,0.0,0.0,67.76,100.0,5400,Weekday,Monday,Light_Load
6,01/01/2018 01:45,3.6,4.14,0.0,0.0,65.62,100.0,6300,Weekday,Monday,Light_Load
7,01/01/2018 02:00,3.6,4.28,0.0,0.0,64.37,100.0,7200,Weekday,Monday,Light_Load
8,01/01/2018 02:15,3.28,3.64,0.0,0.0,66.94,100.0,8100,Weekday,Monday,Light_Load
9,01/01/2018 02:30,3.78,4.72,0.0,0.0,62.51,100.0,9000,Weekday,Monday,Light_Load


In [None]:
print(f"Original data shape: {df.shape}")

Original data shape: (35040, 11)


In [22]:
missing_values = df.isnull().sum()
print("Total missing values per column: ")
print(missing_values)

Total missing values per column: 
date                                    0
Usage_kWh                               0
Lagging_Current_Reactive.Power_kVarh    0
Leading_Current_Reactive_Power_kVarh    0
CO2(tCO2)                               0
Lagging_Current_Power_Factor            0
Leading_Current_Power_Factor            0
NSM                                     0
WeekStatus                              0
Day_of_week                             0
Load_Type                               0
dtype: int64


In [42]:
target = 'Usage_kWh'
X = df.drop(target, axis = 1)
y = df[target]
print(y)

0        3.17
1        4.00
2        3.24
3        3.31
4        3.82
         ... 
35035    3.85
35036    3.74
35037    3.78
35038    3.78
35039    3.67
Name: Usage_kWh, Length: 35040, dtype: float64


In [43]:
X = X.drop('date', axis=1)
numeric_features = X.select_dtypes(include = np.number).columns
categorical_features = X.select_dtypes(include = 'object').columns


In [44]:
print(numeric_features)
print('-' * 60)
print(categorical_features)

Index(['Lagging_Current_Reactive.Power_kVarh',
       'Leading_Current_Reactive_Power_kVarh', 'CO2(tCO2)',
       'Lagging_Current_Power_Factor', 'Leading_Current_Power_Factor', 'NSM'],
      dtype='object')
------------------------------------------------------------
Index(['WeekStatus', 'Day_of_week', 'Load_Type'], dtype='object')


In [None]:
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

In [46]:
print(X.shape)

(35040, 15)


In [47]:
from sklearn.model_selection import train_test_split

# test_size=0.2 means 20% of data is saved for testing
# random_state=42 ensures you get the same split every time you run the code
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (28032, 15)
X_test shape: (7008, 15)


In [48]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [49]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [56]:
svr = SVR()
param_grid = {
    'C': [0.1, 1, 10, 100],        # Regularization parameter
    'kernel': ['linear', 'rbf'],    # Type of SVR kernel
    'gamma': ['scale', 'auto']      # Kernel coefficient (for 'rbf' kernel)
}
grid_search = GridSearchCV(
    estimator=svr, 
    param_grid=param_grid, 
    cv=3, 
    scoring='neg_mean_squared_error',
    n_jobs=1,  # Use all available CPU cores
    verbose=2   # Shows a progress bar
)

In [57]:
grid_search.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   9.5s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   9.2s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   9.3s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  12.6s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  12.3s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  12.1s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=   9.3s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=   9.2s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=   9.3s
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time=  12.1s
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time=  12.0s
[CV] END ......................C=0.1, gamma=auto

0,1,2
,estimator,SVR()
,param_grid,"{'C': [0.1, 1, ...], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf']}"
,scoring,'neg_mean_squared_error'
,n_jobs,1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,100
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [None]:
from sklearn.metrics import mean_squared_error, r2_score
best_svr = grid_search.best_estimator_

y_pred = best_svr.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)

rmse = np.sqrt(mse)

r2 = r2_score(y_test, y_pred)

print("--- Model Evaluation on Test Set ---")
print(f"R-squared (R²): {r2:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print("-" * 30)


--- Model Evaluation on Test Set ---
R-squared (R²): 0.9980
Mean Squared Error (MSE): 2.3101
Root Mean Squared Error (RMSE): 1.5199
------------------------------


In [59]:
print(y_train.describe())

count    28032.000000
mean        27.286074
std         33.375539
min          0.000000
25%          3.200000
50%          4.570000
75%         51.050000
max        157.180000
Name: Usage_kWh, dtype: float64
