In [8]:

# 1. IMPORT LIBRARIES

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.multiclass import unique_labels

import joblib
import warnings
warnings.filterwarnings('ignore')


# 2. LOAD DATA

# Read the raw CSV into the working frame used by every later section,
# then echo its dimensions and first rows as a quick sanity check.
df = pd.read_csv("load_data.csv")
frame_shape = df.shape
first_rows = df.head()
print("Loaded data. Shape:", frame_shape)
print(first_rows)

# List the headers exactly as they appear in the file, since later sections
# index columns by these literal names.
print("\n Actual column names in CSV:", df.columns.tolist(), sep="\n")


# 3. DATA OVERVIEW

# Structural summary, per-column missing-value counts and descriptive
# statistics of the freshly loaded frame.
print("\n Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
df.info()
print("\n Missing values:\n", df.isnull().sum())
print("\n Summary statistics:\n", df.describe())


# 4. EDA: CLASS DISTRIBUTION

# Count plot of rows per Load_Type value, written to a PNG. The figure is
# closed afterwards so it does not linger as the current figure.
class_fig, class_ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Load_Type', ax=class_ax)
class_ax.set_title("Load Type Distribution")
class_fig.savefig("eda_load_type_distribution.png")
plt.close(class_fig)


# 5. EDA: FEATURE DISTRIBUTIONS

# Columns whose histograms are drawn; names must match the CSV headers
# exactly (note the '.' in the lagging reactive-power column).
features = [
    'Usage_kWh',
    'Lagging_Current_Reactive.Power_kVarh',
    'Leading_Current_Reactive_Power_kVarh',
    'CO2(tCO2)',
    'NSM',
]

# One histogram panel per listed column, all on a single figure that is
# saved to disk and then closed.
df.hist(column=features, bins=20, figsize=(12, 8))
plt.tight_layout()
plt.savefig("eda_feature_distributions.png")
plt.close()


# 6. CONVERT DATE + FEATURE ENGINEERING

# Parse the day-month-year timestamp strings once, derive the calendar year
# and month from the parsed values, and order the rows chronologically.
parsed_timestamps = pd.to_datetime(df['Date_Time'], format="%d-%m-%Y %H:%M")
df = df.assign(
    Date=parsed_timestamps,
    Year=parsed_timestamps.dt.year,
    Month=parsed_timestamps.dt.month,
).sort_values('Date')


# 7. LABEL ENCODING (TARGET)

# Fit the encoder on the target column, store the integer codes alongside
# the original strings, and print the class-name -> code lookup.
le = LabelEncoder().fit(df['Load_Type'])
df['Load_Type_Label'] = le.transform(df['Load_Type'])
label_mapping = {
    class_name: class_code
    for class_name, class_code in zip(le.classes_, le.transform(le.classes_))
}
print("\n Label mapping:", label_mapping)


# 8. TRAIN-TEST SPLIT (LAST DATE = TEST SET)

# 'Date' is a full timestamp (it is parsed with an hour:minute component), so
# comparing it to its maximum with == selects only the rows sharing that one
# final instant. To hold out the whole last calendar date, both sides are
# truncated to midnight before comparing.
last_date = df['Date'].max()
print(" Last date in dataset:", last_date)

last_day = last_date.normalize()
row_day = df['Date'].dt.normalize()

test_df = df[row_day == last_day]
train_df = df[row_day < last_day]

# Fail early with a clear message instead of a less obvious error at fit time.
if train_df.empty:
    raise ValueError(
        "Training set is empty: every row falls on the last date in the dataset."
    )

print(" Train shape:", train_df.shape)
print(" Test shape:", test_df.shape)


# 9. FEATURES AND SCALING

# Model inputs: the measured columns plus the derived Year and Month.
feature_cols = [
    'Usage_kWh',
    'Lagging_Current_Reactive.Power_kVarh',
    'Leading_Current_Reactive_Power_kVarh',
    'CO2(tCO2)',
    'NSM',
    'Year',
    'Month',
]

# The scaler's statistics come from the training rows only; the same fitted
# transform is then applied to both partitions.
train_features = train_df[feature_cols]
test_features = test_df[feature_cols]

scaler = StandardScaler().fit(train_features)
X_train = scaler.transform(train_features)
X_test = scaler.transform(test_features)

# Integer-encoded targets for each partition.
y_train, y_test = (part['Load_Type_Label'] for part in (train_df, test_df))


# 10. MODEL TRAINING (Random Forest)

# Fixed seed so repeated runs build the same forest; fit() returns the
# estimator itself, so construction and training are chained.
forest_settings = {"n_estimators": 100, "random_state": 42}
clf = RandomForestClassifier(**forest_settings).fit(X_train, y_train)

# Predicted class codes for the held-out rows.
y_pred = clf.predict(X_test)


# 11. MODEL EVALUATION (WITH SAFE CLASS HANDLING)

# Only the class codes that occur in y_test or y_pred are reported, so the
# label list and the display names passed to the metrics always have the
# same length even when some classes are absent from the test set.
labels_in_test = unique_labels(y_test, y_pred)
target_names_in_test = [le.classes_[i] for i in labels_in_test]

print("\n Classification Report:\n")
print(classification_report(y_test, y_pred, labels=labels_in_test, target_names=target_names_in_test))

print(" Confusion Matrix:\n")
cm = confusion_matrix(y_test, y_pred, labels=labels_in_test)
# Print the matrix under its header (rows = actual, columns = predicted);
# previously the header was emitted with nothing after it.
print(pd.DataFrame(cm, index=target_names_in_test, columns=target_names_in_test))

# Draw the heatmap on an explicitly created figure so the saved image does
# not depend on whichever figure happens to be current.
cm_fig, cm_ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names_in_test,
            yticklabels=target_names_in_test,
            ax=cm_ax)
cm_ax.set_title("Confusion Matrix")
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
cm_fig.savefig("confusion_matrix.png")
plt.close(cm_fig)


# 12. SAVE MODEL (OPTIONAL)

# Persist the fitted classifier and the fitted scaler, each to its own file
# in the current working directory.
for fitted_object, output_path in (
    (clf, "random_forest_model.pkl"),
    (scaler, "scaler.pkl"),
):
    joblib.dump(fitted_object, output_path)
print("\n Model and scaler saved to disk.")









Loaded data. Shape: (35041, 9)
          Date_Time  Usage_kWh  Lagging_Current_Reactive.Power_kVarh  \
0  01-01-2018 00:15   8.753692                                  2.95   
1  01-01-2018 00:30   4.000000                                  4.46   
2  01-01-2018 00:45   3.240000                                  3.28   
3  01-01-2018 01:00   3.310000                                  3.56   
4  01-01-2018 01:15   3.820000                                  4.50   

   Leading_Current_Reactive_Power_kVarh  CO2(tCO2)  \
0                                   0.0        0.0   
1                                   0.0        0.0   
2                                   0.0        0.0   
3                                   0.0        0.0   
4                                   0.0        0.0   

   Lagging_Current_Power_Factor  Leading_Current_Power_Factor          NSM  \
0                     73.210000                         100.0   900.000000   
1                     66.770000                        

In [3]:
# Echo the CSV headers on their own line beneath a short caption.
# Relies on `df` having been loaded by an earlier cell.
print("Column names in your CSV:", df.columns.tolist(), sep="\n")

Column names in your CSV:
['Date_Time', 'Usage_kWh', 'Lagging_Current_Reactive.Power_kVarh', 'Leading_Current_Reactive_Power_kVarh', 'CO2(tCO2)', 'Lagging_Current_Power_Factor', 'Leading_Current_Power_Factor', 'NSM', 'Load_Type']
