# 1. Load the Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Location of the raw dataset (presumably a Colab session path — adjust when
# running elsewhere).
DATA_PATH = '/content/5G_energy_consumption_dataset.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the raw frame.
df.head()

Unnamed: 0,Time,BS,Energy,load,ESMODE,TXpower
0,20230101 010000,B_0,64.275037,0.487936,0.0,7.101719
1,20230101 020000,B_0,55.904335,0.344468,0.0,7.101719
2,20230101 030000,B_0,57.698057,0.193766,0.0,7.101719
3,20230101 040000,B_0,55.156951,0.222383,0.0,7.101719
4,20230101 050000,B_0,56.053812,0.175436,0.0,7.101719


In [None]:
# Column dtypes, non-null counts and memory usage of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92629 entries, 0 to 92628
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Time     92629 non-null  object 
 1   BS       92629 non-null  object 
 2   Energy   92629 non-null  float64
 3   load     92629 non-null  float64
 4   ESMODE   92629 non-null  float64
 5   TXpower  92629 non-null  float64
dtypes: float64(4), object(2)
memory usage: 4.2+ MB


# 2. Data Pre-Processing & EDA

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Energy,load,ESMODE,TXpower
count,92629.0,92629.0,92629.0,92629.0
mean,28.138997,0.244705,0.081361,6.765427
std,13.934645,0.234677,0.382317,0.309929
min,0.747384,0.0,0.0,5.381166
25%,18.236173,0.05737,0.0,6.427504
50%,24.06577,0.16555,0.0,6.875934
75%,35.724963,0.363766,0.0,6.875934
max,100.0,0.993957,4.0,8.375336


In [None]:
# Check for missing data: number of null entries per column.
df.isnull().sum()

Unnamed: 0,0
Time,0
BS,0
Energy,0
load,0
ESMODE,0
TXpower,0


In [None]:
# Check for duplicates: number of rows that exactly repeat an earlier row.
df.duplicated().sum()

np.int64(0)

In [None]:
# Count, for every numeric column, the rows lying outside the fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
numeric_cols = df.select_dtypes(include='number')

for col in numeric_cols.columns:
    series = df[col]
    Q1, Q3 = series.quantile(0.25), series.quantile(0.75)
    IQR = Q3 - Q1
    below = series < Q1 - 1.5 * IQR
    above = series > Q3 + 1.5 * IQR
    outliers = df[below | above]
    print(f"{col}: {len(outliers)} outliers")

Energy: 2192 outliers
load: 2974 outliers
ESMODE: 5154 outliers
TXpower: 1169 outliers


In [None]:
def drop_outliers_iqr(df, factor=1.5, columns=None):
    """Return the rows of ``df`` that lie inside the IQR fences of each column.

    For every selected column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` (both inclusive). Columns are processed one after
    another, and each column's quartiles are computed on the rows that
    survived the previous columns, so the result can depend on column order.

    Rows whose value in a selected column is NaN are dropped, because the
    comparisons against the fences evaluate to False for NaN.

    Caveat: when a column's IQR is 0 (Q1 == Q3) both fences collapse to that
    single value and every row with a different value is removed. Pass
    ``columns`` explicitly to leave such columns out.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a filtered frame is returned.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.
    columns : iterable of column labels, optional
        Columns to filter on. Defaults to all numeric columns of ``df``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with their original index labels.
    """
    if columns is None:
        columns = df.select_dtypes(include='number').columns
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - factor * IQR
        upper_bound = Q3 + factor * IQR
        # Rebinding the local name leaves the caller's frame untouched.
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df

# Apply to your data: every numeric column is filtered in turn. `df` itself is
# not modified; the surviving rows are bound to `data_clean`.
# NOTE(review): describe() above reports ESMODE's 25% and 75% quantiles as 0.0,
# so its IQR is presumably still 0 when that column is filtered and only rows
# with ESMODE == 0 are kept — confirm that dropping all non-zero ESMODE rows
# is intended.
data_clean = drop_outliers_iqr(df)


In [None]:
# Inspect the filtered frame (row count, dtypes, non-null counts).
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81228 entries, 1 to 92628
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Time     81228 non-null  object 
 1   BS       81228 non-null  object 
 2   Energy   81228 non-null  float64
 3   load     81228 non-null  float64
 4   ESMODE   81228 non-null  float64
 5   TXpower  81228 non-null  float64
dtypes: float64(4), object(2)
memory usage: 4.3+ MB


In [None]:
# Encode categorical features: one indicator column per distinct Time and BS value.
# NOTE(review): the modelling cell further down re-encodes `data_clean` itself,
# so `df_encoded` appears to be used only for this preview — confirm.
df_encoded = pd.get_dummies(data_clean, columns=['Time', 'BS'])
print(df_encoded.head())

       Energy      load  ESMODE   TXpower  Time_20230101 010000  \
1   55.904335  0.344468     0.0  7.101719                 False   
2   57.698057  0.193766     0.0  7.101719                 False   
3   55.156951  0.222383     0.0  7.101719                 False   
4   56.053812  0.175436     0.0  7.101719                 False   
20  55.754858  0.210952     0.0  7.101719                 False   

    Time_20230101 020000  Time_20230101 030000  Time_20230101 040000  \
1                   True                 False                 False   
2                  False                  True                 False   
3                  False                 False                  True   
4                  False                 False                 False   
20                 False                 False                 False   

    Time_20230101 050000  Time_20230101 060000  ...  BS_B_990  BS_B_991  \
1                  False                 False  ...     False     False   
2             

In [None]:
#create a profling report on the data
!pip install ydata-profiling
from ydata_profiling  import ProfileReport
prof=ProfileReport(data_clean)
prof.to_file(output_file='output.html')



Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:00<00:01,  4.84it/s][A
100%|██████████| 6/6 [00:00<00:00,  8.83it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Number of rows remaining after outlier removal.
data_clean.shape[0]

81228

# 3. Build the Model & Evaluate

In [None]:

# StandardScaler and RandomForestRegressor are not part of the notebook's first
# import cell, so they are imported here. Everything else used below (pd, np,
# train_test_split, mean_squared_error, r2_score) is already imported there.
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Target and features
y = data_clean['Energy']
X = pd.get_dummies(data_clean.drop('Energy', axis=1))  # One-hot encode the object columns (Time, BS)

# Handle missing values (column means; a no-op when there are no nulls)
X = X.fillna(X.mean())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features: fit on the training split only, then reuse for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest model
model = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42)  # Fewer trees, limited depth
model.fit(X_train_scaled, y_train)

# Make predictions
predictions = model.predict(X_test_scaled)

# Evaluate. RMSE is computed here so the cell runs on a fresh kernel instead of
# relying on a variable that is only assigned in a later cell.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Squared Error (MSE):", mse)
print("R-squared Score:", r2_score(y_test, predictions))

Root Mean Squared Error (RMSE): 4.884170618239621
Mean Squared Error (MSE): 23.855122628075208
R-squared Score: 0.821135012564407


In [None]:
# RMSE is the square root of the test-set MSE, so it is expressed in the same
# units as the Energy target.
root_mean_squared_error = np.sqrt(mean_squared_error(y_test, predictions))
print(root_mean_squared_error)

4.884170618239621


In [None]:
# Predict energy consumption (the `Energy` target) for a new observation

new_data = pd.DataFrame({
    # Time must be a string in the same 'YYYYMMDD HHMMSS' format as the training
    # data; an integer would stay numeric in get_dummies and then be dropped
    # by the reindex below, so the timestamp would be silently ignored.
    'Time': ['20230103 140000'],
    'BS': ['B_0'],
    'load': [0.999],
    'ESMODE': [0],
    'TXpower': [8.50432]
})
new_data = pd.get_dummies(new_data)  # Encode categorical features

# Indicator columns for categories the model never saw are dropped by the
# reindex; surface that instead of ignoring it silently.
known_columns = set(X.columns)
unseen = [c for c in new_data.columns if c not in known_columns]
if unseen:
    print("Warning: columns not seen during training are ignored:", unseen)

new_data = new_data.reindex(columns=X.columns, fill_value=0)  # Align columns with training data
new_data_scaled = scaler.transform(new_data)  # Scale the new data
new_prediction = model.predict(new_data_scaled)  # Predict energy consumption
print("Predicted Energy Consumption for New Data:", new_prediction[0])

Predicted Energy Price for New Data: 29.32516193209667


In [None]:
# R-squared of the fitted RandomForestRegressor on the held-out test split.
model.score(X_test_scaled, y_test)

0.821135012564407