# 1. Load the Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Location of the raw CSV inside the Colab runtime
csv_path = '/content/5G_energy_consumption_dataset.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows (pandas shows five by default) to sanity-check the parsed columns
df.head()

In [None]:
# Print column names, dtypes, non-null counts and memory usage
df.info()

# 2. Data Pre-Processing & EDA

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); for a mixed-dtype
# frame pandas reports the numeric columns only by default
df.describe()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Count fully duplicated rows; the first occurrence of each row is not counted
df.duplicated().sum()

In [None]:
# Report, per numeric column, how many rows fall outside the 1.5 * IQR fences
numeric_cols = df.select_dtypes(include='number')

for col in numeric_cols:
    series = df[col]
    Q1, Q3 = series.quantile(0.25), series.quantile(0.75)
    IQR = Q3 - Q1
    # Build the two one-sided masks separately so each fence is easy to read
    too_low = series < Q1 - 1.5 * IQR
    too_high = series > Q3 + 1.5 * IQR
    outliers = df[too_low | too_high]
    print(f"{col}: {len(outliers)} outliers")

In [None]:
def drop_outliers_iqr(df, factor=1.5, columns=None):
    """Return a copy of *df* without rows outside the IQR fences.

    For each processed column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``; rows whose value lies outside them are removed.

    Columns are processed one after another, and the quartiles of each
    column are computed on the rows that survived the previous columns, so
    the result can depend on column order.

    Rows holding NaN in a processed column are removed as well, because NaN
    never satisfies the range check.

    Args:
        df: Input DataFrame. It is not modified.
        factor: Multiplier applied to the IQR to build the fences.
            Defaults to the conventional 1.5.
        columns: Iterable of column names to filter on. Defaults to every
            numeric column of *df*.

    Returns:
        A DataFrame containing only the rows inside the fences of every
        processed column.
    """
    if columns is None:
        columns = df.select_dtypes(include='number').columns

    for col in columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        margin = factor * (q3 - q1)
        # `between` is inclusive on both ends, i.e. lower <= value <= upper
        df = df[df[col].between(q1 - margin, q3 + margin)]
    return df

# Remove outlier rows from the raw frame; the function returns a filtered
# frame and leaves `df` itself untouched
data_clean = drop_outliers_iqr(df)


In [None]:
# Inspect the row count and dtypes that remain after outlier removal
data_clean.info()

In [None]:
# One-hot encode the categorical columns and preview the result
categorical_cols = ['Time', 'BS']
df_encoded = pd.get_dummies(data_clean, columns=categorical_cols)
print(df_encoded.head())

In [None]:
# Create a profiling report on the cleaned data.
# NOTE: the `!pip` line is IPython shell magic, so this cell only runs inside a notebook.
!pip install ydata-profiling
from ydata_profiling  import ProfileReport
prof=ProfileReport(data_clean)
# Write the report as an HTML file (relative path, so it lands in the working directory)
prof.to_file(output_file='output.html')

In [None]:
# Number of rows left after outlier removal
len(data_clean)

# 3. Build the Model & Evaluate

In [None]:

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target and features
y = data_clean['Energy']
X = pd.get_dummies(data_clean.drop('Energy', axis=1))  # One-hot encode non-numeric columns

# Fill any missing feature values with the column mean
X = X.fillna(X.mean())

# Train-test split (80/20, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features; the scaler is fitted on the training split only and then
# reused unchanged on the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest model (fewer trees and limited depth to keep training cheap)
model = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions on the held-out split
predictions = model.predict(X_test_scaled)

# Evaluate. The metrics are computed before printing: the RMSE used to be
# printed before it was ever assigned, which raised NameError on a fresh run.
mse = mean_squared_error(y_test, predictions)
root_mean_squared_error = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", root_mean_squared_error)
print("Mean Squared Error (MSE):", mse)
print("R-squared Score:", r2_score(y_test, predictions))

In [None]:
# RMSE is the square root of the test-set MSE, i.e. the error in the target's own units
test_mse = mean_squared_error(y_test, predictions)
root_mean_squared_error = np.sqrt(test_mse)
print(root_mean_squared_error)

In [None]:
# Predict the energy value (the model's target is the 'Energy' column) for one new sample

# NOTE(review): 'Time' is an integer here, so get_dummies leaves it as a single
# numeric column. If the training 'Time' column was text and therefore one-hot
# encoded, the reindex below drops this value and every Time_* column becomes 0.
# Confirm the dtype of 'Time' in the training data.
sample = {
    'Time': [20230103140000],
    'BS': ['B_0'],
    'load': [0.999],
    'ESMODE': [0],
    'TXpower': [8.50432],
}

# Encode the sample, then align it to the training feature columns;
# columns missing from the sample are filled with 0
new_data = (
    pd.get_dummies(pd.DataFrame(sample))
    .reindex(columns=X.columns, fill_value=0)
)

# Reuse the scaler fitted on the training data, then predict
new_data_scaled = scaler.transform(new_data)
new_prediction = model.predict(new_data_scaled)
print("Predicted Energy Price for New Data:", new_prediction[0])

In [None]:
# Score the fitted model on the held-out split; for scikit-learn regressors
# `score` returns R^2, the same quantity r2_score reports in the evaluation cell
model.score(X_test_scaled, y_test)