# Electricity Market Forecasting Challenge

## Student solution for Data Science Competition

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
# Load datasets
# The data directory defaults to the Colab upload location used so far and can
# be redirected with the DATA_DIR environment variable, so the notebook can run
# elsewhere without editing this cell.
DATA_DIR = Path(os.environ.get('DATA_DIR', '/content'))
public_data_path = DATA_DIR / 'public_data.csv'
sample_submission_path = DATA_DIR / 'minta_beadando_rand.csv'

# public_data: feature/target table; sample_submission: rowIDs to predict plus
# a placeholder prediction column.
public_data = pd.read_csv(public_data_path)
sample_submission = pd.read_csv(sample_submission_path)

In [None]:
# Preview the first rows of the loaded table to check the column layout.
public_data.head()

Unnamed: 0,rowID,season,periodID,day_in_period,hour,minute,holyday,weekday,ke,hupx,afrr_fel,afrr_le,mfrr_fel,mfrr_le,afrr,solar_becsult_dayahead,rendszerterheles_terv,target_flag,target_reg
0,0,1,18,0,0,0,False,2,899.343121,287.7194,0.0,-103.196933,0.0,0.0,-212.845947,0.0,6514.140945,0.0,-100.188708
1,1,1,18,0,0,15,False,2,899.374587,287.7194,55.9286,-5.6172,0.0,0.0,263.42716,0.0,6482.184545,0.0,-100.292441
2,2,1,18,0,0,30,False,2,899.303875,287.7194,69.6826,-9.3558,0.0,0.0,273.92732,0.0,6310.363525,0.0,-100.059327
3,3,1,18,0,0,45,False,2,899.299787,287.7194,30.2718,-16.943567,0.0,0.0,132.62104,0.0,6158.798885,0.0,-100.04585
4,4,1,18,0,1,0,False,2,897.436947,283.6309,25.9714,-12.064167,0.0,0.0,151.88808,0.0,6137.148085,0.0,-104.954619


In [None]:
# Display basic information about the dataset: row count, column dtypes and
# non-null counts (the latter show which columns contain missing values).
public_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89760 entries, 0 to 89759
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   rowID                   89760 non-null  int64  
 1   season                  89760 non-null  int64  
 2   periodID                89760 non-null  int64  
 3   day_in_period           89760 non-null  int64  
 4   hour                    89760 non-null  int64  
 5   minute                  89760 non-null  int64  
 6   holyday                 89760 non-null  bool   
 7   weekday                 89760 non-null  int64  
 8   ke                      71808 non-null  float64
 9   hupx                    71808 non-null  float64
 10  afrr_fel                71808 non-null  float64
 11  afrr_le                 71808 non-null  float64
 12  mfrr_fel                71808 non-null  float64
 13  mfrr_le                 71808 non-null  float64
 14  afrr                    71808 non-null

In [None]:
# Row count, dtypes and non-null counts of the submission template.
sample_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17952 entries, 0 to 17951
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   rowID       17952 non-null  int64  
 1   prediction  17952 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 280.6 KB


In [None]:
# Preview the submission template: one rowID and one prediction value per row.
sample_submission.head()

Unnamed: 0,rowID,prediction
0,384,0.37454
1,385,0.950714
2,386,0.731994
3,387,0.598658
4,388,0.156019


In [None]:
# Names of the two target columns. They are excluded when the feature columns
# with missing values are collected for interpolation.
# (The variable keeps its original spelling because other cells reference it.)
target_colums = ['target_flag', 'target_reg']

In [None]:
# Feature columns (targets excluded) that contain at least one missing value,
# listed in the frame's column order.
feature_has_nan = public_data.drop(columns=target_colums, errors='ignore').isna().any()
nan_columns = feature_has_nan[feature_has_nan].index.tolist()
nan_columns

['ke', 'hupx', 'afrr_fel', 'afrr_le', 'mfrr_fel', 'mfrr_le', 'afrr']

In [None]:
# Fill the gaps of every incomplete feature column by linear interpolation,
# separately inside each period so values never bleed across period borders.
# limit_direction='both' means leading and trailing gaps are filled as well.
# transform() returns a result aligned to the original row index, so no index
# repair is needed afterwards (groupby.apply adds or omits a group-key index
# level depending on the pandas version).
for col in nan_columns:
    public_data[col] = public_data.groupby('periodID')[col].transform(
        lambda s: s.interpolate(method='linear', limit_direction='both')
    )

In [None]:
# Count the remaining missing values per column to verify the interpolation.
public_data.isna().sum()

Unnamed: 0,0
rowID,0
season,0
periodID,0
day_in_period,0
hour,0
minute,0
holyday,0
weekday,0
ke,0
hupx,0


In [None]:
# Aggregated statistics for the past 4 days
# Per-period mean / max / min / std of the market signals, attached to every
# row of the period as '<column>_<statistic>' features.
agg_columns = ['ke', 'hupx', 'afrr_fel', 'afrr_le', 'mfrr_fel', 'mfrr_le', 'afrr']
agg_functions = ['mean', 'max', 'min', 'std']

# Restrict the statistics to day_in_period 0-3 so that values imputed for the
# remaining day do not distort them.
# NOTE(review): assumes days 0-3 are the observed days (the training cell uses
# the same `day_in_period < 4` cut-off) — confirm.
observed_days = public_data[public_data['day_in_period'] < 4]

aggregates = observed_days.groupby('periodID')[agg_columns].agg(agg_functions)
aggregates.columns = ['_'.join(col) for col in aggregates.columns]
aggregates = aggregates.reset_index()

# Drop aggregate columns left over from an earlier run of this cell; merging
# again without this would create '_x' / '_y' duplicates.
stale_columns = [
    col for col in aggregates.columns
    if col != 'periodID' and col in public_data.columns
]

# Merge aggregates back to the main dataset
public_data = public_data.drop(columns=stale_columns).merge(
    aggregates, on='periodID', how='left'
)

# Group handle built on the merged frame, kept for cells that reference `grouped`.
grouped = public_data.groupby('periodID')


In [None]:
# Create lag features for numerical columns
lag_features = ['ke', 'hupx', 'afrr_fel', 'afrr_le', 'mfrr_fel', 'mfrr_le', 'afrr']

# The table has one row per quarter hour (minute = 0, 15, 30, 45), so a day is
# 4 * 24 = 96 rows; shifting by multiples of 24 rows would only reach back
# 6 hours per step instead of one day.
ROWS_PER_DAY = 4 * 24

# Group on the current frame so the shifted values line up with its row index.
# Assumes rows are in chronological order within each period.
period_groups = public_data.groupby('periodID')
for col in lag_features:
    for lag in range(1, 5):  # Lags for the past 4 days
        public_data[f'{col}_lag{lag}'] = period_groups[col].shift(lag * ROWS_PER_DAY)

# Fill any new NaN values introduced by lagging
# NOTE(review): this fills every remaining NaN in the frame with 0, not only
# the lag columns — confirm that is intended for all columns.
public_data = public_data.fillna(0)


In [None]:
# Rows with day_in_period below 4 are used for fitting; the rows to score are
# those whose rowID is listed in the sample submission.
non_feature_columns = ['target_flag', 'target_reg', 'rowID', 'day_in_period']

is_training_row = public_data['day_in_period'] < 4
is_submission_row = public_data['rowID'].isin(sample_submission['rowID'])
train_data = public_data.loc[is_training_row]
predict_rows = public_data.loc[is_submission_row]

# Training design matrix and the classification target
X_train = train_data.drop(columns=non_feature_columns)
y_train = train_data['target_flag']

# Design matrix and identifiers of the rows to predict
X_test = predict_rows.drop(columns=non_feature_columns)
test_rowIDs = predict_rows['rowID']


In [None]:
# Hold out 20% of the training rows (row-wise shuffle, fixed seed) for validation.
# NOTE(review): rows of the same period end up on both sides of this split while
# sharing period-level aggregate features, so the scores below may be
# optimistic — consider a period- or day-based hold-out.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, random_state=42, test_size=0.2
)

# Random forest with restricted tree growth
rf_settings = {
    'n_estimators': 100,
    'max_depth': 10,          # cap on tree depth
    'min_samples_split': 10,  # a node needs at least this many samples to split
    'min_samples_leaf': 5,    # every leaf keeps at least this many samples
    'random_state': 42,
}
clf = RandomForestClassifier(**rf_settings)
clf.fit(X_train_split, y_train_split)

# Score the hold-out rows using the probability column at index 1
y_val_pred_proba = clf.predict_proba(X_val)[:, 1]
validation_auc = roc_auc_score(y_val, y_val_pred_proba)
validation_log_loss = log_loss(y_val, y_val_pred_proba)

print("Validation AUC:", validation_auc)
print("Validation Log Loss:", validation_log_loss)

Validation AUC: 0.9965141478485023
Validation Log Loss: 0.12010606268919756


In [None]:
# Predict probabilities for the test set
y_test_pred_proba = clf.predict_proba(X_test)[:, 1]

# Prepare submission file
# The path is defined once so the confirmation message always names the file
# that was actually written (file names are case-sensitive on most systems).
submission_path = 'Submission01.csv'
submission = pd.DataFrame({'rowID': test_rowIDs, 'prediction': y_test_pred_proba})
submission.to_csv(submission_path, index=False)
print("Submission file created:", submission_path)

Submission file created: submission01.csv


# Improving the first model

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [None]:

# Standardise the features ahead of PCA. The scaling statistics are learned
# from the training split only and then reused for validation and test rows.
scaler = StandardScaler().fit(X_train_split)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train_split, X_val, X_test)
)

In [None]:
# Set number of components
num_components = 20

# Apply PCA to training data
# random_state pins the randomized SVD solver that PCA may pick automatically,
# so re-running the notebook yields the same components.
pca = PCA(n_components=num_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
# Validation and test rows are projected with the components fitted above.
X_val_pca = pca.transform(X_val_scaled)
X_test_pca = pca.transform(X_test_scaled)

print("Original shape of X_train_split:", X_train_split.shape)
print("Shape of X_train_pca after PCA:", X_train_pca.shape)
print("Explained Variance Ratio:", pca.explained_variance_ratio_)


Original shape of X_train_split: (57446, 71)
Shape of X_train_pca after PCA: (57446, 20)
Explained Variance Ratio: [0.17005822 0.06855429 0.06344194 0.04752999 0.04470712 0.03391271
 0.03012499 0.02932554 0.02777304 0.02718388 0.02623968 0.02430997
 0.02203096 0.01994921 0.01861296 0.01599139 0.01520337 0.01510518
 0.01493345 0.0146758 ]


In [None]:
# Train a Random Forest Classifier using the PCA data
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_train_pca, y_train_split)

# Validate the model
y_val_pred_proba = clf.predict_proba(X_val_pca)[:, 1]
validation_auc = roc_auc_score(y_val, y_val_pred_proba)
validation_log_loss = log_loss(y_val, y_val_pred_proba)

print("Validation AUC:", validation_auc)
print("Validation Log Loss:", validation_log_loss)

# Predict probabilities for the test set
y_test_pred_proba = clf.predict_proba(X_test_pca)[:, 1]

# Prepare submission file
# The path is defined once so the confirmation message always names the file
# that was actually written (file names are case-sensitive on most systems).
submission_path = 'Submission02.csv'
submission = pd.DataFrame({'rowID': test_rowIDs, 'prediction': y_test_pred_proba})
submission.to_csv(submission_path, index=False)
print("Submission file created:", submission_path)

Validation AUC: 0.9373596808758325
Validation Log Loss: 0.3222593917203982
Submission file created: submission02.csv


In [None]:

# End-to-end rerun of the PCA pipeline in a single cell: rebuild the test
# matrix and the validation split, scale, project with PCA, fit a random
# forest, validate it and write a submission file.

# Test set for prediction
X_test = predict_rows.drop(columns=['target_flag', 'target_reg', 'rowID', 'day_in_period'])
test_rowIDs = predict_rows['rowID']

# Split training data for validation
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)


# --------------------------------------
# PCA Implementation
# --------------------------------------

# Scaling the data before PCA
# The scaler is fitted on the training split only and reused for the other sets.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_split)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


# Set number of components
num_components = 20

# Apply PCA to training data
# random_state pins the randomized SVD solver that PCA may pick automatically,
# so re-running the notebook yields the same components.
pca = PCA(n_components=num_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)
X_test_pca = pca.transform(X_test_scaled)

print("Original shape of X_train_split:", X_train_split.shape)
print("Shape of X_train_pca after PCA:", X_train_pca.shape)
print("Explained Variance Ratio:", pca.explained_variance_ratio_)



# Train a Random Forest Classifier using the PCA data
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_train_pca, y_train_split)

# Validate the model
y_val_pred_proba = clf.predict_proba(X_val_pca)[:, 1]
validation_auc = roc_auc_score(y_val, y_val_pred_proba)
validation_log_loss = log_loss(y_val, y_val_pred_proba)

print("Validation AUC:", validation_auc)
print("Validation Log Loss:", validation_log_loss)

# Predict probabilities for the test set
y_test_pred_proba = clf.predict_proba(X_test_pca)[:, 1]

# Prepare submission file
# The path is defined once so the confirmation message always names the file
# that was actually written (file names are case-sensitive on most systems).
submission_path = 'Submission20.csv'
submission = pd.DataFrame({'rowID': test_rowIDs, 'prediction': y_test_pred_proba})
submission.to_csv(submission_path, index=False)
print("Submission file created:", submission_path)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89760 entries, 0 to 89759
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   rowID                   89760 non-null  int64  
 1   season                  89760 non-null  int64  
 2   periodID                89760 non-null  int64  
 3   day_in_period           89760 non-null  int64  
 4   hour                    89760 non-null  int64  
 5   minute                  89760 non-null  int64  
 6   holyday                 89760 non-null  bool   
 7   weekday                 89760 non-null  int64  
 8   ke                      71808 non-null  float64
 9   hupx                    71808 non-null  float64
 10  afrr_fel                71808 non-null  float64
 11  afrr_le                 71808 non-null  float64
 12  mfrr_fel                71808 non-null  float64
 13  mfrr_le                 71808 non-null  float64
 14  afrr                    71808 non-null

Can we optimize the hyperparameters further?


Before (settings of the first model):

model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,  # Limit tree depth
    min_samples_split=10,  # Increase minimum samples to split
    min_samples_leaf=5,  # Increase minimum samples in leaf
    random_state=42
)

After (tuned settings):

model = RandomForestClassifier(
    n_estimators=200,  # Increased number of trees
    max_depth=15,  # Allow deeper trees for more complexity
    min_samples_split=15,  # Further increase minimum samples to split
    min_samples_leaf=8,  # Further increase minimum samples in leaf
    max_features='sqrt',  # Use square root of features for better generalization
    bootstrap=True,  # Enable bootstrapping for robustness
    random_state=42
)

In [None]:
# Fit a random forest with the tuned settings on the PCA components
tuned_rf_params = dict(
    n_estimators=200,      # number of trees
    max_depth=15,          # cap on tree depth
    min_samples_split=15,  # a node needs at least this many samples to split
    min_samples_leaf=8,    # every leaf keeps at least this many samples
    max_features='sqrt',   # features considered per split
    bootstrap=True,        # each tree is grown on a bootstrap sample
    random_state=42,
)
clf = RandomForestClassifier(**tuned_rf_params)
clf.fit(X_train_pca, y_train_split)


In [None]:
# Evaluate the tuned forest on the hold-out PCA components
y_val_pred_proba = clf.predict_proba(X_val_pca)[:, 1]
validation_auc, validation_log_loss = (
    metric(y_val, y_val_pred_proba) for metric in (roc_auc_score, log_loss)
)

print("Validation AUC:", validation_auc)
print("Validation Log Loss:", validation_log_loss)

# Score the rows to be submitted
y_test_pred_proba = clf.predict_proba(X_test_pca)[:, 1]

# Write the predictions next to their rowIDs
submission = pd.DataFrame({'rowID': test_rowIDs, 'prediction': y_test_pred_proba})
submission.to_csv('Submission03.csv', index=False)
print("Submission file created: Submission03.csv")

Validation AUC: 0.9327358667791505
Validation Log Loss: 0.3328956179231922
Submission file created: Submission03.csv
