In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
import os

# Absolute path of the input CSV. The directory part of this path is reused
# further down as the destination folder for the prediction file.
file_path = 'C:/5. felev/DataScience/Vizsga/public_data.csv'
# Working DataFrame; the steps below add and drop columns on it.
pub = pd.read_csv(file_path)

def assign_data_split(row, max_period_id=None, holdout_periods=5):
    """Label a single row as submission, evaluation or training data.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``'day_in_period'`` and ``'periodID'``.
    max_period_id : number, optional
        Largest ``periodID`` of the dataset. When omitted it is looked up in
        the module-level ``pub`` frame on every call (the legacy behaviour),
        which is slow when the function is applied row by row; callers
        should compute it once and pass it in.
    holdout_periods : int, default 5
        Rows whose ``periodID`` is at least ``max_period_id - holdout_periods``
        go to the evaluation split.

    Returns
    -------
    str
        ``"beadando"`` when ``day_in_period`` equals 4, ``"kiertekelo"`` for
        the remaining rows inside the hold-out range, ``"train"`` otherwise.
    """
    # The day-4 check takes precedence over the period-based hold-out.
    if row['day_in_period'] == 4:
        return "beadando"
    if max_period_id is None:
        max_period_id = pub['periodID'].max()
    if row['periodID'] >= max_period_id - holdout_periods:
        return "kiertekelo"
    return "train"


# Compute the maximum once instead of once per row; DataFrame.apply forwards
# extra keyword arguments to the applied function.
pub['train_test_validation'] = pub.apply(
    assign_data_split,
    axis=1,
    max_period_id=pub['periodID'].max(),
)

# Flag for hours 5..20 (both ends inclusive): 1 inside the range, 0 outside.
pub['hour_category'] = pub['hour'].between(5, 20).astype('int64')

# Planned system load multiplied by the holiday column.
pub['rendszerterheles_holyday_interaction'] = pub['rendszerterheles_terv'].mul(pub['holyday'])

# All shifts below move by ROWS inside each periodID group.
# NOTE(review): this relies on the rows already being in chronological order
# within a period, and a one-row shift is only "the previous/next day" if
# there is one row per day. The hour/minute columns suggest sub-daily rows,
# so please confirm this is the intended lag/lead.
by_period = pub.groupby('periodID')
flag_one_back = by_period['target_flag'].shift(1)
flag_three_back = by_period['target_flag'].shift(3)
solar_one_ahead = by_period['solar_becsult_dayahead'].shift(-1)
load_one_ahead = by_period['rendszerterheles_terv'].shift(-1)

pub['prev_day_target_flag'] = flag_one_back
# Rows with weekday == 0 look three rows back, every other row one row back.
pub['prev_friday_target_flag'] = np.where(
    pub['weekday'] == 0,
    flag_three_back,
    flag_one_back,
)

pub['solar_becsult_dayahead_shifted'] = solar_one_ahead
pub['rendszerterheles_terv_shifted'] = load_one_ahead

# Only the shifted versions are kept; the unshifted originals are removed.
pub.drop(columns=['solar_becsult_dayahead', 'rendszerterheles_terv'], inplace=True)

# Join keys identifying one row: period, day within the period, hour, minute.
marado_valtozok = ['periodID', 'day_in_period', 'hour', 'minute']
# Columns whose value from the preceding day is attached to every row.
elhozandok = [
    'holyday', 'weekday', 'ke', 'hupx',
    'afrr_fel', 'afrr_le', 'mfrr_fel', 'mfrr_le', 'afrr',
]

# Build a lookup table in which every row is re-labelled as belonging to the
# following day, so that joining on the keys returns the values observed one
# day earlier at the same hour/minute. The carried columns get a "_last_day"
# suffix to avoid clashing with the current-day columns.
last_day = (
    pub[marado_valtozok + elhozandok]
    .assign(day_in_period=lambda frame: frame['day_in_period'] + 1)
    .rename(columns={name: f"{name}_last_day" for name in elhozandok})
)

# Left join: rows without a matching previous day keep NaN in the new columns.
egy = pub.merge(last_day, how='left', on=marado_valtozok)

# Model input columns.
bemeno_valtozok = [
    'hour',
    'minute',
    'holyday',
    'weekday',
    'solar_becsult_dayahead_shifted',
    'rendszerterheles_terv_shifted',
    'rendszerterheles_holyday_interaction',
    'holyday_last_day',
    'weekday_last_day',
    'ke_last_day',
    'hupx_last_day',
    'prev_friday_target_flag',
    'season',
]

# One independent copy per split label: "train" is used for fitting,
# "kiertekelo" for scoring and "beadando" for the exported predictions.
train_df, test_df, val_df = (
    egy.loc[egy['train_test_validation'] == split_label].copy()
    for split_label in ("train", "kiertekelo", "beadando")
)

# Force every input column to a numeric dtype; values that cannot be parsed
# become NaN instead of raising.
for split_frame in (train_df, test_df, val_df):
    split_frame[bemeno_valtozok] = split_frame[bemeno_valtozok].apply(
        pd.to_numeric, errors='coerce'
    )


# Fixed hyper-parameters for the gradient-boosted classifier.
xgb_params = {
    'random_state': 42,
    'n_estimators': 300,
    'max_depth': 3,
    'learning_rate': 0.05,
    'colsample_bytree': 0.6,
    'subsample': 0.8,
    'min_child_weight': 3,
    'gamma': 0.2,
}
xgb_best_model = XGBClassifier(**xgb_params)

# Fit on the training split only.
X_train = train_df[bemeno_valtozok]
y_train = train_df['target_flag']
xgb_best_model.fit(X_train, y_train)

# Score the hold-out split: column 1 of predict_proba is the positive class.
test_proba = xgb_best_model.predict_proba(test_df[bemeno_valtozok])
test_df['tipp'] = test_proba[:, 1]
test_auc = roc_auc_score(test_df['target_flag'], test_df['tipp'])
print("ROC AUC Score:", test_auc)

# Hard labels use a 0.5 probability cut-off.
test_df['predicted_class'] = test_df['tipp'].ge(0.5).astype(int)
test_accuracy = accuracy_score(test_df['target_flag'], test_df['predicted_class'])
print("Accuracy on test set:", test_accuracy)

# Positive-class probabilities for the submission rows.
val_proba = xgb_best_model.predict_proba(val_df[bemeno_valtozok])
val_df['prediction'] = val_proba[:, 1]

# Write rowID + prediction next to the input file, without the index column.
beadando = val_df.loc[:, ['rowID', 'prediction']]
output_dir = os.path.dirname(file_path)
output_file_path = os.path.join(output_dir, "megoldas_best_xgboost_shifted.csv")
beadando.to_csv(output_file_path, index=False)


ROC AUC Score: 0.8629821855509872
Accuracy on test set: 0.8172743055555556
