In [309]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [310]:
# Load the three competition files. generation.csv and temperature.csv are
# read with ';' as the field separator; sample_submission.csv uses ','.
df_generation = pd.read_csv("/kaggle/input/enerjisa-enerji-veri-maratonu/generation.csv", sep=';')
df_temperature = pd.read_csv("/kaggle/input/enerjisa-enerji-veri-maratonu/temperature.csv", sep=';')
sample_submission = pd.read_csv("/kaggle/input/enerjisa-enerji-veri-maratonu/sample_submission.csv", sep=',')

In [311]:
# Keep only the generation rows that carry a DateTime; it is the join key below.
df_generation = df_generation.loc[df_generation['DateTime'].notna()]

# Attach the weather columns to the history rows and to the submission rows.
df = df_generation.merge(df_temperature, on='DateTime')
sub = sample_submission.merge(df_temperature, on='DateTime')
# Cast to str — presumably so the later ','->'.' conversion can call
# str.replace on this column like on the others (verify).
sub['Generation'] = sub['Generation'].astype(str)

# Stack history on top of the submission rows, then discard the WWCode column.
df = pd.concat([df, sub], ignore_index=True)
df.drop(columns='WWCode', inplace=True)

def rep_to_float(df, column):
    """Convert a comma-decimal column of ``df`` to float64, in place.

    String cells have ',' replaced by '.' before conversion. Cells that are
    already numeric are passed to ``float`` unchanged, and missing cells
    (None / NaN / pd.NA) become NaN instead of raising AttributeError on
    ``.replace`` as a plain ``x.replace(...)`` lambda would.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; ``df[column]`` is overwritten.
    column : str
        Name of the column to convert.

    Raises
    ------
    ValueError
        If a string cell is not a parseable number after the replacement.
    """
    def _parse(value):
        if isinstance(value, str):
            return float(value.replace(',', '.'))
        if pd.isna(value):
            return float('nan')
        return float(value)

    # astype(float) pins the dtype even when the column is empty.
    df[column] = df[column].map(_parse).astype(float)

# Columns to pass through rep_to_float, one call per column.
# `columns` and `column` are module-level names and stay bound after the loop.
columns = ['Generation', 'AirTemperature', 'ComfortTemperature',
           'RelativeHumidity', 'WindSpeed', 'EffectiveCloudCover']

for column in columns:
    rep_to_float(df, column)

In [312]:
# Sanity check after conversion: dtypes / non-null counts, then the column names.
df.info()
print(df.columns)

In [313]:
import re

# Three-letter English month abbreviation -> month number.
months = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
          'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
          'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

# One regex per date part, each applied with re.search to the raw DateTime text:
#   Hour  - first digit run preceded by a space
#   Day   - digit run at the very start of the string
#   Month - first run of ASCII letters
#   Year  - first digit run followed by a space
# NOTE(review): presumably DateTime looks like '01Jan2019 00:00:00' — confirm.
re_list = {'Hour': r'(?<= )\d+', 'Day': r'^\d+', 'Month': r'[a-zA-Z]+', 'Year': r'\d+(?= )'}

def DateTimeParser(df, column):
    """Extract one date part from ``df['DateTime']`` into ``df[column]`` as int.

    The regex stored under ``re_list[column]`` is searched in every DateTime
    string and the matched text is kept. For ``'Month'`` the matched
    abbreviation is translated through ``months`` first; every other part is
    converted to int directly.
    """
    extracted = df['DateTime'].apply(
        lambda stamp: re.search(re_list[column], stamp).group(0)
    )
    if column == 'Month':
        extracted = extracted.map(months)
    df[column] = extracted.astype(int)

# Date parts to derive from DateTime; each becomes an integer column on df.
# `columns` / `column` are rebound at module level and remain set after the loop.
columns = ['Hour', 'Day', 'Month', 'Year']

for column in columns:
    DateTimeParser(df, column)

# The raw text column is dropped in place once its parts are extracted.
# Re-running this cell without re-creating df raises KeyError here.
df.drop('DateTime', axis=1, inplace=True)

In [314]:
# Preview the first rows after the date parts were added.
df.head()

In [315]:
def celcius_to_kelvin(df, columns):
    """Add 273.15 to the given column(s) of ``df``, in place (Celsius -> Kelvin).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    columns : str or list of str
        A single column name or a list of column names to shift.

    Notes
    -----
    The body now uses its own ``columns`` argument. It previously read a
    module-level ``column`` variable, so it only worked when the caller's
    loop variable happened to have that name.
    The shift is not idempotent: each call adds another 273.15.
    """
    df[columns] = df[columns] + 273.15

# Temperature columns to shift by 273.15, one call per column.
# NOTE(review): if celcius_to_kelvin reads a module-level `column` instead of
# its own parameter, renaming this loop variable would break it — verify
# before renaming.
columns = ['AirTemperature', 'ComfortTemperature']

for column in columns:
    celcius_to_kelvin(df, column)

In [316]:
import matplotlib.pyplot as plt
%matplotlib inline

# NOTE(review): `x` is not used by any later line in this cell — candidate for removal.
x = np.random.normal(size = 1000)
# Histogram of the raw WindDirection values in 45 bins.
plt.hist(df['WindDirection'], bins=45)
plt.gca().set(title='Frequency Histogram', ylabel='Frequency')

# Last expression of the cell: summary statistics shown as the cell output.
df['WindDirection'].describe()

In [317]:
# Replace raw WindDirection values (presumably degrees — confirm) by sector codes 0..3.
# All masks are built from the untouched column before any assignment happens.
# Values outside every range, and missing values, are left as they are.
bearing = df['WindDirection']
sector_masks = (
    ((bearing >= 0) & (bearing < 90)) | ((bearing > 315) & (bearing <= 360)),  # -> 0
    (bearing >= 90) & (bearing < 180),                                          # -> 1
    (bearing >= 180) & (bearing < 270),                                         # -> 2
    (bearing >= 270) & (bearing <= 315),                                        # -> 3
)
for sector_code, sector_mask in enumerate(sector_masks):
    df.loc[sector_mask, 'WindDirection'] = sector_code

In [318]:
# Sector code -> label; values without an entry map to NaN.
wind_dir = dict(enumerate(('NNE', 'SE', 'SW', 'WNW')))

df['WindDirection'] = df['WindDirection'].map(wind_dir)

In [319]:
# One indicator column per wind-direction label.
df_dummies = pd.get_dummies(df['WindDirection'])

# Swap the label column for its indicator columns.
df = pd.concat([df.drop(columns=['WindDirection']), df_dummies], axis=1)

In [320]:
# Interaction feature: element-wise product of the two temperature columns.
df['Temperature'] = df['AirTemperature'].mul(df['ComfortTemperature'])

# AirTemperature is removed from both the modelling frame and `sub`, in place.
for frame in (df, sub):
    frame.drop(columns='AirTemperature', inplace=True)

In [321]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated correlation matrix of every column in df on a 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df.corr(), annot=True, cmap="YlGnBu", ax=ax)
plt.show()

In [322]:
# Feature matrix: everything except the target column.
X = df.drop(columns=['Generation']).copy()
# Target vector, copied so later edits to df do not touch it.
y = df['Generation'].copy()

In [323]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column. X becomes a NumPy array from here on.
# NOTE(review): the scaler is fitted on all rows of X, before any train/test
# split is made — confirm this is intended.
scaler = StandardScaler()

X = scaler.fit(X).transform(X)

In [324]:
# Display the scaled feature array as a DataFrame (column names are positional).
pd.DataFrame(X)

In [325]:
# Display the rows whose Month is 12 and Year is 2021.
df[(df['Month']==12) & (df['Year']==2021)]

In [326]:
from sklearn.model_selection import train_test_split

# Only the first 25560 rows take part in training and evaluation
# (presumably the rows with a known Generation — TODO confirm).
n_model_rows = 25560

# 70 % train+validation / 30 % test ...
X_train, X_test, y_train, y_test = train_test_split(
    X[:n_model_rows], y[:n_model_rows], train_size=0.7, random_state=42
)

# ... then 80 % / 20 % of the first part into train / validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, train_size=0.8, random_state=42
)

In [327]:
# Row counts of the train, validation and test parts, one per line.
for split_part in (X_train, X_val, X_test):
    print(split_part.shape[0])

In [328]:
import xgboost as xgb

# Wrap each split in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)
# Feature-only matrix for the rows after the first 25560. It is needed by the
# later `model.predict(dsub)` cell; with this line commented out, a fresh
# Restart & Run All fails there with NameError. No label is passed because
# prediction does not use one.
# NOTE(review): confirm X[25560:] lines up row-for-row with sample_submission.
dsub = xgb.DMatrix(X[25560:])

In [329]:
def get_model_rmse(params):
    """Train a short XGBoost model with ``params`` and return its validation score.

    The model is trained on the module-level ``dtrain`` for at most 100
    rounds, with early stopping after 10 rounds without improvement on the
    module-level ``dval``. The score is read from the text returned by
    ``model.eval(dval)``.

    Parameters
    ----------
    params : dict
        XGBoost training parameters.

    Returns
    -------
    float
        The number after the last ':' in the evaluation string (presumably
        the RMSE with default settings — confirm). May be ``nan`` or ``inf``
        if training diverged.
    """
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=100,
        evals=[(dval, 'eval')],
        early_stopping_rounds=10,
        verbose_eval=0,
    )
    results = model.eval(dval)
    # Take everything after the last ':' and let float() parse it. This also
    # accepts 'nan', 'inf' and exponent notation, which the old digits-and-dots
    # regex did not. Builtin float replaces np.float, which NumPy >= 1.24
    # no longer provides.
    return float(results.rsplit(':', 1)[1])

In [330]:
def objective(trial):
    """Optuna objective: sample XGBoost hyper-parameters and return the validation score.

    Sampled parameters (trial names unchanged):
      - ``learning_rate``: log-uniform in [1e-5, 10]
      - ``max_depth``: integer in [4, 8]
      - ``l1_reg`` / ``l2_reg``: log-uniform in [1e-5, 10], passed to XGBoost
        as ``alpha`` / ``lambda``

    ``suggest_float(..., log=True)`` replaces the deprecated
    ``suggest_loguniform``; it samples from the same distribution.

    Returns
    -------
    float
        Whatever ``get_model_rmse`` returns for the sampled parameters.
    """
    # NOTE(review): the upper bound of 10 for learning_rate is unusually high
    # — confirm it is intended.
    learning_rate = trial.suggest_float('learning_rate', 0.00001, 10.0, log=True)
    max_depth = trial.suggest_int('max_depth', 4, 8)
    l1_reg = trial.suggest_float('l1_reg', 0.00001, 10.0, log=True)
    l2_reg = trial.suggest_float('l2_reg', 0.00001, 10.0, log=True)

    params = {'learning_rate': learning_rate, 'max_depth': max_depth, 'alpha': l1_reg, 'lambda': l2_reg}

    return get_model_rmse(params)

In [331]:
import optuna

# Minimise the objective's return value over 100 trials, with a progress bar.
study = optuna.create_study(direction='minimize')
study.optimize(func=objective, n_trials=100, show_progress_bar=True)

In [341]:
# The study stores the regularisation terms under the trial names 'l1_reg' and
# 'l2_reg', but the objective hands them to XGBoost as 'alpha' and 'lambda'.
# Translate the keys so the final model really uses the tuned regularisation.
# Names without a translation pass through unchanged.
_optuna_to_xgb = {'l1_reg': 'alpha', 'l2_reg': 'lambda'}
best_params = {
    _optuna_to_xgb.get(name, name): value
    for name, value in study.best_params.items()
}
best_params

In [342]:
# Final fit: up to 10000 rounds, stopping after 20 rounds without
# improvement on the validation matrix.
model = xgb.train(
    params=best_params,
    dtrain=dtrain,
    num_boost_round=10000,
    evals=[(dval, 'eval')],
    early_stopping_rounds=20,
)

In [343]:
# Held-out targets and the model's predictions for them, both as float64 arrays.
y_true = np.array(y_test, dtype=np.float64)
y_pred = np.array(model.predict(dtest), dtype=np.float64)

In [344]:
from sklearn.metrics import r2_score

# Coefficient of determination on the held-out test split.
r2 = r2_score(y_true, y_pred)

print(f'R^2 Score: {r2}')

In [336]:
# Predictions for the submission matrix as a float64 array.
# NOTE(review): confirm `dsub` is defined by an earlier cell before running on
# a fresh kernel.
y_sub = np.array(model.predict(dsub), dtype=np.float64)

In [337]:
# Overwrite the Generation column with the predictions.
# NOTE(review): assumes y_sub has one value per sample_submission row, in the
# same order — verify lengths match.
sample_submission['Generation'] = y_sub

In [338]:
# Display the filled-in submission frame.
sample_submission

In [339]:
# Write the submission to the working directory without the index column.
sample_submission.to_csv("./sample_submission3.csv", index=False)

In [340]:
# Read the written file back to check what was saved.
pd.read_csv("./sample_submission3.csv")