In [38]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [39]:
# All three competition files are read from the same Kaggle input folder.
DATA_DIR = "/kaggle/input/enerjisa-enerji-veri-maratonu"

# generation.csv and temperature.csv are read with ';' as separator,
# the sample submission with ','.
df_generation = pd.read_csv(f"{DATA_DIR}/generation.csv", sep=';')
df_temperature = pd.read_csv(f"{DATA_DIR}/temperature.csv", sep=';')
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv", sep=',')

# Preprocessing

In [40]:
def onehot_encode(df, column):
    """Return a new frame in which ``column`` is replaced by indicator columns.

    The indicator columns come from ``pd.get_dummies`` with ``column`` as the
    prefix and are appended after the remaining columns. The frame passed in
    is not modified.
    """
    indicator_cols = pd.get_dummies(df[column], prefix=column)
    remaining_cols = df.drop(column, axis=1)
    return pd.concat([remaining_cols, indicator_cols], axis=1)

def rep_to_float(df, column):
    """Convert ``df[column]`` in place from decimal-comma text to float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is modified in place.
    column : str
        Name of the column to convert.

    Missing values are preserved as NaN for every column (previously only
    'Generation' tolerated them), and values that are already numeric are
    simply cast to float instead of failing on ``.replace``.

    Raises
    ------
    ValueError
        If a string cannot be parsed as a number after replacing ',' with '.'.
    """
    def _parse(value):
        # Text values use ',' as the decimal separator; numbers pass through.
        if isinstance(value, str):
            value = value.replace(',', '.')
        return float(value)

    # na_action='ignore' leaves missing entries untouched so they end up as NaN.
    df[column] = df[column].map(_parse, na_action='ignore').astype(float)

def DateTimeParser(df, column):
    """Add ``df[column]`` holding one attribute of each ``df['DateTime']`` value.

    The attribute looked up on every element is the lower-cased column name,
    e.g. ``column='Hour'`` reads ``.hour``. The frame is modified in place.
    """
    df[column] = df['DateTime'].map(lambda stamp: getattr(stamp, column.lower()))

def celcius_to_kelvin(df, column):
    """Add 273.15 to every value of ``df[column]``, writing the result back in place."""
    kelvin_offset = 273.15
    df[column] = df[column].add(kelvin_offset)

In [41]:
def preprocess_inputs(df1, df2, n_pred_rows=744):
    """Merge, clean and encode the raw frames, then build the modelling splits.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Left frame of the merge; all of its rows are kept. The notebook passes
        the temperature table here.
    df2 : pandas.DataFrame
        Right frame, joined onto ``df1`` through the shared 'DateTime' column.
        The notebook passes the generation table here.
    n_pred_rows : int, default 744
        Number of trailing rows set aside as ``pred`` instead of being used
        for training. (Presumably the rows to be predicted for the submission;
        verify against the length of the sample submission.)

    Returns
    -------
    tuple
        ``(df, pred, X_train, X_test, y_train, y_test, X_val, y_val)`` where
        ``df`` / ``pred`` are the processed (unscaled) frames and the ``X_*``
        arrays are standard-scaled features. The split is 70 % train+val /
        30 % test, and the first part is split again 80 % / 20 % into train
        and validation.
    """
    # Left join: keep every row of df1 and attach the matching df2 columns.
    # Neither input is modified because merge returns a new frame.
    df = pd.merge(df1, df2, on='DateTime', how='left')

    # Drop rows without a timestamp. The explicit copy makes the in-place
    # column writes below act on an independent frame (no SettingWithCopyWarning).
    df = df[df['DateTime'].notna()].copy()

    # Replace ',' with '.' and convert the text columns to float.
    columns = ['Generation', 'AirTemperature', 'ComfortTemperature',
               'RelativeHumidity', 'WindSpeed', 'EffectiveCloudCover']
    for column in columns:
        rep_to_float(df, column)

    # Split the timestamp into separate Hour/Day/Month/Year columns, then drop it.
    df['DateTime'] = pd.to_datetime(df['DateTime'])
    for column in ['Hour', 'Day', 'Month', 'Year']:
        DateTimeParser(df, column)
    df = df.drop('DateTime', axis=1)

    # Shift both temperature columns from Celsius to Kelvin.
    for column in ['AirTemperature', 'ComfortTemperature']:
        celcius_to_kelvin(df, column)

    # Bin the wind direction (assumed to be numeric degrees) into eight
    # 45-degree sectors labelled '0'..'7'. The modulo folds 360 onto 0, and
    # missing values stay missing so they get no indicator column.
    sector = (df['WindDirection'] % 360) // 45
    df['WindDirection'] = sector.map(lambda s: str(int(s)), na_action='ignore')

    # WWCode: integer codes as strings, missing codes as their own category.
    df['WWCode'] = (
        df['WWCode']
        .map(lambda code: str(int(code)), na_action='ignore')
        .fillna('Unknown')
    )

    # One-hot encode the nominal features (appended at the end, in this order).
    for column in ['WindDirection', 'WWCode']:
        df = onehot_encode(df, column=column)

    # Missing generation values become 0.0.
    df['Generation'] = df['Generation'].fillna(0.0)

    # The trailing n_pred_rows rows are set aside; the rest is used for modelling.
    split_at = max(len(df) - n_pred_rows, 0)
    pred = df.iloc[split_at:]
    df = df.iloc[:split_at]

    # Split df into X and y
    y = df['Generation']
    X = df.drop('Generation', axis=1)

    # Scale X.
    # NOTE(review): the scaler is fitted on all modelling rows before the
    # train/test split, so test and validation statistics leak into the
    # scaling. Kept as is because the scaler is not returned and code outside
    # this function can only reproduce it by refitting on the returned `df`.
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Train-test split (random, seeded), then carve a validation set out of train.
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.8, random_state=42)

    return df, pred, X_train, X_test, y_train, y_test, X_val, y_val

In [42]:
# The temperature table is passed as df1 and the generation table as df2.
(df, pred,
 X_train, X_test,
 y_train, y_test,
 X_val, y_val) = preprocess_inputs(df1=df_temperature, df2=df_generation)

In [43]:
# Preview the first rows of the processed training frame.
# Other quick checks to swap in here: df.info(), df.tail(),
# pred.info(), pred.head(), pred.tail().
df.head()

# EDA

In [44]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated correlation heatmap of the first ten columns (the non-encoded ones).
fig, ax = plt.subplots(figsize=(10, 10))
corr_matrix = df.iloc[:, :10].corr()
sns.heatmap(corr_matrix, annot=True, cmap="YlGnBu", ax=ax)
plt.show()

In [45]:
import matplotlib.pyplot as plt
%matplotlib inline

# Count how many rows fall in each wind-direction sector and draw one
# labelled bar per sector. The indicator columns are selected by name, so the
# plot does not depend on column positions or on the dummies' dtype.
wind_counts = df.filter(regex=r'^WindDirection_').sum()

fig, ax = plt.subplots()
ax.bar(wind_counts.index, wind_counts.to_numpy())
ax.set(title='Frequency Histogram', xlabel='Wind direction sector', ylabel='Frequency')
ax.tick_params(axis='x', rotation=45)
plt.show()

# Feature Engineering

# Training

In [46]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
# Baseline linear models, keyed by the label used in the printed messages.
models = {
    "    Linear Regression": LinearRegression(),
    "Ridge (L2) Regression": Ridge(),
    "Lasso (L1) Regression": Lasso(),
}

# Fit every baseline on the training split.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(f"{name} trained.")

# Results

In [47]:
# Print each baseline's score on the held-out test split.
for name, model in models.items():
    test_score = model.score(X_test, y_test)
    print(f"{name} R^2 Score: {test_score}")

# XGBOOST

In [None]:
import xgboost as xgb

# Wrap each split (features plus labels) in an XGBoost DMatrix.
dtrain, dval, dtest = (
    xgb.DMatrix(features, label=target)
    for features, target in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

In [None]:
import re
def get_model_rmse(params):
    """Train a booster with ``params`` and return its metric on the validation set.

    Training uses the module-level ``dtrain`` for at most 100 rounds, with
    early stopping after 10 rounds without improvement on ``dval``.

    Parameters
    ----------
    params : dict
        XGBoost training parameters passed straight to ``xgb.train``.

    Returns
    -------
    float
        The metric value reported by ``model.eval(dval)`` (RMSE when ``params``
        leaves XGBoost's default regression objective and metric in place).
    """
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=100,
        evals=[(dval, 'eval')],
        early_stopping_rounds=10,
        verbose_eval=0,
    )
    # Booster.eval returns text of the form '[0]<TAB>eval-rmse:0.123456'.
    # Everything after the last ':' is the value; float() also understands
    # scientific notation, 'inf' and 'nan', which the former digit-only regex
    # did not, and the removed `np.float` alias is no longer needed.
    results = model.eval(dval)
    return float(results.rsplit(':', 1)[1])

In [None]:
def objective(trial):
    """Optuna objective: sample XGBoost hyper-parameters and score them.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the learning rate, tree depth and the L1 / L2
        regularization strengths.

    Returns
    -------
    float
        The value returned by ``get_model_rmse`` for the sampled parameters.
    """
    # suggest_float(..., log=True) samples on a log scale and replaces the
    # deprecated suggest_loguniform; the study's parameter names are unchanged.
    learning_rate = trial.suggest_float('learning_rate', 0.00001, 10.0, log=True)
    max_depth = trial.suggest_int('max_depth', 4, 8)
    l1_reg = trial.suggest_float('l1_reg', 0.00001, 10.0, log=True)
    l2_reg = trial.suggest_float('l2_reg', 0.00001, 10.0, log=True)

    # XGBoost names the L1 / L2 penalties 'alpha' and 'lambda'.
    params = {
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'alpha': l1_reg,
        'lambda': l2_reg,
    }

    return get_model_rmse(params)

In [None]:
import optuna

# Minimise the value returned by `objective`. TPE is Optuna's default sampler;
# it is created explicitly only to fix its seed so that reruns of the notebook
# explore the same sequence of trials.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

In [None]:
# The study records the regularization strengths as 'l1_reg' / 'l2_reg', but
# XGBoost only understands 'alpha' / 'lambda'. Rename them so the tuned values
# are actually used when `best_params` is passed to xgb.train; keys that are
# already valid XGBoost names pass through unchanged.
xgb_param_names = {'l1_reg': 'alpha', 'l2_reg': 'lambda'}
best_params = {
    xgb_param_names.get(key, key): value
    for key, value in study.best_params.items()
}
best_params

In [None]:
# Final fit with the tuned parameters: up to 10000 boosting rounds, stopping
# early after 20 rounds without improvement on the validation set.
model = xgb.train(
    best_params,
    dtrain,
    num_boost_round=10000,
    evals=[(dval, 'eval')],
    early_stopping_rounds=20,
)

In [None]:
# Test-set predictions and targets as float64 arrays for scoring.
y_pred = np.array(model.predict(dtest), dtype=np.float64)
y_true = np.array(y_test, dtype=np.float64)

In [None]:
from sklearn.metrics import r2_score

# R^2 of the tuned booster's predictions against the test targets.
r2 = r2_score(y_true=y_true, y_pred=y_pred)

print('R^2 Score: {}'.format(r2))

In [None]:
# `dsub` was never built, so this cell raised NameError. Build it from the
# held-out `pred` rows: drop the placeholder target and scale the features the
# same way the training features were scaled. preprocess_inputs fits its
# StandardScaler on every feature row of the returned `df`, so refitting on
# those rows reproduces that scaler.
from sklearn.preprocessing import StandardScaler

feature_scaler = StandardScaler().fit(df.drop('Generation', axis=1))
X_sub = feature_scaler.transform(pred.drop('Generation', axis=1))
dsub = xgb.DMatrix(X_sub)

y_sub = np.array(model.predict(dsub), dtype=np.float64)

In [None]:
# Write the predictions into the submission frame's 'Generation' column.
sample_submission['Generation'] = y_sub

In [None]:
# Display the submission frame after 'Generation' has been filled in.
sample_submission

In [None]:
# Save the submission to the current directory without the index column.
sample_submission.to_csv("./sample_submission3.csv", index=False)

In [None]:
# Read the saved file back to check what was written.
pd.read_csv("./sample_submission3.csv")