In [106]:
import numpy as np
import pandas as pd
import seaborn as sns
import time
from matplotlib import pyplot as plt

from sklearn.preprocessing import MaxAbsScaler, RobustScaler, StandardScaler
from sklearn.compose import ColumnTransformer 
from sklearn.metrics import make_scorer, accuracy_score, auc, classification_report, confusion_matrix, f1_score
from sklearn.pipeline import Pipeline, Parallel

from sklearn.model_selection import GridSearchCV, KFold, learning_curve, RandomizedSearchCV, TimeSeriesSplit, train_test_split
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB 
from sklearn.linear_model import LogisticRegression

In [284]:
#Data Import

y = pd.read_csv('generation_data.csv', parse_dates=['DateTime'], index_col=["DateTime"])
X = pd.read_csv('features_data.csv', parse_dates=['DateTime'], index_col=["DateTime"])
sample_submission = pd.read_csv("sample_submission.csv")

In [285]:
X.columns = [i.replace(' ', '_').replace("'", '').replace(',', '') for i in X.columns]
X["Month"] = [month for month in X.index.month]

def SeasonSelector(x):
    if x in (1, 2, 12):
        return 'Winter'
    elif x in (3, 4, 5):
        return 'Spring'
    elif x in (6, 7, 8):
        return 'Summer'
    elif x in (9, 10, 11):
        return 'Autumn'
    
    
X["Season"] = [SeasonSelector(month)  for month in X.index.month]

In [286]:
X = X.merge(pd.get_dummies(X["Season"]), how='left', left_index=True, right_index=True)
X = X.drop("Season", axis=1)

In [287]:
X = X.drop(['Commemoration_of_Ataturk_Youth_and_Sports_Day',
                          'Democracy_and_National_Unity_Day', 'Labour_Day',
                          'National_Sovereignty_and_Childrens_Day', 'New_Years_Day',
                          'Ramadan_Feast', 'Ramadan_Feast_Holiday', 'Republic_Day',
                          'Sacrifice_Feast', 'Sacrifice_Feast_Holiday', 'Victory_Day', 
                          'Day_Friday', 'Day_Monday', 'Day_Saturday', 'Day_Sunday',
                          'Day_Thursday', 'Day_Tuesday', 'Day_Wednesday'], axis=1)

In [288]:
X["WWCode"].describe()

count    20050.000000
mean        13.095711
std         24.471306
min          0.000000
25%          0.000000
50%          3.000000
75%         10.000000
max         99.000000
Name: WWCode, dtype: float64

In [71]:
y = X[["WWCode"]]
X2 = X.drop('WWCode', axis=1)
y_train = y[~y["WWCode"].isna()]
y_test = y[y["WWCode"].isna()]

X_train = X2[X2.index.isin(y_train.index)]
X_test = X2[X2.index.isin(y_test.index)]

In [72]:
X_train.head()

Unnamed: 0_level_0,AirTemperature,ComfortTemperature,RelativeHumidity,WindSpeed,WindDirection,EffectiveCloudCover,is_holiday,Hour,DayGroup,Month,Autumn,Spring,Summer,Winter
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-01-01 00:00:00,-1.7,-6.1,75.3,3.6,60.0,6.3,1,0,0,1,0,0,0,1
2019-01-01 01:00:00,-1.8,-5.3,75.3,2.6,70.0,4.5,1,1,0,1,0,0,0,1
2019-01-01 02:00:00,-2.0,-6.0,74.7,3.1,80.0,5.5,1,2,0,1,0,0,0,1
2019-01-01 03:00:00,-1.9,-5.9,76.4,3.1,60.0,7.6,1,3,0,1,0,0,0,1
2019-01-01 04:00:00,-2.0,-7.1,76.4,4.6,60.0,6.5,1,4,0,1,0,0,0,1


In [73]:
y_train = y_train.replace({'WWCode': {99:96, 87:86, 84:83, 82:81, 72:73, 62: 63}})
y_val = y_val.replace({'WWCode': {99:96, 87:86, 84:83, 82:81, 72:73, 62: 63}})

In [74]:
y_train["WWCode"].value_counts()

0.0     6294
2.0     2206
3.0     1646
5.0     1432
1.0     1226
10.0    1207
4.0     1147
80.0     969
25.0     895
6.0      748
60.0     404
95.0     331
7.0      212
70.0     203
71.0     155
21.0     120
29.0      97
22.0      91
61.0      91
26.0      76
45.0      70
51.0      66
17.0      46
68.0      43
81.0      32
85.0      30
50.0      28
23.0      26
73.0      21
83.0      20
77.0      20
41.0      12
27.0       9
40.0       9
42.0       8
86.0       6
48.0       6
13.0       6
46.0       6
91.0       6
8.0        5
44.0       5
96.0       4
63.0       4
47.0       4
20.0       3
28.0       3
43.0       2
Name: WWCode, dtype: int64

In [75]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, stratify=y_train)

In [90]:
gnb = GaussianNB()
bnb = BernoulliNB()
bnb_columns = ["is_holiday", "Hour","Month", "DayGroup", "Autumn", "Spring", "Summer", "Winter"]
gnb_columns = X_train.drop(bnb_columns, axis=1).columns

In [112]:
gnb.fit(X_train[gnb_columns], np.ravel(y_train))
y_train_gnb = gnb.predict(X_train[gnb_columns])
y_val_gnb = gnb.predict(X_val[gnb_columns])

y_train_prob_gnb = gnb.predict_proba(X_train[gnb_columns])
y_val_prob_gnb = gnb.predict_proba(X_val[gnb_columns])

###########################################

bnb.fit(X_train[bnb_columns], np.ravel(y_train))
y_train_bnb = bnb.predict(X_train[bnb_columns])
y_val_bnb = bnb.predict(X_val[bnb_columns])

y_train_prob_bnb = bnb.predict_proba(X_train[bnb_columns])
y_val_prob_bnb = bnb.predict_proba(X_val[bnb_columns])

In [96]:
print("Gaussian Train Accuracy:",accuracy_score(y_train, y_train_gnb))
print("Gaussian Val Accuracy:",accuracy_score(y_val, y_val_gnb))
print()
print("Bernoulli Train Accuracy:",accuracy_score(y_train, y_train_bnb))
print("Bernoulli Val Accuracy:",accuracy_score(y_val, y_val_bnb))

Gaussian Train Accuracy: 0.42767839329653523
Gaussian Val Accuracy: 0.4211051266706563

Bernoulli Train Accuracy: 0.31010174901908627
Bernoulli Val Accuracy: 0.3167763814083383


In [113]:
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train, np.ravel(y_train))
lgbm.fit(X_train, np.ravel(y_train))
y_train_lgbm = lgbm.predict(X_train)
y_val_lgbm = lgbm.predict(X_val)

y_train_prob_lgbm = lgbm.predict_proba(X_train)
y_val_prob_lgbm = lgbm.predict_proba(X_val)

print("LGBM Train Accuracy:",accuracy_score(y_train, y_train_lgbm))
print("LGBM Val Accuracy:",accuracy_score(y_val, y_val_lgbm))

LGBM Train Accuracy: 0.23761388574848707
LGBM Val Accuracy: 0.23498902852583284


In [176]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, np.ravel(y_train))
rf.fit(X_train, np.ravel(y_train))
y_train_rf = rf.predict(X_train)
y_val_rf = rf.predict(X_val)

y_train_prob_rf = rf.predict_proba(X_train)
y_val_prob_rf = rf.predict_proba(X_val)

print("Random Forest Train Accuracy:",accuracy_score(y_train, y_train_rf))
print("Random Forest Val Accuracy:",accuracy_score(y_val, y_val_rf))

Random Forest Train Accuracy: 0.9999334973731462
Random Forest Val Accuracy: 0.6351486136046279


In [175]:
logit = LogisticRegression(max_iter=2000)


scaler = Pipeline(steps=[
        ('standard_scaler', StandardScaler())])

preprocessor = ColumnTransformer(
        remainder='passthrough', #passthough features not listed
        transformers=[
            ('scaler',  scaler, gnb_columns)
        ])


pipeline = Pipeline(steps=([('preprocessor', preprocessor), ('logit', logit)]))


pipeline.fit(X_train, np.ravel(y_train))
pipeline.fit(X_train, np.ravel(y_train))
y_train_logit = pipeline.predict(X_train)
y_val_logit = pipeline.predict(X_val)

y_train_prob_logit = pipeline.predict_proba(X_train)
y_val_prob_logit = pipeline.predict_proba(X_val)

print("Logit Train Accuracy:",accuracy_score(y_train, y_train_logit))
print("Logit Val Accuracy:",accuracy_score(y_val, y_val_logit))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logit Train Accuracy: 0.4641218328123961
Logit Val Accuracy: 0.4703770197486535


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [177]:
from scipy.sparse import csr_matrix, hstack

sparse_lgbm = csr_matrix(y_train_prob_lgbm)
sparse_gnb = csr_matrix(y_train_prob_gnb) 
sparse_bnb = csr_matrix(y_train_prob_bnb) 
sparse_logit = csr_matrix(y_train_prob_logit)
sparse_rf = csr_matrix(y_train_prob_rf)
sparse_merged = hstack((sparse_lgbm, sparse_bnb, sparse_logit, sparse_gnb, sparse_rf), format='csr')

sparse_lgbm_val = csr_matrix(y_val_prob_lgbm)
sparse_gnb_val = csr_matrix(y_val_prob_gnb) 
sparse_bnb_val = csr_matrix(y_val_prob_bnb) 
sparse_logit_val = csr_matrix(y_val_prob_logit)
sparse_rf_val = csr_matrix(y_val_prob_rf)
sparse_merged_val = hstack((sparse_lgbm_val, sparse_gnb_val, sparse_bnb_val, sparse_logit_val, y_val_prob_rf), format='csr')

In [178]:
mnb_sparse = MultinomialNB()
mnb_sparse.fit(sparse_merged, np.ravel(y_train))
sparse_predict = mnb_sparse.predict(sparse_merged)
sparse_predict_val = mnb_sparse.predict(sparse_merged_val)
print("Multinomial Naive Bayes")
print("Train Overall Accuracy:", accuracy_score(y_train, sparse_predict))
print("Val Overall Accuracy:", accuracy_score(y_val, sparse_predict_val))

Multinomial Naive Bayes
Train Overall Accuracy: 0.8455809004455676
Val Overall Accuracy: 0.6070217434669858


In [179]:
logit_sparse = LogisticRegression()
logit_sparse.fit(sparse_merged, np.ravel(y_train))
sparse_predict = logit_sparse.predict(sparse_merged)
sparse_predict_val = logit_sparse.predict(sparse_merged_val)
print("Logistic Regression")
print("Train Overall Accuracy:", accuracy_score(y_train, sparse_predict))
print("Val Overall Accuracy:", accuracy_score(y_val, sparse_predict_val))


Logistic Regression
Train Overall Accuracy: 0.9989359579703398
Val Overall Accuracy: 0.637542389786555


In [180]:
lgbm_sparse = LGBMClassifier(random_state=42, max_depth=40, n_estimators=1000)
lgbm_sparse.fit(sparse_merged, np.ravel(y_train))
sparse_predict = lgbm_sparse.predict(sparse_merged)
sparse_predict_val = lgbm_sparse.predict(sparse_merged_val)
print("LGBM")
print("Train Overall Accuracy:", accuracy_score(y_train, sparse_predict))
print("Val Overall Accuracy:", accuracy_score(y_val, sparse_predict_val))


LGBM
Train Overall Accuracy: 0.375207820708918
Val Overall Accuracy: 0.14402553361260723


**Hyperparameter tuning for Random Forest**

In [None]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, np.ravel(y_train))
rf.fit(X_train, np.ravel(y_train))
y_train_rf = rf.predict(X_train)
y_val_rf = rf.predict(X_val)

y_train_prob_rf = rf.predict_proba(X_train)
y_val_prob_rf = rf.predict_proba(X_val)

print("Random Forest Train Accuracy:",accuracy_score(y_train, y_train_rf))
print("Random Forest Val Accuracy:",accuracy_score(y_val, y_val_rf))

In [194]:

rf = RandomForestClassifier(random_state=42, n_estimators=500)

params = {
    "criterion": ['gini', 'entropy'],
    "max_depth": [15, 30, 40, 50],
    "min_samples_split": [5, 10, 20, 50],
#    "min_samples_leaf": [5, 10, 20, 30],
 #   "min_impurity_decrease": [0, 0.001, 0.0001],
    "class_weight": ["balanced", None],
    "max_samples": [None, 0.75, 0.9],
 #   "ccp_alpha": [0, 0.001]
}


grid1 = GridSearchCV(rf, param_grid=params, cv=5, n_jobs=-1, scoring='accuracy', verbose=2)
grid1.fit(X_train, y_train)

print(grid1.best_params_)

y_train_grid1 = grid1.predict(X_train)
y_val_grid1 = grid1.predict(X_val)


print("Train Set Accuracy:", accuracy_score(y_train, y_train_grid1))

print()
print("Val Set Accuracy:", accuracy_score(y_val, y_val_grid1))


Fitting 5 folds for each of 192 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   52.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 22.2min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed: 35.1min finished
  self.best_estimator_.fit(X, y, **fit_params)


{'class_weight': None, 'criterion': 'gini', 'max_depth': 30, 'max_samples': 0.9, 'min_samples_split': 5}
Train Set Accuracy: 0.9703398284232227

Val Set Accuracy: 0.642130460801915


In [210]:
from sklearn.model_selection import cross_val_score 
from sklearn.preprocessing import StandardScaler 
import joblib 

import optuna 
from optuna.samplers import TPESampler

In [242]:
# define the search space and the objecive function

def objective(trial):
    # Define the search space
    criterions = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    max_depths = trial.suggest_int('max_depth', 2, 500, 1)
    n_estimators = trial.suggest_int('n_estimators', 100, 1000, 100)
    class_weights = trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample', None])
    min_samples_splits = trial.suggest_int('min_samples_split', 2, 100, 1)
    min_samples_leafs = trial.suggest_int('min_samples_leaf', 2, 100, 1)
    max_sampless = trial.suggest_categorical("max_samples", np.arange(0.5, 1.0001, 0.0001))

    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 criterion=criterions,
                                 max_depth=max_depths,
                                 class_weight=class_weights,
                                 min_samples_split=min_samples_splits,
                                 min_samples_leaf=min_samples_leafs,
                                 max_samples=max_sampless,
                                 n_jobs=-1)
    score = cross_val_score(clf, X_train, np.ravel(y_train), scoring="accuracy").mean()

    return score

In [243]:
# create a study object and pass the objective function to method optimize()

study = optuna.create_study(study_name="randomForest_optimization",
                            direction="maximize",
                            sampler=TPESampler())

study.optimize(objective, n_trials=20)

[32m[I 2022-02-26 22:12:39,831][0m A new study created in memory with name: randomForest_optimization[0m

The least populated class in y has only 2 members, which is less than n_splits=5.

[32m[I 2022-02-26 22:12:48,737][0m Trial 0 finished with value: 0.38677810286636144 and parameters: {'criterion': 'entropy', 'max_depth': 183, 'n_estimators': 200, 'class_weight': 'balanced_subsample', 'min_samples_split': 38, 'min_samples_leaf': 80, 'max_samples': 0.7839999999999687}. Best is trial 0 with value: 0.38677810286636144.[0m

The least populated class in y has only 2 members, which is less than n_splits=5.

[32m[I 2022-02-26 22:12:51,630][0m Trial 1 finished with value: 0.586021446412272 and parameters: {'criterion': 'entropy', 'max_depth': 429, 'n_estimators': 100, 'class_weight': None, 'min_samples_split': 35, 'min_samples_leaf': 40, 'max_samples': 0.6476999999999837}. Best is trial 1 with value: 0.586021446412272.[0m

The least populated class in y has only 2 members, which is


The least populated class in y has only 2 members, which is less than n_splits=5.

[32m[I 2022-02-26 22:16:14,315][0m Trial 17 finished with value: 0.5942013847122671 and parameters: {'criterion': 'entropy', 'max_depth': 331, 'n_estimators': 400, 'class_weight': None, 'min_samples_split': 17, 'min_samples_leaf': 39, 'max_samples': 0.8032999999999666}. Best is trial 14 with value: 0.6362312405804895.[0m

The least populated class in y has only 2 members, which is less than n_splits=5.

[32m[I 2022-02-26 22:16:32,783][0m Trial 18 finished with value: 0.6059718812133391 and parameters: {'criterion': 'gini', 'max_depth': 384, 'n_estimators': 900, 'class_weight': None, 'min_samples_split': 14, 'min_samples_leaf': 19, 'max_samples': 0.8051999999999664}. Best is trial 14 with value: 0.6362312405804895.[0m

The least populated class in y has only 2 members, which is less than n_splits=5.

[32m[I 2022-02-26 22:16:49,399][0m Trial 19 finished with value: 0.6163464327915713 and parameter

In [244]:
# print best parameters 
print(study.best_params)

{'criterion': 'entropy', 'max_depth': 18, 'n_estimators': 500, 'class_weight': None, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_samples': 0.6025999999999887}


In [245]:
# print best score 
print(study.best_value)

0.6362312405804895


In [246]:
# Plot the high-dimentional parameter relationships in a study.

optuna.visualization.plot_parallel_coordinate(study, params=['criterion', 'max_depth','n_estimators'])


In [247]:
# Plot hyperparameter importances.

optuna.visualization.plot_param_importances(study)

In [249]:
rf_op = RandomForestClassifier( random_state=42,criterion= 'entropy', max_depth= 18, n_estimators= 500,
                                class_weight= None, min_samples_split= 7, min_samples_leaf= 2, max_samples=0.6025999999999887)

rf_op.fit(X_train, np.ravel(y_train))
rf_op.fit(X_train, np.ravel(y_train))
y_train_rf_op = rf_op.predict(X_train)
y_val_rf_op = rf_op.predict(X_val)

print("Random Forest Train Accuracy:",accuracy_score(y_train, y_train_rf_op))
print("Random Forest Val Accuracy:",accuracy_score(y_val, y_val_rf_op))

Random Forest Train Accuracy: 0.8341424486267207
Random Forest Val Accuracy: 0.6459206064232994


**Predict the Missing Data in WWCode**

In [252]:
y_test_pred = rf_op.predict(X_test) 

In [259]:
pd.Series(y_test_pred).value_counts()

0.0     2740
3.0      904
2.0      683
4.0      608
5.0      404
6.0      180
1.0      165
10.0     151
80.0     146
25.0     132
95.0      78
60.0      38
70.0       9
22.0       5
7.0        4
29.0       4
17.0       3
dtype: int64

In [262]:
X_test["WWCode"] = pd.Series(y_test_pred, index=X_test.index)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [324]:
X_copy = X.copy()
X_copy = X_copy[["WWCode"]].fillna(X_test[["WWCode"]])

X_copy[X_copy.index.isin(X_test.index)].describe()

Unnamed: 0,WWCode
count,6254.0
mean,5.898625
std,16.69779
min,0.0
25%,0.0
50%,2.0
75%,4.0
max,95.0


In [333]:
X_imputed = X.copy()
X_imputed["WWCode"] = X_copy
X_imputed.describe()

Unnamed: 0,AirTemperature,ComfortTemperature,RelativeHumidity,WindSpeed,WindDirection,WWCode,EffectiveCloudCover,is_holiday,Hour,DayGroup,Month,Autumn,Spring,Summer,Winter
count,26304.0,26304.0,26304.0,26304.0,26304.0,26304.0,26304.0,26304.0,26304.0,26304.0,26304.0,26304.0,26304.0,26304.0,26304.0
mean,13.741522,13.239397,54.352832,1.655931,87.46396,11.384542,2.614051,0.038321,11.5,0.75,6.521898,0.249088,0.251825,0.251825,0.247263
std,9.379876,9.767058,22.566688,1.353196,100.762932,23.067932,2.497609,0.191974,6.922318,1.050813,3.449052,0.432493,0.434069,0.434069,0.431429
min,-13.1,-19.3,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,6.0,5.1,36.6,0.5,20.0,0.0,0.0,0.0,5.75,0.0,4.0,0.0,0.0,0.0,0.0
50%,13.4,13.0,55.1,1.5,30.0,2.0,2.1,0.0,11.5,0.0,7.0,0.0,0.0,0.0,0.0
75%,21.0,21.2,72.5,2.6,170.0,6.0,5.0,0.0,17.25,2.0,10.0,0.0,1.0,1.0,0.0
max,39.0,37.2,100.0,13.4,360.0,99.0,8.0,1.0,23.0,3.0,12.0,1.0,1.0,1.0,1.0


In [334]:
X_imputed.to_csv("X_imputed.csv")