In [12]:
import tensorflow as tf
import catboost
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix, recall_score, plot_precision_recall_curve, precision_score, roc_auc_score
from sklearn.model_selection import train_test_split

import pandas as pd     
import matplotlib.pyplot as plt
import warnings
import numpy as np
warnings.simplefilter(action='ignore', category=FutureWarning)
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from imblearn.over_sampling import RandomOverSampler

pio.renderers.default='notebook'
import xgboost as xgb




In [6]:
def normalize(df, how="min-max"):
    if how == "min-max" :
        return (df - df.min())/(df.max()-df.min())
    if how == "mean" :
        return (df-df.mean())/df.std()




def get_cp_indices_df(df_cp, df_indices, selected_coutry = None, data_resolution = "M", normalization="mean", normalize_cp=True, smoothing = False, smoothing_duration=6, indices=None) :


    if not selected_coutry : selected_coutry = df_cp.country.unique()

    df_cp_filter = df_cp[df_cp.country.isin(selected_coutry) ]
    df_cp_filter = df_cp_filter.groupby([df_cp_filter.index.to_period(data_resolution)]).mean().reset_index()
    df_cp_filter = df_cp_filter.set_index(df_cp_filter.timestamp).drop(["timestamp"], axis=1)


    if indices : df_indices = df_indices[indices]
    df_indices_filter = df_indices.groupby([df_indices.index.to_period(data_resolution)]).mean().reset_index()
    df_indices_filter = df_indices_filter.set_index(df_indices_filter.timestamp).drop(["timestamp"], axis=1)

    df_cp_indices = df_cp_filter.join(df_indices_filter)
    df_cp_indices = df_cp_indices[df_cp_indices.index.year>=1982]
    df_cp_indices = normalize(df_cp_indices,how=normalization)


    if not normalize_cp :  
        df_cp_indices.capacity_factor = df_cp_filter[df_cp_filter.index.year>=1982]


    if smoothing : 
        df_cp_indices = df_cp_indices.rolling(smoothing_duration, center=True).mean()


    return df_cp_indices

    

def compare_cp_index(df_cp_indices, smoothing = False, smoothing_duration=6, indices = None, start=1990, end=2019, show_plot = True):
    if indices : 
        df_cp_indices = df_cp_indices[indices+["capacity_factor"]] 
    df_cp_indices = df_cp_indices[(df_cp_indices.index.year > start) & (df_cp_indices.index.year < end)]
    
    if show_plot : 
        fig = px.line(df_cp_indices.set_index(df_cp_indices.index.astype("str")), 
            title= ("normalized capacity factor and climate indices, smoothing={}".format(smoothing_duration) if smoothing else "normalized capacity factor and climate indices"))
        fig.update_traces(visible='legendonly')
        fig.update_traces(visible=True, selector=dict(name="capacity_factor"))
        fig.show()
        
    
    return df_cp_indices.corr().capacity_factor[:-1]


def timeseries_generator(dataset,window_size,target="capacity_factor") :
    X = dataset.drop(target, axis = 1 )
    new_y = dataset[target].iloc[window_size:]
    new_X = np.array([X[i:i+window_size] for i in range(X.shape[0]-window_size)],dtype=object)
    return new_X, new_y

 

In [7]:
df_cp = pd.read_csv("dataset_with_timestamp")
df_cp = df_cp.drop(['hour', 'month', 'year', 'day'], axis=1)
df_cp['timestamp'] = pd.to_datetime(df_cp['timestamp'], format='%Y-%m-%d %H:%M:%S')

df_cp = df_cp.groupby(["timestamp","country"]).mean().reset_index()
df_cp = df_cp.set_axis(df_cp.timestamp)
df_cp = df_cp.drop(["timestamp"], axis=1)

indices = ["nao", "ao","mjo80e","mjo40w","mjo20e","mjo160e","mjo10w","nino34"]

df_indices = pd.read_csv("daily_indices_82_to_19.csv")
df_indices['timestamp'] = pd.to_datetime(df_indices['timestamp'], format='%Y-%m-%d')
df_indices = df_indices[["timestamp"]+indices]
df_indices = df_indices.set_axis(df_indices.timestamp)
df_indices = df_indices.drop(["timestamp"], axis=1)




compute correlation between LWP and CF of countries :

In [9]:
cp = []
for country in df_cp.country.unique() :
    cp.append(df_cp[df_cp.country==country].capacity_factor.to_list())

cp_countries = pd.DataFrame(np.array(cp).T, columns = df_cp.country.unique())
LWP_countries = pd.DataFrame((np.array(cp).T <=0.1), columns = df_cp.country.unique())

# With capacity factor of other countries :

## All countries

In [None]:
data_resolution = "d"
normalize_cp=True

indices = ["nao","ao"]
main_country = ["DE"]

df_cp_indices = get_cp_indices_df(df_cp, df_indices,indices=indices, selected_coutry=[country], data_resolution=data_resolution, normalization="mean",normalize_cp=False, smoothing=False, smoothing_duration=1).dropna()
df_cp_indices["LWP"] = (df_cp_indices_country.capacity_factor<0.1).astype("int")
countries = df_cp.country.unique()

for window_size in [1,10,50,100,200] :
    print("window_size",window_size)



    Xs = []
    ys = []

    for country in countries :
        df_cp_indices_country = get_cp_indices_df(df_cp, df_indices,indices=indices, selected_coutry=[country], data_resolution=data_resolution, normalization="mean",normalize_cp=False, smoothing=False, smoothing_duration=1).dropna()
        df_cp_indices[country] = df_cp_indices_country.capacity_factor



    X, y = timeseries_generator(df_cp_indices,window_size,target="LWP")


    

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,shuffle = True)

    X_train=X_train.reshape(-1,X_train.shape[1]*X_train.shape[2])
    X_test=X_test.reshape(-1,X_test.shape[1]*X_test.shape[2])


    oversample = RandomOverSampler(sampling_strategy='minority')
    X_train, y_train = oversample.fit_resample(X_train, y_train)


    clf = catboost.CatBoostClassifier()
    clf.fit(X_train,y_train, verbose=False)
    y_pred = clf.predict(X_test)


    df_test = pd.DataFrame(y_test, columns =["LWP"])
    df_test["LWP_pred"] = y_pred
    

    print("""Recall :
    What proportion of actual positives was identified correctly""",recall_score(y_test,y_pred))
    print("""Precision:
    What proportion of positive identifications was actually correctly""",precision_score(y_test,y_pred))
    print("""Accuracy:
    """,accuracy_score(y_test,y_pred))




## Using neigbors

In [None]:
data_resolution = "d"
normalize_cp=True

indices = ["nao","ao"]
main_country = ["DE"]

df_cp_indices = get_cp_indices_df(df_cp, df_indices,indices=indices, selected_coutry=[country], data_resolution=data_resolution, normalization="mean",normalize_cp=False, smoothing=False, smoothing_duration=1).dropna()
df_cp_indices["LWP"] = (df_cp_indices_country.capacity_factor<0.1).astype("int")
countries = LWP_countries.corr()["DE"].sort_values(ascending=False).index[1:11]

for window_size in [20] :
    print("window_size",window_size)



    Xs = []
    ys = []

    for country in countries :
        if country not in main_country:
            df_cp_indices_country = get_cp_indices_df(df_cp, df_indices,indices=indices, selected_coutry=[country], data_resolution=data_resolution, normalization="mean",normalize_cp=False, smoothing=False, smoothing_duration=1).dropna()
            df_cp_indices[country] = df_cp_indices_country.capacity_factor



    X, y = timeseries_generator(df_cp_indices,window_size,target="LWP")


    

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,shuffle = True)

    X_train=X_train.reshape(-1,X_train.shape[1]*X_train.shape[2])
    X_test=X_test.reshape(-1,X_test.shape[1]*X_test.shape[2])


    oversample = RandomOverSampler(sampling_strategy='minority')
    X_train, y_train = oversample.fit_resample(X_train, y_train)


    clf = catboost.CatBoostClassifier()
    clf.fit(X_train,y_train, verbose=True)
    y_pred = clf.predict(X_test)


    df_test = pd.DataFrame(y_test, columns =["LWP"])
    df_test["LWP_pred"] = y_pred
    

    print("""Recall :
    What proportion of actual positives was identified correctly""",recall_score(y_test,y_pred))
    print("""Precision:
    What proportion of positive identifications was actually correctly""",precision_score(y_test,y_pred))
    print("""Accuracy:
    """,accuracy_score(y_test,y_pred))




# With LWP of other countries :

## All countries

In [None]:
data_resolution = "d"
normalize_cp=True

indices = ["nao","ao"]
main_country = ["DE"]

df_cp_indices = get_cp_indices_df(df_cp, df_indices,indices=indices, selected_coutry=[country], data_resolution=data_resolution, normalization="mean",normalize_cp=False, smoothing=False, smoothing_duration=1).dropna()
df_cp_indices["LWP"] = (df_cp_indices_country.capacity_factor<0.1).astype("int")
countries = df_cp.country.unique()

for window_size in [50,100,200] :
    print("window_size",window_size)



    Xs = []
    ys = []

    for country in countries :
        if country not in main_country:
            df_cp_indices_country = get_cp_indices_df(df_cp, df_indices,indices=indices, selected_coutry=[country], data_resolution=data_resolution, normalization="mean",normalize_cp=False, smoothing=False, smoothing_duration=1).dropna()
            df_cp_indices[country] = (df_cp_indices_country.capacity_factor<0.1)



    X, y = timeseries_generator(df_cp_indices,window_size,target="LWP")


    

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,shuffle = True)

    X_train=X_train.reshape(-1,X_train.shape[1]*X_train.shape[2])
    X_test=X_test.reshape(-1,X_test.shape[1]*X_test.shape[2])


    oversample = RandomOverSampler(sampling_strategy='minority')
    X_train, y_train = oversample.fit_resample(X_train, y_train)


    clf = catboost.CatBoostClassifier()
    clf.fit(X_train,y_train, verbose=True)
    y_pred = clf.predict(X_test)


    df_test = pd.DataFrame(y_test, columns =["LWP"])
    df_test["LWP_pred"] = y_pred
    

    print("""Recall :
    What proportion of actual positives was identified correctly""",recall_score(y_test,y_pred))
    print("""Precision:
    What proportion of positive identifications was actually correctly""",precision_score(y_test,y_pred))
    print("""Accuracy:
    """,accuracy_score(y_test,y_pred))




## Using Neigbors

In [20]:
df_cp_indices

Unnamed: 0_level_0,capacity_factor,nao,ao,LWP,LU,BE,CH,ES,NL,DE,CZ,GB,HU,AT
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1982-01-01,0.432496,-1.625401,-2.135096,0,False,False,False,False,True,False,False,True,True,True
1982-01-02,0.451882,-1.548444,-2.175087,0,False,False,False,False,False,False,False,False,False,False
1982-01-03,0.591650,-1.449843,-2.246796,0,False,False,False,False,False,False,False,False,False,False
1982-01-04,0.804432,-0.925572,-1.635895,0,False,False,False,False,False,False,False,False,False,False
1982-01-05,0.763074,-0.843805,-1.331134,0,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-27,0.314128,-0.190872,-1.449040,0,True,True,False,True,False,False,False,False,False,False
2019-12-28,0.128708,-0.039362,-0.585091,0,True,True,False,True,False,False,False,False,False,False
2019-12-29,0.134690,0.086895,0.400899,0,True,False,False,True,False,False,False,False,False,False
2019-12-30,0.126536,0.282896,1.286912,0,True,False,True,True,False,False,False,False,True,True


In [25]:
data_resolution = "d"
normalize_cp=True
window_size = 10
indices = ["nao","ao"]
for main_country in  ["FR","DE","GB","DK","NO"] :

    print("country: ", main_country)

    df_cp_indices = get_cp_indices_df(df_cp, df_indices,indices=indices, selected_coutry=[main_country], data_resolution=data_resolution, normalization="mean",normalize_cp=False, smoothing=False, smoothing_duration=1).dropna()
    df_cp_indices["LWP"] = (df_cp_indices.capacity_factor<0.1).astype("int")
    countries = LWP_countries.corr()[main_country].sort_values(ascending=False).index[1:11]




    Xs = []
    ys = []

    for country in countries :
        if country not in main_country:
            df_cp_indices_country = get_cp_indices_df(df_cp, df_indices,indices=indices, selected_coutry=[country], data_resolution=data_resolution, normalization="mean",normalize_cp=False, smoothing=False, smoothing_duration=1).dropna()
            df_cp_indices[country] = (df_cp_indices_country.capacity_factor<0.1)



    X, y = timeseries_generator(df_cp_indices,window_size,target="LWP")


    

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,shuffle = True)

    X_train=X_train.reshape(-1,X_train.shape[1]*X_train.shape[2])
    X_test=X_test.reshape(-1,X_test.shape[1]*X_test.shape[2])


    oversample = RandomOverSampler(sampling_strategy='minority')
    X_train, y_train = oversample.fit_resample(X_train, y_train)


    clf = catboost.CatBoostClassifier()
    clf.fit(X_train,y_train, verbose=False)
    y_pred = clf.predict(X_test)


    df_test = pd.DataFrame(y_test, columns =["LWP"])
    df_test["LWP_pred"] = y_pred

    y_pred_proba = clf.predict_proba(X_test)[:, 1] 

    print("""   Recall: What proportion of actual positives was identified correctly""",recall_score(y_test,y_pred))
    print("""   Precision: What proportion of positive identifications was actually positive""",precision_score(y_test,y_pred))
    print("""   Accuracy:""",accuracy_score(y_test,y_pred))
    print("""   Roc auc:""",roc_auc_score(y_test.values, y_pred_proba))





country:  FR
   Recall: What proportion of actual positives was identified correctly 0.5473071324599709
   Precision: What proportion of positive identifications was actually positive 0.43119266055045874
   Accuracy: 0.8236836355691501
   Roc auc: 0.8219900988987551
country:  DE
   Recall: What proportion of actual positives was identified correctly 0.533249686323714
   Precision: What proportion of positive identifications was actually positive 0.43859649122807015
   Accuracy: 0.7998689097662224
   Roc auc: 0.8162696753035524
country:  GB
   Recall: What proportion of actual positives was identified correctly 0.5775316455696202
   Precision: What proportion of positive identifications was actually positive 0.4661558109833972
   Accuracy: 0.850338649770592
   Roc auc: 0.8611064317915645
country:  DK
   Recall: What proportion of actual positives was identified correctly 0.4458041958041958
   Precision: What proportion of positive identifications was actually positive 0.3965785381026439

In [26]:
data_resolution = "d"
normalize_cp=True
window_size = 10
indices = ["nao","ao"]
for main_country in  ["FR","DE","GB","DK","NO"] :

    print("country: ", main_country)

    df_cp_indices = get_cp_indices_df(df_cp, df_indices,indices=indices, selected_coutry=[main_country], data_resolution=data_resolution, normalization="mean",normalize_cp=False, smoothing=False, smoothing_duration=1).dropna()
    df_cp_indices["LWP"] = (df_cp_indices.capacity_factor<0.1).astype("int")
    df_cp_indices = df_cp_indices.drop(["capacity_factor"], axis=1)
    countries = LWP_countries.corr()[main_country].sort_values(ascending=False).index[1:11]




    Xs = []
    ys = []

    for country in countries :
        if country not in main_country:
            df_cp_indices_country = get_cp_indices_df(df_cp, df_indices,indices=indices, selected_coutry=[country], data_resolution=data_resolution, normalization="mean",normalize_cp=False, smoothing=False, smoothing_duration=1).dropna()
            df_cp_indices[country] = (df_cp_indices_country.capacity_factor<0.1)



    X, y = timeseries_generator(df_cp_indices,window_size,target="LWP")


    

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,shuffle = True)

    X_train=X_train.reshape(-1,X_train.shape[1]*X_train.shape[2])
    X_test=X_test.reshape(-1,X_test.shape[1]*X_test.shape[2])


    oversample = RandomOverSampler(sampling_strategy='minority')
    X_train, y_train = oversample.fit_resample(X_train, y_train)


    clf = catboost.CatBoostClassifier()
    clf.fit(X_train,y_train, verbose=False)
    y_pred = clf.predict(X_test)


    df_test = pd.DataFrame(y_test, columns =["LWP"])
    df_test["LWP_pred"] = y_pred

    y_pred_proba = clf.predict_proba(X_test)[:, 1] 

    print("""   Recall: What proportion of actual positives was identified correctly""",recall_score(y_test,y_pred))
    print("""   Precision: What proportion of positive identifications was actually positive""",precision_score(y_test,y_pred))
    print("""   Accuracy:""",accuracy_score(y_test,y_pred))
    print("""   Roc auc:""",roc_auc_score(y_test.values, y_pred_proba))





country:  FR
   Recall: What proportion of actual positives was identified correctly 0.37117903930131
   Precision: What proportion of positive identifications was actually positive 0.3512396694214876
   Accuracy: 0.8027091981647367
   Roc auc: 0.7379100668679817
country:  DE
   Recall: What proportion of actual positives was identified correctly 0.5084525357607282
   Precision: What proportion of positive identifications was actually positive 0.42778993435448576
   Accuracy: 0.803146165610662
   Roc auc: 0.7791853574980057
country:  GB
   Recall: What proportion of actual positives was identified correctly 0.476038338658147
   Precision: What proportion of positive identifications was actually positive 0.4287769784172662
   Accuracy: 0.8415993008520866
   Roc auc: 0.7879790209620567
country:  DK
   Recall: What proportion of actual positives was identified correctly 0.2950257289879931
   Precision: What proportion of positive identifications was actually positive 0.3132969034608379
  


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



In [31]:
data_resolution = "d"
normalize_cp=True
indices = ["nao","ao"]
for main_country in  ["FR","DE","GB","DK","NO"] :

    print("country: ", main_country)

    df_cp_indices = get_cp_indices_df(df_cp, df_indices,indices=indices, selected_coutry=[main_country], data_resolution=data_resolution, normalization="mean",normalize_cp=False, smoothing=False, smoothing_duration=1).dropna()
    df_cp_indices["LWP"] = (df_cp_indices.capacity_factor<0.1).astype("int")
    countries = LWP_countries.corr()[main_country].sort_values(ascending=False).index[1:11]




    Xs = []
    ys = []

    for country in countries :
        if country not in main_country:
            df_cp_indices_country = get_cp_indices_df(df_cp, df_indices,indices=indices, selected_coutry=[country], data_resolution=data_resolution, normalization="mean",normalize_cp=False, smoothing=False, smoothing_duration=1).dropna()
            df_cp_indices[country] = (df_cp_indices_country.capacity_factor<0.1)

    for window_size in [1,10,20,50,200] :
        print("""   window_size =""", window_size)

        X, y = timeseries_generator(df_cp_indices,window_size,target="LWP")


        

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,shuffle = True)

        X_train=X_train.reshape(-1,X_train.shape[1]*X_train.shape[2])
        X_test=X_test.reshape(-1,X_test.shape[1]*X_test.shape[2])


        oversample = RandomOverSampler(sampling_strategy='minority')
        X_train, y_train = oversample.fit_resample(X_train, y_train)


        clf = catboost.CatBoostClassifier()
        clf.fit(X_train,y_train, verbose=False)
        y_pred = clf.predict(X_test)


        df_test = pd.DataFrame(y_test, columns =["LWP"])
        df_test["LWP_pred"] = y_pred

        y_pred_proba = clf.predict_proba(X_test)[:, 1] 

        print("""       Recall:""",recall_score(y_test,y_pred))
        print("""       Precision:""",precision_score(y_test,y_pred))
        print("""       Accuracy:""",accuracy_score(y_test,y_pred))
        print("""       Roc auc:""",roc_auc_score(y_test.values, y_pred_proba))





country:  FR
   window_size = 1
       Recall: 0.6463932107496464
       Precision: 0.35619641465315666
       Accuracy: 0.7650655021834061
       Roc auc: 0.8049255517562378
   window_size = 10
       Recall: 0.5155367231638418
       Precision: 0.4269005847953216
       Accuracy: 0.8180030587721214
       Roc auc: 0.8206134375369627
   window_size = 20
       Recall: 0.4862914862914863
       Precision: 0.43149807938540335
       Accuracy: 0.825098382160035
       Roc auc: 0.8185097561546929
   window_size = 50
       Recall: 0.4692982456140351
       Precision: 0.4579172610556348
       Accuracy: 0.8372042068361086
       Roc auc: 0.8259582052209562
   window_size = 200
       Recall: 0.46107784431137727
       Precision: 0.43441466854724964
       Accuracy: 0.8314507198228128
       Roc auc: 0.8130633715672373
country:  DE
   window_size = 1
       Recall: 0.7193211488250653
       Precision: 0.3932905067808708
       Accuracy: 0.767467248908297
       Roc auc: 0.8268455093985194
 