In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import plotly.graph_objects as go
import plotly
plotly.offline.init_notebook_mode(connected=True)

# Read the Belgian indicators; the year column is kept aside for the plots.
file_name = '/home/cj/Bureau/Master2/big-data/03_MILESTONES/belgium_information.csv'
bel_info = pd.read_csv(file_name)
years = bel_info['Years']

# Only the explanatory variables stay in the frame.
bel_info = bel_info.drop(columns=["Years", "fossil consumption"])
names = bel_info.columns

# Standardise every remaining column with scikit-learn's StandardScaler and
# wrap the resulting array back into a labelled frame.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(bel_info)
scaled_bel_info = pd.DataFrame(scaled_data, columns=names)

Let's plot the year-over-year variation of each variable as a function of time.

In [2]:
# Year-over-year change of every standardised variable, divided by 100 exactly
# as in the original loop. Row k holds value[k + 1] - value[k].
# `diff()` adapts to the number of rows, so the cell no longer assumes a
# 39-row input (the former hard-coded `range(38)`).
temporal_var = scaled_bel_info.diff().iloc[1:].div(100).reset_index(drop=True)

# Per-column lists, kept because the original cell exposed this dictionary.
temporal_var_dict = {column: temporal_var[column].tolist() for column in temporal_var.columns}

In [3]:
# Pairwise correlation of the unscaled indicators.
corr = bel_info.corr()
# `Styler.set_precision` was removed in pandas 2.0; an explicit two-significant-
# digit format string works on old and new pandas alike.
corr.style.background_gradient(cmap='coolwarm').format("{:.2g}")

Unnamed: 0,population,gdp per capita,Carbon intensity of fossil energy,primary intensity,nuclear consumption,renewable consumption
population,1.0,0.91,-0.94,-0.78,0.23,0.94
gdp per capita,0.91,1.0,-0.97,-0.92,0.47,0.75
Carbon intensity of fossil energy,-0.94,-0.97,1.0,0.89,-0.49,-0.78
primary intensity,-0.78,-0.92,0.89,1.0,-0.65,-0.64
nuclear consumption,0.23,0.47,-0.49,-0.65,1.0,0.037
renewable consumption,0.94,0.75,-0.78,-0.64,0.037,1.0


1. Population
2. carbon intensity of fossil energy
3. gdp per capita
4. primary intensity
5. nuclear consumption
6. renewable energy

In [4]:
# Correlation matrix of the year-over-year variations.
corr = temporal_var.corr()
# `Styler.set_precision` was removed in pandas 2.0; an explicit two-significant-
# digit format string works on old and new pandas alike.
corr.style.background_gradient(cmap='coolwarm').format("{:.2g}")

Unnamed: 0,population,gdp per capita,Carbon intensity of fossil energy,primary intensity,nuclear consumption,renewable consumption
population,1.0,0.028,0.04,0.047,-0.31,0.64
gdp per capita,0.028,1.0,-0.28,-0.62,0.086,-0.036
Carbon intensity of fossil energy,0.04,-0.28,1.0,0.085,-0.28,0.3
primary intensity,0.047,-0.62,0.085,1.0,-0.1,0.012
nuclear consumption,-0.31,0.086,-0.28,-0.1,1.0,-0.52
renewable consumption,0.64,-0.036,0.3,0.012,-0.52,1.0


This correlation matrix is not used yet ..

In [5]:
from plotly.subplots import make_subplots

def plotly_descriptive_variable(fig, names, temporal_var, v, v_list):
    """Overlay one reference variable with five others, one pair per subplot.

    Parameters
    ----------
    fig : figure created by ``make_subplots`` with at least a 3x2 grid.
    names : sequence of column labels of ``temporal_var``.
    temporal_var : frame holding the series to draw.
    v : position in ``names`` of the reference variable (drawn in black).
    v_list : positions in ``names`` of the five variables to compare against.

    The x axis is the module-level ``years`` object; the figure is displayed
    with ``fig.show()`` and nothing is returned.
    """
    # Grid cells used, in drawing order: (row, col).
    cells = [(1, 1), (1, 2), (2, 1), (2, 2), (3, 1)]
    reference = names[v]

    for slot, (grid_row, grid_col) in enumerate(cells):
        other = names[v_list[slot]]
        fig.add_trace(
            go.Scatter(x=years, y=temporal_var[reference], opacity=0.8, line=dict(color="Black")),
            row=grid_row, col=grid_col,
        )
        fig.add_trace(go.Scatter(x=years, y=temporal_var[other]), row=grid_row, col=grid_col)
        fig.update_xaxes(title_text="Years", row=grid_row, col=grid_col)
        fig.update_yaxes(title_text="Pourcentage", row=grid_row, col=grid_col)

    # Figure-level title and size.
    fig.update_layout(
        title_text=f"Variation in function of time of {names[v]} with respect to other predictive variables\n",
        height=1000,
        width=1000,
        showlegend=False,
    )

    fig.show()
    
# 3x2 grid: population (index 0) against each of the five other variables.
fig = make_subplots(
    rows=3,
    cols=2,
    subplot_titles=tuple(
        "Population - " + other
        for other in ("Gdp per capita", "Carbon intensity of fossil energy",
                      "Primary intensity", "Nuclear consumption",
                      "Renewable consumption")
    ),
)

plotly_descriptive_variable(fig, names, temporal_var, v=0, v_list=[1, 2, 3, 4, 5])

In [7]:
# 3x2 grid: gdp per capita (index 1) against each of the five other variables.
fig = make_subplots(
    rows=3,
    cols=2,
    subplot_titles=tuple(
        "Gdp per capita - " + other
        for other in ("Population", "Carbon intensity of fossil energy",
                      "Primary intensity", "Nuclear consumption",
                      "Renewable consumption")
    ),
)

plotly_descriptive_variable(fig, names, temporal_var, v=1, v_list=[0, 2, 3, 4, 5])

In [8]:
# 3x2 grid: carbon intensity (index 2) against each of the five other variables.
fig = make_subplots(
    rows=3,
    cols=2,
    subplot_titles=tuple(
        "Carbon intensity of fossil energy - " + other
        for other in ("Population", "Gdp per capita", "Primary intensity",
                      "Nuclear consumption", "Renewable consumption")
    ),
)

plotly_descriptive_variable(fig, names, temporal_var, v=2, v_list=[0, 1, 3, 4, 5])

Subset selection on a regression model

- Best subset selection
- Forward stepwise selection
- Criteria for choosing the optimal model : Cp, AIC, BIC, R2adj

Explanations from the book "Introduction to Statistical Learning (ISLR)" Chapter 6

In [10]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score
from RegscorePy import *
import itertools
from tqdm.notebook import trange

import math  
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge

from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV

# Frame consumed by the subset-selection cells below: the standardised indicators.
data = scaled_bel_info
# Disabled alternative: run the same analysis on the raw (unscaled) indicators.
#data = bel_info

def regression_model(X, y_true, model, n_obs=39, n_test=12):
    """Fit a model on the first rows and score it on the held-out tail.

    Parameters
    ----------
    X : predictors, sliceable by row and exposing ``shape``.
    y_true : target values aligned with ``X``.
    model : ``'poly'`` (degree-4 polynomial features + Ridge), ``'kr'``
        (grid-searched RBF kernel ridge), ``'lin'`` (ordinary least squares),
        or any object exposing ``fit``/``predict``.
    n_obs : number of leading rows considered (default 39, the former constant).
    n_test : number of trailing rows of that window used for testing
        (default 12, the former constant).

    Returns
    -------
    (mse, r2, bic) computed on the test rows.

    Raises
    ------
    ValueError
        If ``model`` is a string that is not one of the supported names.
    """
    if isinstance(model, str):
        if model == 'poly':
            model = make_pipeline(PolynomialFeatures(4), Ridge())
        elif model == 'kr':
            # NOTE(review): `iid` was removed from GridSearchCV in
            # scikit-learn 0.24; kept so results on the original environment
            # do not change. Drop it when upgrading.
            model = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5, iid=True,
                                 param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
                                             "gamma": np.logspace(-2, 2, 5)})
        elif model == 'lin':
            model = LinearRegression()
        else:
            raise ValueError(
                "unknown model name %r; expected 'poly', 'kr' or 'lin'" % (model,))

    # Chronological split: rows [0, a) train, rows [a, b) test.
    a = n_obs - n_test
    b = n_obs

    x_train, y_train = X[0:a], y_true[0:a]
    x_test, y_test = X[a:b], y_true[a:b]

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # The parameter count handed to the BIC helper is the number of input
    # columns, not the number of expanded polynomial/kernel features.
    bics = bic.bic(y_test, y_pred, x_test.shape[1])

    return mse, r2, bics

def criteria_matrix(nb_param_list, name_var_list, criteria, name_criteria):
    """Tabulate one criterion value per candidate variable subset.

    Parameters
    ----------
    nb_param_list : number of variables of each subset.
    name_var_list : variable names of each subset (any iterable per entry).
    criteria : criterion value of each subset; its length drives the row count.
    name_criteria : label of the criterion column (e.g. ``"mse"``).

    Returns
    -------
    DataFrame with columns ``"number of variables"``, ``"variables"`` (a list
    per row) and ``name_criteria``. Empty inputs yield an empty frame with
    those three columns instead of raising.
    """
    columns = ["number of variables", "variables", name_criteria]
    rows = [
        [nb_param_list[i], list(name_var_list[i]), criteria[i]]
        for i in range(len(criteria))
    ]
    return pd.DataFrame(rows, columns=columns)

def best_combo(n, nb_param_list, name_var_list, criteria, name_criteria):
    """Return, for each subset size 1..n, the row with the best criterion.

    "Best" means the largest value when ``name_criteria`` is ``"r2"`` and the
    smallest value for any other criterion. The result is the concatenation
    of the (at most one-row) winners, in increasing subset size.
    """
    table = criteria_matrix(nb_param_list, name_var_list, criteria, name_criteria)
    larger_is_better = name_criteria == "r2"

    winners = [
        table[table['number of variables'] == size]
        .sort_values(by=[name_criteria], ascending=not larger_is_better)
        .head(1)
        for size in range(1, n + 1)
    ]

    return pd.concat(winners)

def regression_metrics(data, var_name, model):
    """Score ``model`` on every non-empty subset of the other columns.

    ``var_name`` is the target column of ``data``; all remaining columns are
    candidate predictors. Each combination is passed to ``regression_model``.

    Returns five parallel lists: subset sizes, subset names (tuples), MSE,
    R2 and BIC values.
    """
    target = data[var_name]
    candidates = data.drop(var_name, axis=1)

    sizes, subsets, mse_values, r2_values, bic_values = [], [], [], [], []

    for k in trange(1, len(candidates.columns) + 1, desc = 'Loop...'):
        for subset in itertools.combinations(candidates.columns, k):
            scores = regression_model(candidates[list(subset)], target, model)
            sizes.append(len(subset))
            subsets.append(subset)
            mse_values.append(scores[0])
            r2_values.append(scores[1])
            bic_values.append(scores[2])

    return sizes, subsets, mse_values, r2_values, bic_values

## POPULATION ##

In [11]:
# Subset search for 'population' with the degree-4 polynomial model ('poly').
result = regression_metrics(data, 'population', 'poly')
# result = (number of variables, variable names, mse, r2, bic)

# Best subset of each size (1..5) according to each criterion.
best_combo_mse = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                            criteria=result[2], name_criteria="mse")
best_combo_r2 = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                           criteria=result[3], name_criteria="r2")
best_combo_bic = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                            criteria=result[4], name_criteria="bic")

best_combo_bic

HBox(children=(IntProgress(value=0, description='Loop...', max=5, style=ProgressStyle(description_width='initi…




Unnamed: 0,number of variables,variables,bic
1,1,[Carbon intensity of fossil energy],-9.169248
5,2,"[gdp per capita, Carbon intensity of fossil en...",-11.438068
15,3,"[gdp per capita, Carbon intensity of fossil en...",-10.282001
25,4,"[gdp per capita, Carbon intensity of fossil en...",7.518375
30,5,"[gdp per capita, Carbon intensity of fossil en...",24.784559


### Conclusion ###

Assumption : None of these variables can explain the population variable.

## ASSUMPTION FOR THE OTHER VARIABLES ##

The BIC value represents the loss of information incurred when using a specific model (the lower, the better).

## gdp per capita ##

In [12]:
# Subset search for 'gdp per capita' with kernel ridge ('kr'); use 'poly' for
# the degree-4 polynomial model.
result = regression_metrics(data, 'gdp per capita', 'kr')
# result = (number of variables, variable names, mse, r2, bic)

# Best subset of each size (1..5) according to each criterion.
best_combo_mse = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                            criteria=result[2], name_criteria="mse")
best_combo_r2 = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                           criteria=result[3], name_criteria="r2")
best_combo_bic = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                            criteria=result[4], name_criteria="bic")

best_combo_bic

HBox(children=(IntProgress(value=0, description='Loop...', max=5, style=ProgressStyle(description_width='initi…




Unnamed: 0,number of variables,variables,bic
1,1,[Carbon intensity of fossil energy],-25.608485
6,2,"[population, primary intensity]",-37.807048
21,3,"[Carbon intensity of fossil energy, primary in...",-31.436369
29,4,"[Carbon intensity of fossil energy, primary in...",-13.398715
30,5,"[population, Carbon intensity of fossil energy...",-3.604507


### Conclusion ###

With a polynomial regression of degree 4 we obtain a BIC of **-27**, and the chosen variables are **population and primary intensity**.

With a kernel ridge regression we obtain a BIC of **-37**, and the chosen variables are **population and primary intensity**.

## Carbon intensity of fossil energy ##

In [13]:
# Subset search for 'Carbon intensity of fossil energy' with kernel ridge
# ('kr'); use 'poly' for the degree-4 polynomial model.
result = regression_metrics(data, 'Carbon intensity of fossil energy', 'kr')
# result = (number of variables, variable names, mse, r2, bic)

# Best subset of each size (1..5) according to each criterion.
best_combo_mse = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                            criteria=result[2], name_criteria="mse")
best_combo_r2 = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                           criteria=result[3], name_criteria="r2")
best_combo_bic = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                            criteria=result[4], name_criteria="bic")

best_combo_bic

HBox(children=(IntProgress(value=0, description='Loop...', max=5, style=ProgressStyle(description_width='initi…




Unnamed: 0,number of variables,variables,bic
1,1,[gdp per capita],-34.154942
9,2,"[gdp per capita, primary intensity]",-10.918491
20,3,"[population, nuclear consumption, renewable co...",-14.007072
27,4,"[population, gdp per capita, nuclear consumpti...",-27.268288
30,5,"[population, gdp per capita, primary intensity...",-26.642802


### Conclusion ###

With a polynomial regression of degree 4 we obtain a BIC of **-26**, and the chosen variables are **population and gdp**.

With a kernel ridge regression we obtain a BIC of **-34**, and the chosen variable is **gdp per capita** (see the one-variable row of the table above).

## primary intensity ##

In [14]:
# Subset search for 'primary intensity' with kernel ridge ('kr'); use 'poly'
# for the degree-4 polynomial model.
result = regression_metrics(data, 'primary intensity', 'kr')
# result = (number of variables, variable names, mse, r2, bic)

# Best subset of each size (1..5) according to each criterion.
best_combo_mse = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                            criteria=result[2], name_criteria="mse")
best_combo_r2 = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                           criteria=result[3], name_criteria="r2")
best_combo_bic = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                            criteria=result[4], name_criteria="bic")

best_combo_bic

HBox(children=(IntProgress(value=0, description='Loop...', max=5, style=ProgressStyle(description_width='initi…




Unnamed: 0,number of variables,variables,bic
2,1,[Carbon intensity of fossil energy],-46.936742
10,2,"[gdp per capita, nuclear consumption]",-34.17506
21,3,"[gdp per capita, Carbon intensity of fossil en...",-36.559097
25,4,"[population, gdp per capita, Carbon intensity ...",-29.297197
30,5,"[population, gdp per capita, Carbon intensity ...",-14.277249


### Conclusion ###

With a polynomial regression of degree 4 we obtain a BIC of **-43**, and the chosen variable is **gdp per capita**.

With a kernel ridge regression we obtain a BIC of **-46**, and the chosen variable is **Carbon intensity of fossil energy**.

## nuclear consumption ##

In [15]:
# Subset search for 'nuclear consumption' with kernel ridge ('kr'); use 'poly'
# for the degree-4 polynomial model.
result = regression_metrics(data, 'nuclear consumption', 'kr')
# result = (number of variables, variable names, mse, r2, bic)

# Best subset of each size (1..5) according to each criterion.
best_combo_mse = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                            criteria=result[2], name_criteria="mse")
best_combo_r2 = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                           criteria=result[3], name_criteria="r2")
best_combo_bic = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                            criteria=result[4], name_criteria="bic")

best_combo_bic

HBox(children=(IntProgress(value=0, description='Loop...', max=5, style=ProgressStyle(description_width='initi…




Unnamed: 0,number of variables,variables,bic
0,1,[population],-8.639573
8,2,"[population, renewable consumption]",-5.636557
22,3,"[gdp per capita, Carbon intensity of fossil en...",-3.256243
26,4,"[population, gdp per capita, Carbon intensity ...",-0.108183
30,5,"[population, gdp per capita, Carbon intensity ...",2.168793


### Conclusion ###

With a polynomial regression of degree 4 we obtain a BIC of **-8**, and the chosen variables are **gdp per capita and Carbon intensity of fossil energy**.

With a kernel ridge regression we obtain a BIC of **-8**, and the chosen variable is **population**.

## renewable consumption ##

In [16]:
# Subset search for 'renewable consumption' with kernel ridge ('kr'); use
# 'poly' for the degree-4 polynomial model.
result = regression_metrics(data, 'renewable consumption', 'kr')
# result = (number of variables, variable names, mse, r2, bic)

# Best subset of each size (1..5) according to each criterion.
best_combo_mse = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                            criteria=result[2], name_criteria="mse")
best_combo_r2 = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                           criteria=result[3], name_criteria="r2")
best_combo_bic = best_combo(n=5, nb_param_list=result[0], name_var_list=result[1],
                            criteria=result[4], name_criteria="bic")

best_combo_bic

HBox(children=(IntProgress(value=0, description='Loop...', max=5, style=ProgressStyle(description_width='initi…




Unnamed: 0,number of variables,variables,bic
0,1,[population],13.473836
8,2,"[population, nuclear consumption]",13.197902
19,3,"[population, Carbon intensity of fossil energy...",13.638265
28,4,"[population, Carbon intensity of fossil energy...",16.671063
30,5,"[population, gdp per capita, Carbon intensity ...",23.116817


With a polynomial regression of degree 4 we obtain a BIC of **11**, and the chosen variables are **gdp per capita and Carbon intensity of fossil energy**.

With a kernel ridge regression we obtain a BIC of **13**, and the chosen variables are **population and nuclear consumption**.

# Initialization of the objects representing the predictive variables #

We are going to use a class called `Predictive_Var` to track the evolution of each variable.

Gdp -> Poly : **population and primary intensity**, Kr : **population and primary intensity**

Carbon intensity of fossil energy -> Poly : **population and gdp**, Kr : **gdp per capita**

Primary intensity -> Poly : **gdp per capita**, Kr : **Carbon intensity of fossil energy**

Nuclear -> Poly : **gdp per capita and Carbon intensity of fossil energy**, Kr : **population**

Ren -> Poly : **gdp per capita and Carbon intensity of fossil energy**, Kr : **population and nuclear consumption**

In [373]:
class Predictive_Var:
    """State of one explanatory variable while a scenario is being forecast.

    Attributes set by ``__init__``:
        name        label of the variable.
        data        historical series (indexable by integer position/label).
        data_2018   entry of ``data`` at index 38, i.e. the one paired with
                    the last entry of ``years``.
        last_value  most recent value (starts at ``data_2018``).
        slope       cursor level divided by 100, set by ``cursor_scenario``.
        actual_data forecast values appended by ``forecast_new_value``.
        evol        step-to-step change, in percent of the previous value.
        model       fitted estimator exposing ``predict``, or ``False`` when
                    the variable only follows its own slope.
        var_names   predictor columns read from the snapshot for ``model``.
        cursor      cursor levels offered for this variable (stored only).
    """

    # Calendar years of the historical series, 1980..2018 inclusive.
    years = list(range(1980, 2019))

    def __init__(self, name, data, var_names=False, cursor=False, model=False):
        self.name = name
        self.model = model
        self.data = data
        # Same entry as the former hard-coded data[38].
        self.data_2018 = data[len(self.years) - 1]
        self.last_value = self.data_2018 # Value of this variable in 2018
        self.slope = 0 # Evolution in function of time of the value of this variable
        self.actual_data = list()
        self.type = None # How is going to evolve. Ex : lin, exp, quadra.
        self.cursor = cursor
        self.var_names = var_names
        self.evol = list()
        self.init = True

    def cursor_scenario(self, level, init=True): # initial scenario induced by the value of a new cursor
        """Set the slope from a cursor level and rewind to the 2018 value.

        ``level`` is divided by 100; ``False`` means "no cursor" (slope 0).
        ``init`` is accepted for compatibility and is not used.
        """
        if level is not False:
            self.slope = level/100
        else:
            self.slope = 0

        self.last_value = self.data_2018

    def forecast_new_value(self, list_pred):
        """Append the next forecast value and its relative evolution.

        ``list_pred`` is a one-row frame of the latest values of all
        variables; it is only read when a model is attached.
        """
        previous = self.last_value

        if self.model is False: # for population var
            new_value = previous + self.slope*previous
        else:
            x_test = list_pred[self.var_names]
            # predict() returns a one-element array; take that element
            # explicitly instead of relying on float(array).
            y_pred = float(np.ravel(self.model.predict(x_test))[0])
            # NOTE(review): slope is already level/100, so the extra /100 here
            # makes the cursor 100x weaker than in the model-less branch -
            # confirm this damping is intended.
            slope_effect = float(self.slope*(previous/100))
            new_value = float(y_pred + slope_effect)

        self.actual_data.append(new_value)

        # The evolution is measured against the value *before* this step.
        # The previous implementation overwrote last_value first in the
        # model-less branch, so that branch always recorded 0.
        # NOTE(review): a previous value of exactly 0 makes this division fail.
        self.evol.append(float(((new_value - previous)/abs(previous))*100))
        self.last_value = new_value

    # to modify
    def show_data(self, data=False):
        """Plot the historical series and, optionally, a forecast after it.

        ``data`` is a sequence of forecast values; it is copied, never
        modified. The forecast trace starts at the last historical point.
        """
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=self.years, y=self.data, name=self.name, opacity=0.8))

        if data is not False:
            # Build a new list so the caller's forecast is left untouched.
            forecast = [self.data[len(self.data)-1]] + list(data)
            last_year = self.years[-1]
            years_f = [last_year + step for step in range(len(forecast))]
            fig.add_trace(go.Scatter(x=years_f, y=forecast, name=self.name, opacity=0.8))

        fig.update_layout(title_text=self.name)

        fig.update_xaxes(title_text="Years")
        fig.update_yaxes(title_text="[ ]")
        fig.update_layout(legend_orientation="h")
        fig.show()

def model_trained(data, y_name, x_name, model):
    """Fit a model on all rows of ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : frame holding both the target and the predictors.
    y_name : label of the target column.
    x_name : label(s) of the predictor columns.
    model : ``'poly'`` (degree-4 polynomial features + Ridge), ``'kr'``
        (grid-searched RBF kernel ridge), or any object exposing ``fit``.

    Raises
    ------
    ValueError
        If ``model`` is a string other than ``'poly'`` or ``'kr'``.
    """
    if isinstance(model, str):
        if model == 'poly':
            model = make_pipeline(PolynomialFeatures(4), Ridge())
        elif model == 'kr':
            # NOTE(review): `iid` was removed from GridSearchCV in
            # scikit-learn 0.24; kept so results on the original environment
            # do not change. Drop it when upgrading.
            model = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5, iid=True,
                                 param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
                                             "gamma": np.logspace(-2, 2, 5)})
        else:
            raise ValueError("unknown model name %r; expected 'poly' or 'kr'" % (model,))

    # No hold-out here: every available row is used for training.
    y_train = data[y_name]
    x_train = data[x_name]

    model.fit(x_train, y_train)

    return model

def get_pred(dict_var):
    """Snapshot the latest value of every variable as a one-row frame.

    ``dict_var`` maps a variable name to an object exposing ``last_value``;
    the returned frame has one column per name, in dictionary order.
    """
    latest = {name: dict_var[name].last_value for name in dict_var}
    return pd.DataFrame.from_dict(latest, orient='index').T

def init_variable(data):
    """Create one ``Predictive_Var`` per explanatory variable.

    Population has no model and only follows its cursor slope. Every other
    variable gets a kernel-ridge model (``model_trained(..., 'kr')``) fitted
    on ``data`` with the predictor set listed below.

    Returns a dict keyed by each variable's own name, so the result no longer
    depends on the module-level ``names`` being in the same order as the
    list built here.
    """

    def modelled(name, predictors, cursor):
        # Fit the model for `name`, then wrap series + model together.
        fitted = model_trained(data, name, predictors, 'kr')
        return Predictive_Var(name, data[name], predictors, model=fitted, cursor=cursor)

    var_pop = Predictive_Var("population", data['population'], cursor=[-3, -1, 0, 5, 10])

    # Earlier, smaller candidate set: ['population', 'gdp per capita']
    var_carbon = modelled(
        'Carbon intensity of fossil energy',
        ['population', 'gdp per capita', 'nuclear consumption', 'primary intensity'],
        [10, 4, 0, -2, -4])

    # Earlier, smaller candidate set: ['population', 'primary intensity']
    var_gdp = modelled(
        "gdp per capita",
        ['population', 'primary intensity', 'nuclear consumption', 'Carbon intensity of fossil energy'],
        [-4, -2, 0, 4, 10])

    # Earlier, smaller candidate set: ['gdp per capita']
    var_prim = modelled(
        "primary intensity",
        ['population', 'gdp per capita', 'nuclear consumption', 'Carbon intensity of fossil energy'],
        [10, 4, 0, -2, -4])

    # Earlier, smaller candidate set: ['gdp per capita', 'Carbon intensity of fossil energy']
    var_nuc = modelled(
        "nuclear consumption",
        ['population', 'gdp per capita', 'primary intensity', 'Carbon intensity of fossil energy'],
        [40, 20, 0, -5, -10])

    var_ren = modelled(
        "renewable consumption",
        ['nuclear consumption', 'Carbon intensity of fossil energy'],
        [-2, -1, 0, 2, 7])

    list_var = [var_pop, var_gdp, var_carbon, var_prim, var_nuc, var_ren]

    return {var.name: var for var in list_var}

def total_forecast(data, pop_sce=False, carbon_sce=False, gdp_sce=False,
                   prim_sce=False, nuc_sce=False, ren_sce=False, n_steps=12):
    """Run a scenario forward and return the updated variables.

    Parameters
    ----------
    data : frame handed to ``init_variable``.
    *_sce : cursor level of each variable (``False`` = no cursor, slope 0).
    n_steps : number of forecast steps (default 12, the former constant).

    Returns the dict built by ``init_variable``; each entry's ``actual_data``
    holds ``n_steps`` forecast values.
    """
    var_dict = init_variable(data)

    # (variable name, cursor level), in the original update order.
    scenarios = [
        ('population', pop_sce),
        ('Carbon intensity of fossil energy', carbon_sce),
        ('gdp per capita', gdp_sce),
        ('primary intensity', prim_sce),
        ('nuclear consumption', nuc_sce),
        ('renewable consumption', ren_sce),
    ]

    for name, level in scenarios:
        var_dict[name].cursor_scenario(level)

    for _ in range(n_steps):
        # One snapshot per step: every variable is advanced from the same
        # previous-step values, whatever the update order.
        predictions = get_pred(var_dict)
        for name, _level in scenarios:
            var_dict[name].forecast_new_value(predictions)

    return var_dict
#total_forecast(data, pop_sce=3)
#for name in names : var_dict[name].show_data(var_dict[name].actual_data)

def model_cons_co2(data, y_name, x_name, model, n_obs=39, n_test=12):
    """Fit a model on the leading rows and predict on both splits.

    Parameters
    ----------
    data : frame holding the target and the predictors.
    y_name : label of the target column.
    x_name : label(s) of the predictor columns.
    model : ``'poly'``, ``'kr'``, ``'lin'`` or an object exposing
        ``fit``/``predict``.
    n_obs : number of leading rows considered (default 39, the former constant).
    n_test : number of trailing rows of that window kept for testing
        (default 12, the former constant).

    Returns
    -------
    ``[y_train_predict, y_test_predict, y_train, y_test]``.

    Raises
    ------
    ValueError
        If ``model`` is a string that is not one of the supported names.
    """
    if isinstance(model, str):
        if model == 'poly':
            model = make_pipeline(PolynomialFeatures(4), Ridge())
        elif model == 'kr':
            # NOTE(review): `iid` was removed from GridSearchCV in
            # scikit-learn 0.24; kept so results on the original environment
            # do not change. Drop it when upgrading.
            model = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5, iid=True,
                                 param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
                                             "gamma": np.logspace(-2, 2, 5)})
        elif model == 'lin':
            model = LinearRegression()
        else:
            raise ValueError(
                "unknown model name %r; expected 'poly', 'kr' or 'lin'" % (model,))

    # Chronological split: rows [0, a) train, rows [a, b) test.
    a = n_obs - n_test
    b = n_obs

    x_train = data[x_name][0:a]
    y_train = data[y_name][0:a]

    x_test = data[x_name][a:b]
    y_test = data[y_name][a:b]

    model.fit(x_train, y_train)

    y_train_predict = model.predict(x_train)
    y_test_predict = model.predict(x_test)

    return [y_train_predict, y_test_predict, y_train, y_test]

def evaluation_model(y_true, y_pred):
    """Return the RMSE of each prediction series against ``y_true``.

    ``y_pred`` is a sequence of prediction series; the result is a list with
    one root-mean-squared error per series, in the same order. Every series
    (and ``y_true``) is first reshaped to a flat list of its own length.
    """
    truth = list(np.resize(y_true, (1, len(y_true)))[0])

    return [
        math.sqrt(
            mean_squared_error(
                truth,
                list(np.resize(y_pred[i], (1, len(y_pred[i])))[0]),
            )
        )
        for i in range(len(y_pred))
    ]

def plot_evaluation(y_test, y_test_predict, name_model, color_name):
    """Plot the squared residuals of each model over the test years.

    ``y_test_predict``, ``name_model`` and ``color_name`` are parallel
    sequences (one entry per model). The x axis is the slice 27:39 of the
    module-level ``years`` object. The figure is shown; nothing is returned.
    """
    first_test, end_test = 39 - 12, 39

    observed = pd.Series(np.resize(y_test, (1, len(y_test)))[0])

    fig = go.Figure()

    for idx, label in enumerate(name_model):
        flat = np.resize(y_test_predict[idx], (1, len(y_test_predict[idx])))[0]
        residual = pd.Series(flat) - observed

        fig.add_trace(
            go.Scatter(
                x=years[first_test:end_test],
                y=residual * residual,
                name=label,
                opacity=0.8,
                line=dict(color=color_name[idx]),
            )
        )

    fig.update_layout(title_text="Evaluation of the models")

    fig.update_xaxes(title_text="Years")
    fig.update_yaxes(title_text="Square of residuals [Mtoe^2]")
    fig.update_layout(legend_orientation="h")
    fig.show()

def fig_init():
    """Create the empty 1x2 figure used to compare models on fossil consumption.

    The left panel is titled for 1980-2006 and the right one for 2006-2018.
    Returns the configured figure without showing it.
    """
    titles = ("Fossil consumption between 1980 and 2006",
              "Fossil consumption between 2006 and 2018")
    fig = make_subplots(rows=1, cols=2, subplot_titles=titles)

    # Blank x titles on both panels, then the shared y label.
    for panel in (1, 2):
        fig.update_xaxes(title_text="", row=1, col=panel)
    for panel in (1, 2):
        fig.update_yaxes(title_text="Consumption [Mtoe]", row=1, col=panel)

    # Overall size and legend placement.
    fig.update_layout(title_text="", height=400, width=1000, showlegend=True)
    fig.update_layout(legend_orientation="h")

    return fig

def plotly_plot(fig, y_train_predict, y_test_predict, name_model, color_name, years):
    """Draw each model's train and test predictions on a 1x2 figure and show it.

    ``y_train_predict``, ``y_test_predict``, ``name_model`` and ``color_name``
    are parallel sequences. Train series go to the left panel against
    ``years[0:27]`` (no legend entry); test series go to the right panel
    against ``years[27:39]`` and carry the model name.
    """
    split, end = 39 - 12, 39

    for idx, label in enumerate(name_model):
        colour = dict(color=color_name[idx])

        fitted = pd.Series(np.resize(y_train_predict[idx], (1, len(y_train_predict[idx])))[0])
        fig.add_trace(
            go.Scatter(x=years[0:split], y=fitted, opacity=0.8, line=colour, showlegend=False),
            row=1, col=1,
        )

        held_out = pd.Series(np.resize(y_test_predict[idx], (1, len(y_test_predict[idx])))[0])
        fig.add_trace(
            go.Scatter(x=years[split:end], y=held_out, opacity=0.8, line=colour, name=label),
            row=1, col=2,
        )

    fig.show()
    
def CO2_consumption_model(data, y_name, x_name, model):
    """Fit the consumption model on all rows of ``data`` and return it.

    This function used to be a line-for-line copy of ``model_trained``; it
    now delegates to it so the model construction lives in one place.

    Parameters
    ----------
    data : frame holding the target and the predictors.
    y_name : label of the target column.
    x_name : label(s) of the predictor columns.
    model : ``'poly'``, ``'kr'`` or an object exposing ``fit``.
    """
    return model_trained(data, y_name, x_name, model)

def input_predictive_var(names, dict_var):
    """Gather the forecast series of the given variables into one frame.

    ``dict_var[name].actual_data`` becomes the column ``name``; columns follow
    the order of ``names`` and rows follow the forecast steps.
    """
    series_by_name = {name: dict_var[name].actual_data for name in names}
    return pd.DataFrame.from_dict(series_by_name, orient='index').transpose()

def get_data(file_name, col_delete):
    """Load a CSV, drop some columns and standardise what is left.

    Parameters
    ----------
    file_name : path of the CSV file; it must contain a ``'Years'`` column.
    col_delete : column label or list of labels to drop before scaling.

    Returns
    -------
    (years, scaled_data, names, x_name, y_name) where ``years`` is the
    ``'Years'`` column, ``scaled_data`` the standardised frame, ``names`` its
    columns, ``y_name`` the last column label and ``x_name`` all the others.
    """
    raw = pd.read_csv(file_name)
    years = raw['Years']
    kept = raw.drop(col_delete, axis=1)
    names = kept.columns

    # Standardise every kept column, then restore the column labels.
    scaled_data = pd.DataFrame(
        preprocessing.StandardScaler().fit_transform(kept),
        columns=kept.columns,
    )

    # Last column is the target, everything before it is a predictor.
    labels = list(scaled_data.columns)
    x_name = labels[:-1]
    y_name = labels[-1]

    return years, scaled_data, names, x_name, y_name

def compare_models_plot(data, y_name, x_name, show=False):
    """Plot kernel-ridge, polynomial and linear fits against the real series.

    Nothing is returned. When ``show`` is not ``True`` the function returns
    immediately: the previous version still fitted the three models and built
    a figure, only to throw everything away.

    The x axis comes from the module-level ``years`` object.
    """
    if show is not True:
        return

    res_kr = model_cons_co2(data, y_name, x_name, 'kr')
    res_poly = model_cons_co2(data, y_name, x_name, 'poly')
    res_lin = model_cons_co2(data, y_name, x_name, 'lin')

    # First entry of each list is the observed series, then one per model.
    y_train_predict = [res_kr[2], res_kr[0], res_poly[0], res_lin[0]]
    y_test_predict = [res_kr[3], res_kr[1], res_poly[1], res_lin[1]]

    name_model = ["real", "kr", "poly deg 4", "lin"]

    # Last entry previously lacked its leading '#', which is not a valid colour.
    list_colors = ['Black', '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
                   '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
    color_name = list_colors[0:5]

    fig = fig_init()
    plotly_plot(fig, y_train_predict, y_test_predict, name_model, color_name, years)
        
def var_after_scenario(data, names, years, pop_sce=False, carbon_sce=False, gdp_sce=False, prim_sce=False,
                       nuc_sce=False, ren_sce=False, show=False):
    """Forecast every explanatory variable under the given cursor levels.

    Parameters
    ----------
    data : frame of explanatory variables handed to ``total_forecast``.
        (The previous version ignored this argument and silently read the
        module-level ``data2`` instead.)
    names, years : forwarded to ``show_pred_var`` when ``show`` is ``True``.
    *_sce : cursor level per variable (``False`` = no cursor).
    show : plot the forecasts when ``True``.

    Returns the dict of ``Predictive_Var`` objects produced by
    ``total_forecast``.
    """
    dict_var = total_forecast(data, pop_sce=pop_sce, gdp_sce=gdp_sce,
                              carbon_sce=carbon_sce, prim_sce=prim_sce,
                              nuc_sce=nuc_sce, ren_sce=ren_sce)
    if show is True:
        show_pred_var(dict_var, names, years)

    return dict_var

def show_pred_var(dict_var, names, years):
    """Plot history and forecast of every variable, one subplot per variable.

    Parameters
    ----------
    dict_var : mapping name -> object exposing ``data`` (history) and
        ``actual_data`` (forecast values).
    names : labels of the variables to draw, in plotting order.
    years : x values of the historical series; the forecast starts at its
        last entry and advances by one per step.

    Fixes over the previous version:
    * ``actual_data`` is no longer modified (the last historical value used
      to be inserted into it on every call);
    * every variable in ``names`` gets a panel titled after itself (the
      fifth panel was titled "Renewable consumption" while showing the fifth
      variable, and the sixth variable was never drawn).
    """
    names = list(names)
    n_cols = 2
    n_rows = max(1, -(-len(names) // n_cols))  # ceiling division

    titles = [name[:1].upper() + name[1:] for name in names]
    fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=titles)

    last_year = int(list(years)[-1])

    for idx, name in enumerate(names):
        grid_row = idx // n_cols + 1
        grid_col = idx % n_cols + 1

        data = dict_var[name].data
        fig.add_trace(go.Scatter(x=years, y=data, opacity=0.8, line=dict(color="Black")),
                      row=grid_row, col=grid_col)

        # New list: prepend the last observation so both traces connect.
        forecast = [data[len(data)-1]] + list(dict_var[name].actual_data)
        years_f = [last_year + step for step in range(len(forecast))]

        fig.add_trace(go.Scatter(x=years_f, y=pd.Series(forecast), opacity=0.8),
                      row=grid_row, col=grid_col)
        fig.update_xaxes(title_text="Years", row=grid_row, col=grid_col)
        fig.update_yaxes(title_text="[ ]", row=grid_row, col=grid_col)

    fig.update_layout(title_text="Forecast of the predictive variables for a specific scenario",
                      height=1000, width=1000, showlegend=False)

    fig.show()

def show_consumption_emission(data_cons, data_emi, n_history=39, first_year=1980):
    """Plot fossil consumption and CO2 emission, history then forecast.

    Parameters
    ----------
    data_cons, data_emi : sequences holding the historical values followed by
        the forecast values.
    n_history : number of leading historical entries (default 39).
    first_year : calendar year of the first entry (default 1980).

    Each panel gets two traces: the first ``n_history`` points, and the
    remaining points starting from the last historical one so the lines
    connect. The previous slices (0:26 and 25:39) only ever covered the first
    39 entries, so the forecast was never displayed.
    """
    years_f = [first_year + step for step in range(len(data_cons))]

    fig = make_subplots(rows=1, cols=2, subplot_titles=("Consumption of fossil energy",
                                                        "Emission of CO2 due to fossil energy"))

    for panel, series in ((1, data_cons), (2, data_emi)):
        fig.add_trace(go.Scatter(x=years_f[:n_history], y=series[:n_history], opacity=0.8),
                      row=1, col=panel)
        fig.add_trace(go.Scatter(x=years_f[n_history - 1:], y=series[n_history - 1:], opacity=0.8),
                      row=1, col=panel)
        fig.update_xaxes(title_text="Years", row=1, col=panel)
        fig.update_yaxes(title_text="[ ]", row=1, col=panel)

    fig.update_layout(title_text=" ", height=500, width=1000, showlegend=False)

    fig.show()
    
def consumption_emission_CO2(model, data, old_data, show=False):
    """Chain historical and forecast fossil consumption and CO2 emission.

    Parameters
    ----------
    model : fitted estimator; ``model.predict(data)`` gives the forecast
        consumption. (The previous version ignored this argument and used the
        module-level ``model_CO2``.)
    data : forecast predictors; must contain the column
        ``'Carbon intensity of fossil energy'``.
    old_data : historical frame with the columns
        ``'Carbon intensity of fossil energy'`` and ``'fossil consumption'``.
    show : accepted for compatibility; not used.

    Returns
    -------
    (data_cons, data_emi) : two lists, history first then forecast. Emission
    is computed as carbon intensity times consumption, row by row.
    NOTE(review): if the inputs are standardised, this product is a product
    of scaled values rather than a physical emission - confirm.
    """
    carbon = 'Carbon intensity of fossil energy'
    consumption = 'fossil consumption'

    data_cons_f = model.predict(data)
    data_emi_f = [intensity * cons
                  for intensity, cons in zip(data[carbon].tolist(), data_cons_f)]

    old_cons = old_data[consumption].tolist()
    old_data_emi = [intensity * cons
                    for intensity, cons in zip(old_data[carbon].tolist(), old_cons)]

    data_cons = list(itertools.chain(old_cons, data_cons_f))
    data_emi = list(itertools.chain(old_data_emi, data_emi_f))

    return data_cons, data_emi

In [375]:
file_name = '/home/cj/Bureau/Master2/big-data/03_MILESTONES/belgium_information.csv'

# data1: every column except 'Years'; its last column is the target y1_name
# (presumably 'fossil consumption' - it is read from data1 further down).
years, data1, names1, x1_name, y1_name = get_data(file_name, 'Years')

# data2: explanatory variables only.
years, data2, names2, x2_name, y2_name = get_data(file_name, ['Years', 'fossil consumption'])

compare_models_plot(data1, y1_name, x1_name)  # pass show=True to display the comparison figure

# Scenario with the population cursor at 3. data2 is passed explicitly rather
# than the `data` variable defined in an earlier cell.
dict_var = var_after_scenario(data2, names2, years, pop_sce=3, show=False)
# TODO: the scenario algorithm still has to be modified.

model_CO2 = CO2_consumption_model(data1, y1_name, x1_name, "kr")
input_data = input_predictive_var(names2, dict_var)
data_cons, data_emi = consumption_emission_CO2(model_CO2, input_data, data1)


show_consumption_emission(data_cons, data_emi)

# OBSERVATIONS -> INVERSION !!!!

# Comments #

- Every model that leads to an increase of the carbon is a bad model or a bad choice of variables!
- An increase of the population cannot lead to an over-enrichment of the population.
- Too large a decrease of the primary intensity is not acceptable either.
- An increase of the population can lead neither to a decrease of the nuclear consumption nor to a decrease of the renewable energy.
- One conclusion could be to drop the renewable variable because of the lack of data.

## TODO and done ##
- kernel-based regression -> to be understood!
- for now, just work with poly 4
- PCA? It could be done along the variables that bring the most variability!
- Or, for each variable, look at which other variable is most affected by a change in it, then take that variable and repeat, and so on. I think this can be done with PCA. OR SIMPLY LOOK AT THE CORRELATION MATRIX (ONE OF THE TWO).
- validation and testing sets! The BIC choice has to be made on the testing set.
- ARIMA model on consumption in order to verify -> the second milestone will be better
- describe everything I have done in the report
- we will have to choose which model to use for each variable.
- understand kr!
- add an error term that evolves over time to the forecast
- sliding window
- compare the curve chosen with the cursor to the curve modified by the interactions
- the model cannot go below 0
- try working with unscaled data as well