# Prédictions

Notebook contenant les prédictions pour toutes les régions de France. Avant de faire les prévisions, il faut changer les types de certaines colonnes et en rajouter quelques-unes.

# Load Packages

In [135]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import operator



import enchant
d = enchant.Dict("fr")
import string
import re
# Stemming
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("french")

from nltk.corpus import stopwords
stop_words = stopwords.words('french')


import unidecode


from plotly import __version__
import cufflinks as cf
import matplotlib.pyplot as plt
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import seaborn as sns 
%matplotlib inline

from dateutil.relativedelta import relativedelta # working with dates with style
from scipy.optimize import minimize              # for function minimization

import statsmodels.formula.api as smf            # statistics and econometrics
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs

from itertools import product                    # some useful functions
from tqdm import tqdm_notebook




from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_squared_error, mean_squared_log_error

init_notebook_mode(connected=True)

# Define functions

In [147]:
def load_regions(region_inf_2014, regions_apres_2015):
    """Read the yearly regional CSV extracts and stack them into one DataFrame.

    Files are read from ``data/region<code>_<year>.csv``. Years 2009-2014 use
    the codes in ``region_inf_2014``; years 2015-2017 use the codes in
    ``regions_apres_2015``. A line is printed for every file loaded.

    Parameters
    ----------
    region_inf_2014 : iterable
        Region codes used in the file names for 2009 through 2014.
    regions_apres_2015 : iterable
        Region codes used in the file names for 2015 through 2017.

    Returns
    -------
    pandas.DataFrame
        All files concatenated in reading order. Each file keeps its own row
        index, so index labels repeat across files. An empty DataFrame is
        returned when no file is requested.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when an expected file is missing.
    """
    year_groups = (
        (range(2009, 2015), region_inf_2014),
        (range(2015, 2018), regions_apres_2015),
    )

    # Collect the pieces and concatenate once: DataFrame.append no longer
    # exists in pandas >= 2.0 and re-copied the whole frame on every call.
    frames = []
    for years, region_codes in year_groups:
        for year in years:
            for code in region_codes:
                name_file = "data/region" + str(code) + "_" + str(year) + ".csv"
                frames.append(pd.read_csv(name_file))
                print(name_file + " uploaded")

    if not frames:
        # pd.concat refuses an empty list; keep the historical empty result.
        return pd.DataFrame()
    return pd.concat(frames)


In [148]:
# Tokens removed from every label by clean_sentence (its default word list).
words_to_delete = ['forfait', 'medicament', 'consultation', 'seance', 'visite']

# Value handed to clean_sentence as min_char_word further down; 0 is falsy,
# so the minimum-length filter is effectively switched off.
min_voc = 0

# Translation table turning every ASCII punctuation character into a space.
dicti_1 = dict.fromkeys(string.punctuation, " ")
translation = str.maketrans(dicti_1)

def correct(word):
    """Spell-check a single word with the module-level dictionary ``d``.

    Parameters
    ----------
    word : str
        Token to check.

    Returns
    -------
    str
        ``word`` itself when the dictionary accepts it, otherwise the first
        suggestion. When the dictionary offers no suggestion (or the token is
        empty) the original token is returned unchanged, so callers never
        receive ``None``.
    """
    # Nothing to correct in an empty token.
    # NOTE(review): the spell checker is believed to reject empty strings — confirm.
    if not word:
        return word
    if d.check(word):
        return word
    suggestions = d.suggest(word)
    # Keep the original token rather than falling through to an implicit None,
    # which would break the later " ".join(...) in clean_sentence.
    return suggestions[0] if suggestions else word
        
def clean_sentence(sentences , words_to_delete = words_to_delete, min_char_word = None, stopw = True, correctw = False, stemm = False, delete_accent = False, delete_punct = True, verbose = False):
    """Normalise one sentence or a collection of sentences.

    The steps run in this fixed order, each one optional: lower-casing
    (always), punctuation removal, removal of ``words_to_delete``, minimum
    length / no-digit filter, stop-word removal, spelling correction,
    stemming, accent removal. Surrounding and repeated spaces are finally
    collapsed.

    Parameters
    ----------
    sentences : str or iterable of str
        A single sentence or any iterable of sentences.
    words_to_delete : list of str
        Tokens dropped from every sentence; a falsy value disables the step.
    min_char_word : int or None
        When truthy, keep only tokens containing a run of at least this many
        lower-case ASCII letters and no digit.
    stopw : bool
        Drop tokens found in the module-level ``stop_words`` list.
    correctw : bool
        Replace each token by the result of ``correct``.
    stemm : bool
        Stem each token with the module-level ``stemmer``.
    delete_accent : bool
        Transliterate each token with ``unidecode``.
    delete_punct : bool
        Replace punctuation by spaces using the module-level ``translation``.
    verbose : bool
        Print a message after each step that was applied.

    Returns
    -------
    list of str
        One cleaned sentence per input sentence, in input order.
    """
    # A bare string is a single sentence; any other iterable (tuple, Series,
    # generator) is materialised instead of being wrapped as one "sentence".
    if isinstance(sentences, str):
        sentences = [sentences]
    elif not isinstance(sentences, list):
        sentences = list(sentences)

    sentence_clean = [s.lower() for s in sentences]

    if delete_punct:
        sentence_clean = [re.sub('[ ]+', ' ', s.translate(translation)) for s in sentence_clean]
        if verbose:
            print("punctuations are deleted")

    words_clean = [s.split(" ") for s in sentence_clean]

    if words_to_delete:
        words_clean = [[w for w in words if w not in words_to_delete] for words in words_clean]
        if verbose:
            print("provided words are deleted")

    if min_char_word:
        # Compile once instead of rebuilding both patterns for every token.
        long_enough = re.compile("[a-z]{" + str(min_char_word) + ",}")
        has_digit = re.compile("[0-9]+")
        words_clean = [[w for w in words if long_enough.search(w) and not has_digit.search(w)]
                       for words in words_clean]
        if verbose:
            print("choose only words that have at least " + str(min_char_word) + " characters")

    if stopw:
        words_clean = [[w for w in words if w not in stop_words] for words in words_clean]
        if verbose:
            print("stopwords deleted")

    if correctw:
        # Empty tokens are passed through untouched, and a token is kept as-is
        # whenever correct() yields nothing, so the join below never sees None.
        words_clean = [[(correct(w) or w) if w else w for w in words] for words in words_clean]
        if verbose:
            print("words corrected")

    if stemm:
        words_clean = [[stemmer.stem(w) for w in words] for words in words_clean]
        if verbose:
            print("words are stemmed")

    if delete_accent:
        words_clean = [[unidecode.unidecode(w) for w in words] for words in words_clean]
        if verbose:
            print("accents are deleted")

    res = [" ".join(words) for words in words_clean]
    # Trim leading/trailing spaces, then collapse the runs left by removed tokens.
    res = [re.sub('(^([ ]+))|(([ ]+)$)', '', s) for s in res]
    res = [re.sub('[ ]+', ' ', s) for s in res]
    return res


# Lookup table of labels, read from a ';'-separated file and renamed to two
# columns: PRS_NAT (cast to string) and LIB_PRS_NAT.
texts_df = pd.read_csv("data/nature_pres.csv", sep = ";")
texts_df.columns = ["PRS_NAT", "LIB_PRS_NAT"]
texts_df["PRS_NAT"] = texts_df["PRS_NAT"].astype("str")

# Labels as plain strings (missing values become their string representation).
texts = [str(label) for label in texts_df["LIB_PRS_NAT"]]

# Normalised labels: lower-cased, punctuation and accents removed, generic
# words dropped; stop-words, spelling correction and stemming are left off.
texts_df['CLEANED_LIB_PRS_NAT'] = clean_sentence(
    list(texts),
    words_to_delete = words_to_delete,
    min_char_word = min_voc,
    stopw = False,
    correctw = False,
    stemm = False,
    delete_accent = True,
    delete_punct = True,
    verbose = False,
)

In [149]:


def add_cluster(data, texts_df = texts_df):
    """Add a ``CLUSTER_PRES`` column ('c1'..'c6') to ``data`` and return it.

    Every row starts in 'c6'. Rows are first assigned by their
    ``PSP_SPE_SNDS`` code, then rows still in 'c6' are assigned by label:
    a row matches a cluster when its ``LIB_PRS_NAT`` is one of the labels
    whose cleaned text (``texts_df['CLEANED_LIB_PRS_NAT']``) contains one of
    the cluster's keyword fragments. Clusters are tried in order c1 to c5,
    so the first matching cluster wins.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``PSP_SPE_SNDS`` and ``LIB_PRS_NAT``. Modified in place.
    texts_df : pandas.DataFrame
        Label table with ``LIB_PRS_NAT`` and ``CLEANED_LIB_PRS_NAT``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``CLUSTER_PRES`` filled in.
    """
    # (cluster id, PSP_SPE_SNDS codes) in application order.
    code_rules = [
        ('c1', ['3']),                      # cardio
        ('c2', []),                         # diab (no code listed)
        ('c3', ['32']),                     # neuro
        ('c4', ['15', '18', '36', '5']),    # dho
        ('c5', ['14', '31']),               # ortho
    ]
    # (cluster id, fragments searched in the cleaned labels) in priority order.
    keyword_rules = [
        ('c1', ['card']),
        ('c2', ['diab']),
        ('c3', ['neuro']),
        ('c4', ['dent', 'opti', 'bucco', 'cheveux', 'monture', 'verre']),
        ('c5', ['ortho', 'prothese', 'paresthesie']),
    ]

    data['CLUSTER_PRES'] = 'c6'
    for cluster_id, codes in code_rules:
        data.loc[data.PSP_SPE_SNDS.isin(codes), "CLUSTER_PRES"] = cluster_id

    # Resolve each keyword list to the set of raw labels it selects.
    cleaned_labels = texts_df['CLEANED_LIB_PRS_NAT']
    labels_by_cluster = [
        (cluster_id,
         texts_df[cleaned_labels.str.contains("|".join(fragments), regex=True)]['LIB_PRS_NAT'])
        for cluster_id, fragments in keyword_rules
    ]

    # Only rows still in the default cluster may be reassigned by label.
    for cluster_id, labels in labels_by_cluster:
        unassigned = data["CLUSTER_PRES"] == 'c6'
        data.loc[data["LIB_PRS_NAT"].isin(labels) & unassigned, "CLUSTER_PRES"] = cluster_id

    return data



In [150]:
# Display title for each cluster id, used as the plot title.
dic_clusters = dict(
    c1="Cluster Cardiologie",
    c2="Cluster Diabete",
    c3="Cluster Neurologie",
    c4="Cluster DHO",
    c5="Cluster Orthopedie",
    c6="Cluster Autre",
)
def prepare_data_for_pred(data, prs_nat_df = texts_df ):
    """Type, enrich and cluster the raw extract before forecasting.

    Steps: cast the categorical code columns to strings, parse
    ``FLX_ANN_MOI`` (``%Y%m``) into timestamps, add ``MONTH``, add
    ``DIFF = PRS_PAI_MNT - PRS_REM_MNT`` and
    ``TARGET = PRS_PAI_MNT - (PRS_DEP_MNT + PRS_REM_MNT)``, left-join the
    label table on ``PRS_NAT`` and assign ``CLUSTER_PRES`` via
    ``add_cluster``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw extract. It is no longer modified: all work happens on a copy.
    prs_nat_df : pandas.DataFrame
        Label table merged on ``PRS_NAT`` and also used for the keyword-based
        clustering.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added columns.
    """
    cat_columns = ['PSP_SPE_SNDS', 'PSE_SPE_SNDS', 'PRS_PPU_SEC', 'PRS_NAT', 'PRE_INS_REG', 'DDP_SPE_COD', 'BEN_SEX_COD', 'ASU_NAT']
    # astype returns a new frame, so the caller's DataFrame is left untouched.
    df = data.astype(dict.fromkeys(cat_columns, "str"))

    df['FLX_ANN_MOI'] = pd.to_datetime(df['FLX_ANN_MOI'], format="%Y%m")
    df['MONTH'] = df['FLX_ANN_MOI'].dt.month

    # Vectorised arithmetic on the raw arrays: positional like the previous
    # element-wise map(), so repeated index labels cannot cause misalignment.
    pai = df['PRS_PAI_MNT'].to_numpy()
    rem = df['PRS_REM_MNT'].to_numpy()
    dep = df['PRS_DEP_MNT'].to_numpy()
    df['DIFF'] = pai - rem
    df['TARGET'] = pai - (dep + rem)

    df = df.merge(right=prs_nat_df, on="PRS_NAT", how="left")
    # Forward the label table: it used to be ignored in favour of the global one.
    df = add_cluster(data = df, texts_df = prs_nat_df)
    return df

In [6]:
def calculate_diff_date(date_left, date_right, what = "months"):
    """Return ``date_left - date_right`` in the requested unit.

    Parameters
    ----------
    date_left, date_right : anything ``pd.to_datetime`` accepts
        The two dates; the result is positive when ``date_left`` is later.
    what : str
        ``"days"`` returns the raw difference (a Timedelta, not an int);
        ``"months"`` returns the whole-day difference divided by 30 and
        truncated toward zero, i.e. a month is approximated as 30 days.

    Returns
    -------
    Timedelta, int or None
        ``None`` for any other value of ``what``.
    """
    delta = pd.to_datetime(date_left) - pd.to_datetime(date_right)
    if what == "days":
        return delta
    if what == "months":
        return int(delta.days / 30)
    return None



In [7]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Computes ``|y_true - y_pred| / |y_true|`` element-wise, drops the NaN
    entries (for instance 0/0) and averages the rest. Infinite ratios
    (non-zero error over a zero actual) are not dropped.

    ``y_true`` must support element-wise arithmetic (NumPy array or pandas
    Series); ``y_pred`` may then be a plain sequence of the same length.
    """
    relative_error = np.abs((y_true - y_pred) / y_true)
    is_defined = ~np.isnan(relative_error)
    return 100 * np.mean(relative_error[is_defined])



In [8]:
class HoltWinters:

    """
    Additive Holt-Winters (triple exponential smoothing) model with
    Brutlag-style deviation bands.

    # series - initial time series (indexable by position, supports len())
    # slen - length of a season
    # alpha, beta, gamma - smoothing coefficients for level, trend and season
    # n_preds - number of steps forecast past the end of the series
    # scaling_factor - width multiplier of the deviation band (usually 2 to 3)

    After triple_exponential_smoothing() the instance exposes the lists
    result, Smooth, Trend, Season, PredictedDeviation, UpperBond and
    LowerBond, each of length len(series) + n_preds.
    """

    def __init__(self, series, slen, alpha, beta, gamma, n_preds, scaling_factor=1.96):
        self.series = series
        self.slen = slen
        self.alpha, self.beta, self.gamma = alpha, beta, gamma
        self.n_preds = n_preds
        self.scaling_factor = scaling_factor

    def initial_trend(self):
        """Average per-step change between the first two seasons.

        Reads series[0 : 2*slen], so the series needs at least 2*slen points.
        """
        accumulated = 0.0
        for offset in range(self.slen):
            accumulated += float(self.series[offset + self.slen] - self.series[offset]) / self.slen
        return accumulated / self.slen

    def initial_seasonal_components(self):
        """Initial seasonal offsets, keyed by position within the season.

        For each position, averages (value - mean of its season) over all
        complete seasons in the series.
        """
        period = self.slen
        n_seasons = int(len(self.series) / period)

        # mean of every complete season
        season_means = [
            sum(self.series[period * k:period * k + period]) / float(period)
            for k in range(n_seasons)
        ]

        seasonals = {}
        for position in range(period):
            offset_total = 0.0
            for k in range(n_seasons):
                offset_total += self.series[period * k + position] - season_means[k]
            seasonals[position] = offset_total / n_seasons

        return seasonals

    def triple_exponential_smoothing(self):
        """Fit the model on the series and forecast n_preds further steps."""
        self.result = []
        self.Smooth = []
        self.Season = []
        self.Trend = []
        self.PredictedDeviation = []
        self.UpperBond = []
        self.LowerBond = []

        seasonals = self.initial_seasonal_components()
        n_observed = len(self.series)

        for step in range(n_observed + self.n_preds):
            phase = step % self.slen

            if step == 0:
                # components initialization: the first fitted value is the
                # first observation and the deviation starts at zero
                smooth = self.series[0]
                trend = self.initial_trend()
                fitted = self.series[0]
                deviation = 0

            elif step >= n_observed:
                # forecasting: extrapolate the last level/trend and reuse the
                # last seasonal offset of this phase
                horizon = step - n_observed + 1
                fitted = (smooth + horizon * trend) + seasonals[phase]
                # uncertainty grows by 1% at every forecast step
                deviation = self.PredictedDeviation[-1] * 1.01

            else:
                observed = self.series[step]
                previous_smooth = smooth
                smooth = self.alpha * (observed - seasonals[phase]) + (1 - self.alpha) * (smooth + trend)
                trend = self.beta * (smooth - previous_smooth) + (1 - self.beta) * trend
                seasonals[phase] = self.gamma * (observed - smooth) + (1 - self.gamma) * seasonals[phase]
                fitted = smooth + trend + seasonals[phase]
                # Brutlag deviation: exponentially smoothed absolute error
                deviation = (self.gamma * np.abs(observed - fitted)
                             + (1 - self.gamma) * self.PredictedDeviation[-1])

            self.result.append(fitted)
            self.PredictedDeviation.append(deviation)
            self.UpperBond.append(fitted + self.scaling_factor * deviation)
            self.LowerBond.append(fitted - self.scaling_factor * deviation)
            self.Smooth.append(smooth)
            self.Trend.append(trend)
            self.Season.append(seasonals[phase])
        

In [9]:
from sklearn.model_selection import TimeSeriesSplit # you have everything done for you

def timeseriesCVscore(params, series, loss_function=mean_squared_error, slen=12):
    """
        Returns the mean cross-validated forecast error of a Holt-Winters model.

        params - vector of parameters for optimization (alpha, beta, gamma)
        series - dataset with timeseries (anything exposing ``.values``)
        loss_function - metric called as loss_function(y_true, y_pred)
        slen - season length for Holt-Winters model

        The series is split into 3 expanding-window folds; on each fold the
        model is fitted on the training part and asked to forecast as many
        steps as the test part contains.
    """
    values = series.values
    alpha, beta, gamma = params

    # set the number of folds for cross-validation
    tscv = TimeSeriesSplit(n_splits=3)

    errors = []
    # iterating over folds, train model on each, forecast and calculate error
    for train, test in tscv.split(values):

        model = HoltWinters(series=values[train], slen=slen,
                            alpha=alpha, beta=beta, gamma=gamma, n_preds=len(test))
        model.triple_exponential_smoothing()

        # the forecasts are the last len(test) entries of the fitted+forecast list
        predictions = model.result[-len(test):]
        actual = values[test]

        # scikit-learn metrics expect (y_true, y_pred); the previous swapped
        # order was only harmless for symmetric losses such as MSE.
        errors.append(loss_function(actual, predictions))

    return np.mean(np.array(errors))

In [10]:
def plotHoltWinters(series,
                    model,
                    title, 
                    plot_intervals=False, 
                    plot_anomalies=False, 
                    period = ['2010-01-01','2017-12-01'],
                    validation_period = ['2017-01-01','2017-12-01'],
                    plot_error = False):
    """
        Plot the actual series against the fitted/forecast values of a model.

        series - dataset with timeseries (date-indexed pandas Series)
        model - fitted HoltWinters instance (result / UpperBond / LowerBond are read)
        title - plot title prefix; the sum of the last 12 model values is appended
        plot_intervals - show the upper and lower deviation bands
        plot_anomalies - accepted for compatibility; currently unused
        period - [start, end] range shown on the x axis
        validation_period - [start, end] of the shaded validation window
        plot_error - annotate the plot with train / validation MAPE

        Returns whatever ``iplot`` returns for the assembled figure.
    """
    # x axis for the model output: the observed dates followed by one date
    # per forecast step, starting the month AFTER the last observation.
    # NOTE(review): assumes a monthly series indexed on the first day of each
    # month, as produced by prepare_tm — confirm for other inputs.
    n_future = len(model.result) - len(series)
    first_forecast_date = series.index[-1] + pd.offsets.MonthBegin(1)
    series_pred = series.index.tolist()
    series_pred.extend(pd.date_range(start=first_forecast_date, periods=n_future, freq="MS").tolist())

    trace1 = go.Scatter(
        x=series.index,
        y=series,
        name = 'Actual')
    trace2 = go.Scatter(
        x=series_pred,
        y=model.result,
        name = 'Prediction')
    data = [trace1, trace2]

    if plot_intervals:
        band_style = dict(color = ('rgb(205, 12, 24)'), dash = 'dot')
        data.append(go.Scatter(
            x = series_pred,
            y = model.UpperBond,
            name = 'Upper Confidence',
            line = band_style))
        data.append(go.Scatter(
            x = series_pred,
            y = model.LowerBond,
            name = 'Lower Confidence',
            line = band_style))

    if plot_error :
        # Everything before the validation window counts as training data.
        training_ser   = series[(series.index < validation_period[0])]
        validation_ser = series[(series.index >= validation_period[0])& (series.index <= validation_period[1])]
        validation_predictions = model.result[len(training_ser):len(training_ser) + len(validation_ser)]
        training_predictions   = model.result[0:len(training_ser)]
        train_error      = round(mean_absolute_percentage_error(y_pred = training_predictions, y_true = training_ser), 2)
        validation_error = round(mean_absolute_percentage_error(y_pred = validation_predictions, y_true = validation_ser), 2)
        trace3 = go.Scatter(
           x=["2013-01-01", validation_period[0]],
           y=[1.5* series.max(), 1.5* series.max()],
           mode='text',
           name='Errors',
           text=["Train error: "+str(train_error) + "%", "Validation error: "+str(validation_error) + "%"],
           # 'center' and 'b' are not valid Plotly values and are rejected by
           # trace validation; use the documented equivalents.
           textposition='middle center',
           textfont=dict(
               family='sans serif',
               size=18,
               color='blue'
           )
        )
        data.append(trace3)

    layout = {
        "showlegend":False,
        "title" : title + "( Depense Totale en 2020 est: "+str(np.round(np.sum(model.result[-12:]), 2))+ " euros )",
        "xaxis" : dict(range = period),
        # grey rectangle marking the validation window
        "shapes": [{'type' : 'rect', 
                    'x0' : validation_period[0], 
                    'x1' : validation_period[1], 
                    'y0' : 0.1 * series.min(), 
                    'y1' : 2* series.max(),
                    'fillcolor': '#d3d3d3',
                    'opacity': 0.2}]
    }
    fig = dict(data=data, layout = layout)

    return(iplot(fig))

In [141]:
def train_tm(train_set , n_pred, list_slen = range(1,13)):
    """Pick the best Holt-Winters parameters and build the (unfitted) model.

    For every candidate season length in ``list_slen``, alpha, beta and gamma
    are optimised within [0, 1] (TNC, starting from zeros) against the
    cross-validated mean squared error. The candidate with the lowest error
    wins; its parameters are printed.

    Parameters
    ----------
    train_set : pandas.Series
        Training series.
    n_pred : int
        Number of steps the returned model will forecast.
    list_slen : iterable of int
        Season lengths to try.

    Returns
    -------
    HoltWinters
        Model configured with the winning parameters and ``scaling_factor=3``.
        ``triple_exponential_smoothing()`` has NOT been called on it yet.
    """
    start_point = [0, 0, 0]
    unit_box = ((0, 1), (0, 1), (0, 1))

    # One (error, parameters, season length) record per candidate.
    trials = []
    for season_length in list_slen:
        fit = minimize(timeseriesCVscore, x0=start_point,
                       args=(train_set, mean_squared_error, season_length),
                       method="TNC", bounds = unit_box)
        trials.append((fit.fun, fit.x, season_length))

    winner = np.argmin([error for error, _, _ in trials])
    _, (alpha_final, beta_final, gamma_final), best_slen = trials[winner]

    print("best params: " , alpha_final, beta_final, gamma_final)

    return HoltWinters(train_set.values, slen = best_slen,
                       alpha = alpha_final,
                       beta = beta_final,
                       gamma = gamma_final,
                       n_preds = n_pred, scaling_factor = 3)

def prepare_tm(data):
    """Aggregate ``DIFF_CARECO`` by ``FLX_ANN_MOI`` into a date-indexed Series.

    The values are summed per distinct ``FLX_ANN_MOI`` (sorted ascending) and
    the keys are parsed with the ``%Y%m`` format into the index, named
    ``date``. The returned Series keeps the name ``DIFF_CARECO``.
    """
    monthly_totals = (
        data[['FLX_ANN_MOI', 'DIFF_CARECO']]
        .groupby('FLX_ANN_MOI')
        .sum()
        .reset_index()
    )
    monthly_totals['date'] = pd.to_datetime(monthly_totals['FLX_ANN_MOI'], format="%Y%m")
    return monthly_totals.set_index('date')['DIFF_CARECO']

def make_prediction_with_validation_by_cluster(cluster, data , validation_period, n_pred, period, plot_intervals = True, list_slen = range(12,13)):
    """Fit on the pre-validation data of one cluster and plot with error figures.

    Parameters
    ----------
    cluster : str
        Cluster id (``CLUSTER_PRES`` value and key of ``dic_clusters``).
    data : pandas.DataFrame
        Prepared data containing ``CLUSTER_PRES``, ``FLX_ANN_MOI`` and
        ``DIFF_CARECO``.
    validation_period : [start, end]
        Observations before ``start`` are used for training; the window is
        shaded on the plot and used for the validation error.
    n_pred : int
        Number of steps forecast after the end of the training data.
    period : [start, end]
        x-axis range of the plot.
    plot_intervals : bool
        Show the deviation bands.
    list_slen : iterable of int
        Candidate season lengths handed to ``train_tm``.

    Returns
    -------
    Whatever ``plotHoltWinters`` returns.
    """
    time_ser    = prepare_tm(data = data[data.CLUSTER_PRES == cluster])
    time_ser    = time_ser[time_ser.index >= "2009-01-01"]
    model       = train_tm(train_set = time_ser[time_ser.index < validation_period[0]], n_pred = n_pred, list_slen = list_slen)
    model.triple_exponential_smoothing()
    # `period` used to be accepted but dropped, so the plot always fell back
    # to plotHoltWinters' default x-axis range.
    p           = plotHoltWinters(series=time_ser, model = model, title = dic_clusters[cluster], period = period, validation_period = validation_period, plot_intervals = plot_intervals, plot_error = True)
    return(p)

def make_prediction_by_cluster(cluster, data , validation_period, n_pred, period, plot_intervals = True, list_slen = range(12,13), plot_error = False):
    """Fit on the full history of one cluster, forecast ``n_pred`` steps and plot.

    Parameters
    ----------
    cluster : str
        Cluster id (``CLUSTER_PRES`` value and key of ``dic_clusters``).
    data : pandas.DataFrame
        Prepared data containing ``CLUSTER_PRES``, ``FLX_ANN_MOI`` and
        ``DIFF_CARECO``.
    validation_period : [start, end]
        Window shaded on the plot (and used for the error figures when
        ``plot_error`` is true). It does not restrict the training data here.
    n_pred : int
        Number of steps forecast after the end of the series.
    period : [start, end]
        x-axis range of the plot.
    plot_intervals : bool
        Show the deviation bands.
    list_slen : iterable of int
        Candidate season lengths handed to ``train_tm``.
    plot_error : bool
        Annotate the plot with train / validation errors.

    Returns
    -------
    Whatever ``plotHoltWinters`` returns.
    """
    time_ser    = prepare_tm(data = data[data.CLUSTER_PRES == cluster])
    time_ser    = time_ser[time_ser.index >= "2009-01-01"]
    model       = train_tm(train_set = time_ser, n_pred = n_pred, list_slen = list_slen)
    model.triple_exponential_smoothing()
    # `period` used to be accepted but dropped, so the forecast horizon was
    # cut off by plotHoltWinters' default x-axis range.
    p           = plotHoltWinters(series=time_ser, model = model, title = dic_clusters[cluster], period = period, validation_period = validation_period, plot_intervals = plot_intervals, plot_error = plot_error)
    return(p)

# Regions

## Region Nord

### Load Data

In [None]:
region_inf_2014    =[3] 
regions_apres_2015 =[27, 28, 32] 
df = load_regions(region_inf_2014, regions_apres_2015)

### Prepare data

In [45]:
df = prepare_data_for_pred(data = df)

### Make predictions

In [46]:
validation_period = ["2017-01-01", "2017-12-01"]
prediction_period = ["2018-01-01", "2020-12-31"]

In [None]:
n_preds = calculate_diff_date(prediction_period[1], prediction_period[0])

#### Cluster Cardiologie

In [47]:
make_prediction_by_cluster(data = df, cluster = 'c1', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.07916438120527097 0.0 0.2507232877091862


#### Cluster Diabete

In [50]:
make_prediction_by_cluster(data = df, cluster = 'c2', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.0 0.0 0.0031590212608769708


#### Cluster Neurologie

In [53]:
make_prediction_by_cluster(data = df, cluster = 'c3', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(1,13))

best params:  0.10934475356250306 0.0 0.0


#### Cluster DHO

In [56]:
make_prediction_by_cluster(data = df, cluster = 'c4', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.004812494400476264 0.3814493551111677 0.021906835346255538


#### Cluster Orthopedie

In [57]:
make_prediction_by_cluster(data = df, cluster = 'c5', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.1484107304585507 0.013768868030233805 0.2936166490159926


#### Cluster Autre

In [58]:
make_prediction_by_cluster(data = df, cluster = 'c6', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.02795922869007067 0.010531712178262376 0.19583541157341394


## Region Bassin Parisien

In [123]:
region_inf_2014    =[1, 2] 
regions_apres_2015 =[11, 24] 
df = load_regions(region_inf_2014, regions_apres_2015)

data/region1_2009.csv uploaded
data/region2_2009.csv uploaded
data/region1_2010.csv uploaded
data/region2_2010.csv uploaded
data/region1_2011.csv uploaded
data/region2_2011.csv uploaded
data/region1_2012.csv uploaded
data/region2_2012.csv uploaded
data/region1_2013.csv uploaded
data/region2_2013.csv uploaded
data/region1_2014.csv uploaded
data/region2_2014.csv uploaded
data/region11_2015.csv uploaded
data/region24_2015.csv uploaded
data/region11_2016.csv uploaded
data/region24_2016.csv uploaded
data/region11_2017.csv uploaded
data/region24_2017.csv uploaded


### Prepare data

In [124]:
df = prepare_data_for_pred(data = df)

### Make predictions

In [125]:
validation_period = ["2017-01-01", "2017-12-01"]
prediction_period = ["2018-01-01", "2020-12-31"]
n_preds = calculate_diff_date(prediction_period[1], prediction_period[0])

#### Cluster Cardiologie

In [132]:
make_prediction_by_cluster(data = df, cluster = 'c1', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.0997548237281824 0.3972778836849251 1.0


#### Cluster Diabete

In [134]:
make_prediction_by_cluster(data = df, cluster = 'c2', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.008345784137709311 1.0 0.0


#### Cluster Neurologie

In [128]:
make_prediction_by_cluster(data = df, cluster = 'c3', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.2549446823166034 0.02656207946858352 0.0


#### Cluster DHO

In [129]:
make_prediction_by_cluster(data = df, cluster = 'c4', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.21309595663572634 0.012071609059526478 0.2969388265975842


#### Cluster Orthopedie

In [130]:
make_prediction_by_cluster(data = df, cluster = 'c5', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.051315935153885606 0.0 0.2917828830418234


#### Cluster Autre

In [131]:
make_prediction_by_cluster(data = df, cluster = 'c6', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.04155680732726191 0.18486415060752792 0.025729168646409173


## Region Ouest

### Load data

In [59]:
region_inf_2014    =[5] 
regions_apres_2015 =[52, 53] 
df = load_regions(region_inf_2014, regions_apres_2015)

data/region5_2009.csv uploaded
data/region5_2010.csv uploaded
data/region5_2011.csv uploaded
data/region5_2012.csv uploaded
data/region5_2013.csv uploaded
data/region5_2014.csv uploaded
data/region52_2015.csv uploaded
data/region53_2015.csv uploaded
data/region52_2016.csv uploaded
data/region53_2016.csv uploaded
data/region52_2017.csv uploaded
data/region53_2017.csv uploaded


### Prepare data

In [60]:
df = prepare_data_for_pred(data = df)

### Make predictions

In [61]:
validation_period = ["2017-01-01", "2017-12-01"]
prediction_period = ["2018-01-01", "2020-12-31"]


In [62]:
n_preds = calculate_diff_date(prediction_period[1], prediction_period[0])

#### Cluster Cardiologie

In [63]:
make_prediction_by_cluster(data = df, cluster = 'c1', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.8156465428831733 0.007277010308419041 0.0


#### Cluster Diabete

In [64]:
make_prediction_by_cluster(data = df, cluster = 'c2', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.010817082780407383 0.03950932468456303 0.00020782585677808774


#### Cluster Neurologie


In [69]:
make_prediction_by_cluster(data = df, cluster = 'c3', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(1,13))

best params:  0.1802119398305943 0.000501177283243448 0.0


#### Cluster DHO

In [66]:
make_prediction_by_cluster(data = df, cluster = 'c4', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.0 0.0 0.05337708572767408


#### Cluster Orthopedie

In [67]:
make_prediction_by_cluster(data = df, cluster = 'c5', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.017204588132755494 0.0 0.28442125160186554


#### Cluster Autre

In [70]:
make_prediction_by_cluster(data = df, cluster = 'c6', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(1,13))

best params:  0.033272939215247654 0.1830888314094642 0.23070265560573883


## Region Sud Ouest

### Load Data

In [71]:
region_inf_2014    =[6] 
regions_apres_2015 =[75] 
df = load_regions(region_inf_2014, regions_apres_2015)

data/region6_2009.csv uploaded
data/region6_2010.csv uploaded
data/region6_2011.csv uploaded
data/region6_2012.csv uploaded
data/region6_2013.csv uploaded
data/region6_2014.csv uploaded
data/region75_2015.csv uploaded
data/region75_2016.csv uploaded
data/region75_2017.csv uploaded


### Prepare Data

In [72]:
df = prepare_data_for_pred(data = df)

### Make Predictions

In [73]:
validation_period = ["2017-01-01", "2017-12-01"]
prediction_period = ["2018-01-01", "2020-12-31"]


In [74]:
n_preds = calculate_diff_date(prediction_period[1], prediction_period[0])

#### Cluster Cardiologie

In [75]:
make_prediction_by_cluster(data = df, cluster = 'c1', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.06107364041945196 0.5442760058936564 0.3639107930006369


#### Cluster Diabete


In [76]:
make_prediction_by_cluster(data = df, cluster = 'c2', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.08682106906803116 0.06530617721467386 0.22092131060987324


#### Cluster Neurologie


In [77]:
make_prediction_by_cluster(data = df, cluster = 'c3', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.008393932771844914 0.7227775603386211 0.07846565710982428


#### Cluster DHO


In [81]:
make_prediction_by_cluster(data = df, cluster = 'c4', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.010712145750886082 0.9676080298326105 0.0


#### Cluster Orthopedie


In [79]:
make_prediction_by_cluster(data = df, cluster = 'c5', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.2425757653004072 0.1261213182663185 0.1346700854950844


#### Cluster Autre

In [80]:
make_prediction_by_cluster(data = df, cluster = 'c6', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.16444604791390693 0.23059844757089554 0.03265267551874462


## Region Est

### Load Data

In [82]:
region_inf_2014    =[4] 
regions_apres_2015 =[44] 
df = load_regions(region_inf_2014, regions_apres_2015)

data/region4_2009.csv uploaded
data/region4_2010.csv uploaded
data/region4_2011.csv uploaded
data/region4_2012.csv uploaded
data/region4_2013.csv uploaded
data/region4_2014.csv uploaded
data/region44_2015.csv uploaded
data/region44_2016.csv uploaded
data/region44_2017.csv uploaded


### Prepare Data

In [83]:
df = prepare_data_for_pred(data = df)

### Make Predictions

In [84]:
validation_period = ["2017-01-01", "2017-12-01"]
prediction_period = ["2018-01-01", "2020-12-31"]


In [85]:
n_preds = calculate_diff_date(prediction_period[1], prediction_period[0])

#### Cluster Cardiologie

In [86]:
make_prediction_by_cluster(data = df, cluster = 'c1', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.0068424835340639145 0.18960804437263806 0.13353712313096194


#### Cluster Diabete

In [87]:
make_prediction_by_cluster(data = df, cluster = 'c2', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.40800947972924073 0.048886678232134606 0.23506081264062695


#### Cluster Neurologie

In [88]:
make_prediction_by_cluster(data = df, cluster = 'c3', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.20100147059270074 0.03691956397556467 0.14509294075563117


#### Cluster DHO

In [93]:
make_prediction_by_cluster(data = df, cluster = 'c4', validation_period = validation_period, n_pred = n_preds, period = ['2013-01-01','2020-01-01'], plot_intervals = False,list_slen = range(12,13))

best params:  0.0 8.385046385228634e-06 0.0


#### Cluster Orthopedie

In [90]:
# Est region, cluster 'c5' (Orthopedie).
make_prediction_by_cluster(
    data=df,
    cluster='c5',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.14442313420681013 0.0 0.12369904943552146


#### Cluster Autre

In [91]:
# Est region, cluster 'c6' (Autre).
make_prediction_by_cluster(
    data=df,
    cluster='c6',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.14681754648829898 0.017316394444314875 0.26823165219450756


## Region Centre Est

### Load Data

In [94]:
# Centre Est region: the 2009-2014 files use region code 7, the 2015-2017 files use code 84.
region_inf_2014 = [7]
regions_apres_2015 = [84]

# load_regions reads one CSV per (region code, year) and stacks them into a single frame.
df = load_regions(region_inf_2014, regions_apres_2015)

data/region7_2009.csv uploaded
data/region7_2010.csv uploaded
data/region7_2011.csv uploaded
data/region7_2012.csv uploaded
data/region7_2013.csv uploaded
data/region7_2014.csv uploaded
data/region84_2015.csv uploaded
data/region84_2016.csv uploaded
data/region84_2017.csv uploaded


### Prepare Data

In [95]:
# Rebind df to the prepared frame (the Centre Est data loaded above).
# NOTE(review): df is overwritten in place, so re-running this cell alone
# feeds already-prepared data back into prepare_data_for_pred.
df = prepare_data_for_pred(data=df)

### Make Predictions

In [96]:
# [start, end] date strings: validation window (2017) and prediction window (2018-2020).
validation_period = [
    "2017-01-01",
    "2017-12-01",
]
prediction_period = [
    "2018-01-01",
    "2020-12-31",
]

# n_preds is derived from the prediction period bounds (end first, then start)
# and is passed as n_pred to the cluster calls below.
n_preds = calculate_diff_date(
    prediction_period[1],
    prediction_period[0],
)

#### Cluster Cardiologie


In [97]:
# Centre Est region, cluster 'c1' (Cardiologie).
make_prediction_by_cluster(
    data=df,
    cluster='c1',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.4718311321979014 0.1585084640656868 1.0


#### Cluster Diabete


In [98]:
# Centre Est region, cluster 'c2' (Diabete).
make_prediction_by_cluster(
    data=df,
    cluster='c2',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.0934506924330521 0.0013235393154656938 0.0


#### Cluster Neurologie


In [99]:
# Centre Est region, cluster 'c3' (Neurologie).
make_prediction_by_cluster(
    data=df,
    cluster='c3',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.06079719318233251 0.06399578079765977 0.2079992247657056


#### Cluster DHO


In [100]:
# Centre Est region, cluster 'c4' (DHO).
make_prediction_by_cluster(
    data=df,
    cluster='c4',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  1.0 0.0034582644308651656 0.0


#### Cluster Orthopedie

In [104]:
# Centre Est region, cluster 'c5' (Orthopedie).
make_prediction_by_cluster(
    data=df,
    cluster='c5',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.01085838704714498 0.26805985556886414 0.2234197062989592


#### Cluster Autre

In [102]:
# Centre Est region, cluster 'c6' (Autre).
make_prediction_by_cluster(
    data=df,
    cluster='c6',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.01282880633883876 0.10778621889735152 0.22265401524780898


## Region Méditerranée

### Load Data

In [105]:
# Méditerranée region: the 2009-2014 files use region code 8,
# the 2015-2017 files use two codes, 76 and 93.
region_inf_2014 = [8]
regions_apres_2015 = [76, 93]

# load_regions reads one CSV per (region code, year) and stacks them into a single frame.
df = load_regions(region_inf_2014, regions_apres_2015)

data/region8_2009.csv uploaded
data/region8_2010.csv uploaded
data/region8_2011.csv uploaded
data/region8_2012.csv uploaded
data/region8_2013.csv uploaded
data/region8_2014.csv uploaded
data/region76_2015.csv uploaded
data/region93_2015.csv uploaded
data/region76_2016.csv uploaded
data/region93_2016.csv uploaded
data/region76_2017.csv uploaded
data/region93_2017.csv uploaded


### Prepare Data

In [106]:
# Rebind df to the prepared frame (the Méditerranée data loaded above).
# NOTE(review): df is overwritten in place, so re-running this cell alone
# feeds already-prepared data back into prepare_data_for_pred.
df = prepare_data_for_pred(data=df)

### Make Predictions

In [107]:
# [start, end] date strings: validation window (2017) and prediction window (2018-2020).
validation_period = [
    "2017-01-01",
    "2017-12-01",
]
prediction_period = [
    "2018-01-01",
    "2020-12-31",
]

# n_preds is derived from the prediction period bounds (end first, then start)
# and is passed as n_pred to the cluster calls below.
n_preds = calculate_diff_date(
    prediction_period[1],
    prediction_period[0],
)

#### Cluster Cardiologie


In [108]:
# Méditerranée region, cluster 'c1' (Cardiologie).
make_prediction_by_cluster(
    data=df,
    cluster='c1',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.6532335545109071 0.06499532656181317 0.7558658847403894


#### Cluster Diabete


In [109]:
# Méditerranée region, cluster 'c2' (Diabete).
make_prediction_by_cluster(
    data=df,
    cluster='c2',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.4359748937591014 0.0 0.5622763894321908


#### Cluster Neurologie


In [110]:
# Méditerranée region, cluster 'c3' (Neurologie).
make_prediction_by_cluster(
    data=df,
    cluster='c3',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.2634022549784928 0.02732348186268163 0.1724068070657105


#### Cluster DHO


In [111]:
# Méditerranée region, cluster 'c4' (DHO).
make_prediction_by_cluster(
    data=df,
    cluster='c4',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.33385711555662 0.013830474602397225 0.1644832387073915


#### Cluster Orthopedie


In [112]:
# Méditerranée region, cluster 'c5' (Orthopedie).
make_prediction_by_cluster(
    data=df,
    cluster='c5',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.027106717758086263 0.025823174170505392 0.7529397570664871


#### Cluster Autre

In [113]:
# Méditerranée region, cluster 'c6' (Autre).
make_prediction_by_cluster(
    data=df,
    cluster='c6',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.04521667313781508 0.03203865294929986 0.04721259205867412


## Region Outre-mer

### Load Data

In [143]:
# Outre-mer region: the 2009-2014 files use region code 9, the 2015-2017 files use code 5.
region_inf_2014 = [9]
regions_apres_2015 = [5]

# load_regions reads one CSV per (region code, year) and stacks them into a single frame.
df = load_regions(region_inf_2014, regions_apres_2015)

data/region9_2009.csv uploaded
data/region9_2010.csv uploaded
data/region9_2011.csv uploaded
data/region9_2012.csv uploaded
data/region9_2013.csv uploaded
data/region9_2014.csv uploaded
data/region5_2015.csv uploaded
data/region5_2016.csv uploaded
data/region5_2017.csv uploaded


### Prepare Data

In [144]:
# Rebind df to the prepared frame (the Outre-mer data loaded above).
# NOTE(review): df is overwritten in place, so re-running this cell alone
# feeds already-prepared data back into prepare_data_for_pred.
df = prepare_data_for_pred(data=df)

### Make Predictions

In [116]:
# [start, end] date strings: validation window (2017) and prediction window (2018-2020).
validation_period = [
    "2017-01-01",
    "2017-12-01",
]
prediction_period = [
    "2018-01-01",
    "2020-12-31",
]

# n_preds is derived from the prediction period bounds (end first, then start)
# and is passed as n_pred to the cluster calls below.
n_preds = calculate_diff_date(
    prediction_period[1],
    prediction_period[0],
)

#### Cluster Cardiologie

In [145]:
# Outre-mer region, cluster 'c1' (Cardiologie).
make_prediction_by_cluster(
    data=df,
    cluster='c1',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.0 0.0 0.02667249685830414


#### Cluster Diabete

In [118]:
# Outre-mer region, cluster 'c2' (Diabete).
make_prediction_by_cluster(
    data=df,
    cluster='c2',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.07080631759371953 0.06784228529049924 0.2266573139297488


#### Cluster Neurologie

In [119]:
# Outre-mer region, cluster 'c3' (Neurologie).
make_prediction_by_cluster(
    data=df,
    cluster='c3',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.23216087443724415 0.09201671134379208 0.18427141291397947



#### Cluster DHO

In [120]:
# Outre-mer region, cluster 'c4' (DHO).
make_prediction_by_cluster(
    data=df,
    cluster='c4',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.049131355390066744 0.14537868752476446 0.1511810366115145


#### Cluster Orthopedie

In [121]:
# Outre-mer region, cluster 'c5' (Orthopedie).
make_prediction_by_cluster(
    data=df,
    cluster='c5',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(12, 13),  # yields the single value 12
)

best params:  0.4317426084995409 0.0032102884185379876 0.0


#### Cluster Autre

In [122]:
# Outre-mer region, cluster 'c6' (Autre).
# NOTE(review): this call passes list_slen=range(1, 13) (values 1 through 12),
# whereas the other cluster calls in this notebook pass range(12, 13) (only 12).
# Confirm the wider range is intentional for this cluster and not a leftover
# from an experiment.
make_prediction_by_cluster(
    data=df,
    cluster='c6',
    validation_period=validation_period,
    n_pred=n_preds,
    period=['2013-01-01', '2020-01-01'],
    plot_intervals=False,
    list_slen=range(1, 13),
)

best params:  0.036319522624060296 0.32972750184766003 0.000546292264675452
