**Models Prophet**

In [21]:
#%pip install prophet

In [22]:
import pandas as pd
from prophet import Prophet
from sklearn.preprocessing import OneHotEncoder
import pickle
import os
import numpy as np
from datetime import timedelta
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



In [80]:
# Load the three input tables used throughout the notebook: the daily matrix,
# the static per-state matrix, and the state-to-cluster assignment.
dades_temporals, dades_estatiques, clusters_targets = map(
    pd.read_csv,
    (
        '../data/preprocessed/dataMatrix/daily_covidMatrix.csv',
        '../data/preprocessed/dataMatrix/static_stateMatrix.csv',
        'experiments_prophet/state_clusters_all.csv',
    ),
)

In [24]:
# Map each target column to the column of `clusters_targets` that holds the
# cluster assignment for that target.
targets = {
    nom_target: f'Cluster_{nom_target}'
    for nom_target in ('positiveIncrease', 'hospitalizedIncrease', 'deathIncrease')
}

In [25]:
# Attach the static per-state columns to every daily row (left join on 'state',
# so daily rows without a static match are kept).
dades = dades_temporals.merge(dades_estatiques, how='left', on='state')

In [26]:
# One-hot encode the state so each state gets its own indicator column.
encoder = OneHotEncoder(sparse_output=False)
states_encoded = encoder.fit_transform(dades[['state']])
state_columns = encoder.get_feature_names_out(['state'])
states_encoded_df = pd.DataFrame(states_encoded, columns=state_columns)

# Drop any indicator columns left by a previous run of this cell before
# concatenating; otherwise re-running the cell appends a second copy of every
# state column and later selections by name return duplicated columns.
# On the first run the drop is a no-op (errors='ignore').
dades = pd.concat(
    [
        dades.drop(columns=list(state_columns), errors='ignore').reset_index(drop=True),
        states_encoded_df,
    ],
    axis=1,
)


In [27]:
# Columns (besides the one-hot state indicators) that are registered as extra
# regressors on every Prophet model.
altres_regressors = [
    'no_coverage',
    'bedsState_local_government',
    'bedsNon_profit',
    'bedsFor_profit',
    'bedsTotal',
    'population_state',
    'pop_density_state',
    'Low_SVI_CTGY',
    'Moderate_Low_SVI_CTGY',
    'Moderate_High_SVI_CTGY',
    'Metro',
    'private_coverage',
    'public_coverage',
    'labor_cov_diff',
    'pop_60-69',
    'pop_70-79',
    'pop_80+',
]
# Deliberately left out of the list: 'High_SVI_CTGY', 'Non-metro', 'pop_0-9',
# 'pop_10-19', 'pop_20-29', 'pop_30-39', 'pop_40-49', 'pop_50-59'.
# NOTE(review): presumably dropped to avoid redundant/collinear columns - confirm.



In [28]:
# Chronological train/test split: rows strictly before the cutoff go to train,
# rows on or after it go to test.
dades['date'] = pd.to_datetime(dades['date'])

data_tall = '2021-03-01'
train = dades.loc[dades['date'] < data_tall]
test = dades.loc[dades['date'] >= data_tall]

Entrenar els diferents models

In [None]:
# Train one Prophet model per (target, cluster) pair on the pooled training
# rows of every state in the cluster, and pickle each model to disk.
os.makedirs('models_prophet', exist_ok=True)

# The regressor set is the same for every model: state indicators + static columns.
regressors = list(state_columns) + altres_regressors

for target_var, cluster_col in targets.items():
    # Input columns for this target: date, target value and all regressors.
    columns_model = ['date', target_var] + regressors

    # Every cluster id present for this target.
    clusters = clusters_targets[cluster_col].unique()

    for cluster in clusters:
        # States assigned to this cluster, and their training rows.
        in_cluster = clusters_targets[cluster_col] == cluster
        estats_cluster = clusters_targets.loc[in_cluster, 'State'].unique()
        dades_cluster = train.loc[train['state'].isin(estats_cluster)]

        # Prophet expects the timestamp in 'ds' and the target in 'y'.
        df_prophet = dades_cluster.loc[:, columns_model].rename(
            columns={'date': 'ds', target_var: 'y'}
        )

        model = Prophet(changepoint_prior_scale=0.05, seasonality_prior_scale=10.0)
        for reg in regressors:
            model.add_regressor(reg)
        model.fit(df_prophet)

        model_filename = f'models_prophet/prophet_{target_var}_cluster{cluster}.pkl'
        with open(model_filename, 'wb') as f:
            pickle.dump(model, f)

        print(f"Model entrenat i guardat per {target_var}, cluster {cluster}")


Predir el test

In [None]:
# Evaluate every saved (target, cluster) model state by state: forecast the
# HORIZON_DAYS days that follow each state's last training date, compare the
# forecast with the matching test rows, and collect one metrics record per
# (target, state). The records end up in `df_metriques`.
HORIZON_DAYS = 7

metriques_totals = []

# Same regressor set that was registered on every model at training time.
regressors = list(state_columns) + altres_regressors

for target_var, cluster_col in targets.items():
    columns_model = ['date', target_var] + regressors

    # Every cluster id present for this target.
    clusters = clusters_targets[cluster_col].unique()

    for cluster in clusters:
        model_filename = f'models_prophet/prophet_{target_var}_cluster{cluster}.pkl'

        # NOTE: pickle.load can execute arbitrary code; only load files that
        # were written by the training cell of this notebook.
        with open(model_filename, 'rb') as f:
            model = pickle.load(f)

        estats_cluster = clusters_targets.loc[
            clusters_targets[cluster_col] == cluster, 'State'
        ].unique()

        for estat in estats_cluster:
            # Sort by date so that "last row" really is the most recent one.
            dades_estat = train[train['state'] == estat].sort_values('date')
            if dades_estat.empty:
                continue

            # 'date' was already converted to datetime when train/test were built.
            df_prophet = dades_estat[columns_model].rename(
                columns={'date': 'ds', target_var: 'y'}
            )

            # Most recent known regressor values; they are held constant over the horizon.
            last_known = df_prophet[regressors].iloc[-1]

            # Future frame: HORIZON_DAYS consecutive days after the last training date.
            start_date = df_prophet['ds'].max() + timedelta(days=1)
            future_dates = pd.date_range(start=start_date, periods=HORIZON_DAYS, freq='D')
            future_states_df = pd.DataFrame(
                [last_known.values] * HORIZON_DAYS, columns=regressors
            )
            future = pd.concat([pd.DataFrame({'ds': future_dates}), future_states_df], axis=1)

            forecast = model.predict(future)
            prediccions = forecast[['ds', 'yhat']].copy()
            prediccions['state'] = estat
            prediccions['target'] = target_var

            # Keep only the test rows of this state that fall on forecast dates.
            test_estat = test[
                (test['state'] == estat) &
                (test['date'].isin(prediccions['ds']))
            ]
            comparacio = pd.merge(
                prediccions, test_estat,
                left_on=['ds', 'state'], right_on=['date', 'state']
            )

            # Without any matching test row the sklearn metrics raise; skip the
            # state explicitly instead of aborting the whole evaluation.
            if comparacio.empty:
                print(f"Sense dades de test per {estat} ({target_var}, cluster {cluster}); s'omet.")
                continue

            y_true = comparacio[target_var]
            y_pred = comparacio['yhat']

            mae = mean_absolute_error(y_true, y_pred)
            mse = mean_squared_error(y_true, y_pred)
            rmse = np.sqrt(mse)
            r2 = r2_score(y_true, y_pred)

            metriques_totals.append({
                'target': target_var,
                'state': estat,
                'cluster': cluster,
                'MAE': mae,
                'MSE': mse,
                'RMSE': rmse,
                'R2': r2
            })

df_metriques = pd.DataFrame(metriques_totals)



In [33]:
# Label every metrics row with the model (target + cluster) that produced it.
df_metriques['model'] = df_metriques['target'] + '_cluster' + df_metriques['cluster'].astype(str)


def _resum_per_model(prefix, funcions):
    """Aggregate `df_metriques` per model and prefix the metric columns.

    `funcions` maps each metric column to the aggregation applied to it; the
    resulting columns are named '<prefix>_<metric>' and 'model' is kept as is.
    """
    agregat = df_metriques.groupby('model').agg(funcions).reset_index()
    return agregat.rename(columns={col: f'{prefix}_{col}' for col in funcions})


# Average over the states of each model.
mitjanes = _resum_per_model('mitjana', {'MAE': 'mean', 'MSE': 'mean', 'R2': 'mean'})

# Best state per model (lowest MAE, lowest MSE, highest R2).
millors = _resum_per_model('millor', {'MAE': 'min', 'MSE': 'min', 'R2': 'max'})

# Worst state per model (highest MAE, highest MSE, lowest R2).
pitjors = _resum_per_model('pitjor', {'MAE': 'max', 'MSE': 'max', 'R2': 'min'})

# One row per model with the average, best and worst metrics side by side.
resultats_models = mitjanes.merge(millors, on='model').merge(pitjors, on='model')

resultats_models


Unnamed: 0,model,mitjana_MAE,mitjana_MSE,mitjana_R2,millor_MAE,millor_MSE,millor_R2,pitjor_MAE,pitjor_MSE,pitjor_R2
0,deathIncrease_cluster1,1.531806,3.402129e+00,-1.688779,1.531806,3.402129e+00,-1.688779,1.531806,3.402129e+00,-1.688779
1,deathIncrease_cluster10,14.999372,2.377680e+02,-18.417721,14.999372,2.377680e+02,-18.417721,14.999372,2.377680e+02,-18.417721
2,deathIncrease_cluster11,13.175818,2.134742e+02,-6.020291,13.175818,2.134742e+02,-6.020291,13.175818,2.134742e+02,-6.020291
3,deathIncrease_cluster12,75.991312,6.203004e+03,-3.661195,75.991312,6.203004e+03,-3.661195,75.991312,6.203004e+03,-3.661195
4,deathIncrease_cluster13,7.246854,5.768332e+01,-6.360632,7.246854,5.768332e+01,-6.360632,7.246854,5.768332e+01,-6.360632
...,...,...,...,...,...,...,...,...,...,...
148,positiveIncrease_cluster51,1081.679259,1.260476e+06,-0.765670,1081.679259,1.260476e+06,-0.765670,1081.679259,1.260476e+06,-0.765670
149,positiveIncrease_cluster6,113.022801,1.432132e+04,-5.777521,113.022801,1.432132e+04,-5.777521,113.022801,1.432132e+04,-5.777521
150,positiveIncrease_cluster7,1104.996688,1.545879e+06,-10.574641,1104.996688,1.545879e+06,-10.574641,1104.996688,1.545879e+06,-10.574641
151,positiveIncrease_cluster8,785.558160,6.265209e+05,-29.482971,785.558160,6.265209e+05,-29.482971,785.558160,6.265209e+05,-29.482971


In [None]:
# Reload a previously saved results table.
# NOTE(review): this rebinds `resultats_models`, replacing the table computed
# from `df_metriques` earlier in the notebook; every later cell (including the
# final save) then works on this reloaded file. Confirm this is intended, or
# load into a separate variable.
resultats_models = pd.read_csv("experiments_prophet/resultats_0.3-0.7.csv")

Unnamed: 0,model,mitjana_MAE,mitjana_MSE,mitjana_R2,millor_MAE,millor_MSE,millor_R2,pitjor_MAE,pitjor_MSE,pitjor_R2
0,deathIncrease_cluster1,2.065179,6.185794e+00,-2.890912,1.585241,4.304313e+00,-0.406076,2.545117,8.067274e+00,-5.375749
1,deathIncrease_cluster10,13.115653,2.295626e+02,-0.518133,10.974875,1.354764e+02,-0.292010,15.256430,3.236487e+02,-0.744257
2,deathIncrease_cluster11,19.160453,5.324546e+02,0.095096,15.646413,3.879574e+02,0.404713,23.083371,7.543065e+02,-0.309931
3,deathIncrease_cluster12,7.859813,8.587554e+01,-2.634065,5.012684,3.507461e+01,-0.760918,9.977498,1.464326e+02,-6.058597
4,deathIncrease_cluster13,1.995625,4.663616e+00,-5.721093,1.995625,4.663616e+00,-5.721093,1.995625,4.663616e+00,-5.721093
...,...,...,...,...,...,...,...,...,...,...
63,positiveIncrease_cluster5,39.408882,2.177587e+03,-0.030537,39.408882,2.177587e+03,-0.030537,39.408882,2.177587e+03,-0.030537
64,positiveIncrease_cluster6,305.573356,1.377458e+05,-5.169460,132.197259,2.586894e+04,0.806309,478.949452,2.496226e+05,-11.145228
65,positiveIncrease_cluster7,1766.596876,3.888470e+06,-29.493920,1650.274185,3.818758e+06,-0.462681,1882.919567,3.958182e+06,-58.525159
66,positiveIncrease_cluster8,228.033167,8.226222e+04,-0.993476,228.033167,8.226222e+04,-0.993476,228.033167,8.226222e+04,-0.993476


In [75]:
# Mean of the per-model average MAE over all deathIncrease models.
es_death = resultats_models['model'].str.startswith("deathIncrease")
death_inc = resultats_models[es_death]
death_inc['mitjana_MAE'].mean()

24.95400806920205

In [76]:
# Mean of the per-model average MAE over all hospitalizedIncrease models.
# Stored under its own name so it no longer overwrites `death_inc` with rows
# that belong to a different target.
hosp_inc = resultats_models[resultats_models['model'].str.startswith("hospitalizedIncrease")]
hosp_inc['mitjana_MAE'].mean()

21.248699047621162

In [77]:
# Mean of the per-model average MAE over all positiveIncrease models.
# Stored under its own name so it no longer overwrites `death_inc` with rows
# that belong to a different target.
pos_inc = resultats_models[resultats_models['model'].str.startswith("positiveIncrease")]
pos_inc['mitjana_MAE'].mean()

419.93116132052194

In [None]:
# Persist the per-model summary table, without the index column.
# Earlier runs wrote to "resultats2_0.3-0.7.csv", "resultats2_0.5-0.5.csv" and
# "resultats2.csv" instead (presumably one file per experiment - confirm).
# NOTE(review): an earlier cell rebinds `resultats_models` to the contents of
# experiments_prophet/resultats_0.3-0.7.csv; on a top-to-bottom run that
# reloaded table is what gets written here. Verify before overwriting results.
resultats_models.to_csv("experiments_prophet/resultats_all.csv", index=False)