In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
import plotly.express as px
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
import joblib 

main_data = pd.read_csv(r"C:\Users\torre\Documents\Machine learning\eventos adversos\eventos.csv")

In [34]:
main_data

Unnamed: 0,Year,County,PSI,PSIDescription,Count,Population,ObsRate
0,2005,STATEWIDE,21,Retained Surgical Item or Unretrieved Device F...,321,26447738,1.21
1,2006,STATEWIDE,21,Retained Surgical Item or Unretrieved Device F...,341,26679904,1.28
2,2007,STATEWIDE,21,Retained Surgical Item or Unretrieved Device F...,353,26935982,1.31
3,2008,STATEWIDE,21,Retained Surgical Item or Unretrieved Device F...,346,27272108,1.27
4,2009,STATEWIDE,21,Retained Surgical Item or Unretrieved Device F...,308,27630962,1.11
...,...,...,...,...,...,...,...
4538,2011,Yuba,27,Perioperative Hemorrhage or Hematoma,16,51791,30.89
4539,2012,Yuba,27,Perioperative Hemorrhage or Hematoma,22,52301,42.06
4540,2013,Yuba,27,Perioperative Hemorrhage or Hematoma,24,52770,45.48
4541,2014,Yuba,27,Perioperative Hemorrhage or Hematoma,15,54942,27.30


In [35]:
main_data.info

<bound method DataFrame.info of       Year     County  PSI  ... Count  Population  ObsRate
0     2005  STATEWIDE   21  ...   321    26447738     1.21
1     2006  STATEWIDE   21  ...   341    26679904     1.28
2     2007  STATEWIDE   21  ...   353    26935982     1.31
3     2008  STATEWIDE   21  ...   346    27272108     1.27
4     2009  STATEWIDE   21  ...   308    27630962     1.11
...    ...        ...  ...  ...   ...         ...      ...
4538  2011       Yuba   27  ...    16       51791    30.89
4539  2012       Yuba   27  ...    22       52301    42.06
4540  2013       Yuba   27  ...    24       52770    45.48
4541  2014       Yuba   27  ...    15       54942    27.30
4542  2015       Yuba   27  ...    16       40517    39.49

[4543 rows x 7 columns]>

In [36]:
main_data['County'] = main_data['County'].replace('STATEWIDE', 'Statewide')

In [113]:
main_data = main_data[~main_data.County.isin(['Alpine', 'Sierra'])]

In [115]:
main_data.info

<bound method DataFrame.info of       Year     County  PSI  ... Count  Population  ObsRate
0     2005  Statewide   21  ...   321    26447738     1.21
1     2006  Statewide   21  ...   341    26679904     1.28
2     2007  Statewide   21  ...   353    26935982     1.31
3     2008  Statewide   21  ...   346    27272108     1.27
4     2009  Statewide   21  ...   308    27630962     1.11
...    ...        ...  ...  ...   ...         ...      ...
4538  2011       Yuba   27  ...    16       51791    30.89
4539  2012       Yuba   27  ...    22       52301    42.06
4540  2013       Yuba   27  ...    24       52770    45.48
4541  2014       Yuba   27  ...    15       54942    27.30
4542  2015       Yuba   27  ...    16       40517    39.49

[4389 rows x 7 columns]>

In [117]:
main_data_statewide = main_data[main_data.County == 'Statewide']
main_data_t = main_data_statewide.groupby(['Year', 'PSIDescription', 'PSI'])[['ObsRate']] \
       .sum() \
       .reset_index() \
       .sort_values(['PSIDescription', 'Year'])


fig = px.line(main_data_t, x='Year', y='ObsRate', color='PSIDescription', 
              color_discrete_sequence=px.colors.qualitative.Dark24)

# Layout adjustments.

fig.update_layout(title='Adverse Events Timeline by Description', xaxis_title ='', 
                  yaxis_title = 'Count per 100K Population')
                         
fig.for_each_trace(lambda t: t.update(name=t.name.replace("PSIDescription=", "")))

fig.show()

In [119]:
main_data_county = main_data[main_data.County != 'Statewide']

# Isolating statewide for our high-level analysis.

main_data_statewide = main_data[main_data.County == 'Statewide']

main_data_obs = main_data_county.groupby('County')[['ObsRate']] \
         .mean() \
         .reset_index() \
         .sort_values('ObsRate', ascending=False)

# Plotting.

fig = px.bar(main_data_obs, x='County', y='ObsRate', color='County')

# Layout adjustments.

fig.update_layout(title='Observed Rate by County', xaxis_title='', 
                  yaxis_title='Count per 100K Population')

fig.for_each_trace(lambda t: t.update(name=t.name.replace("County=","")))

fig.show()

In [120]:
main_data_time = main_data.groupby(['Year', 'County'])[['ObsRate']] \
          .sum() \
          .reset_index() \
          .sort_values(['County', 'Year'])

# Plotting.

fig = px.line(main_data_time, x='Year', y='ObsRate', color='County')

# Layout adjustments.

fig.update_layout(title='Adverse Events Timeline by County', xaxis_title='', 
                  yaxis_title = 'Count per 100K Population')

fig.for_each_trace(lambda t: t.update(name=t.name.replace("County=", "")))

fig.show()

## Entrenamiento del modelo
### El modelo utilizado es el de regresion lineal utilizando un GridSearch para econtrar los mejores parametros para el modelo

In [52]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [123]:

# Separar características y etiqueta
X = main_data[['Year', 'County', 'PSIDescription']]
y = main_data['ObsRate']

# Dividir los datos en Train/Val y Test
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Dividir nuevamente Train/Val en Train y Validation
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)


In [133]:

# Definir el preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['County', 'PSIDescription']),  # Codificar las columnas categóricas
        ('num', StandardScaler(), ['Year'])  # Estandarizar la columna 'Year'
    ]
)

# Crear el pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())  # Definir el modelo como paso 'model'
])

### Configuramos el grid search

In [135]:
# Configurar GridSearchCV
param_grid = {
    'model__fit_intercept': [True, False],  # Ajustar parámetro fit_intercept de LinearRegression
    'model__copy_X': [True, False],  # Ajustar parámetro copy_X de LinearRegression
    'model__n_jobs': [-1, 1]  # Ajustar parámetro n_jobs de LinearRegression
}

# Configurar GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Ajustar el modelo con el conjunto de entrenamiento
grid_search.fit(X_train, y_train)

In [141]:
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

In [143]:
# Ajustar el modelo usando Grid Search
grid_search.fit(X_train, y_train)

In [145]:
best_model = grid_search.best_estimator_

In [147]:
y_pred = best_model.predict(X_test)
joblib.dump(best_model, 'best_model_lineal.sav')

['best_model_lineal.sav']

In [149]:
# Calcular y mostrar métricas
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Para todo el estado: Mean Squared Error:", mse)
print("Para todo el estado: R-squared:", r2)

Para todo el estado: Mean Squared Error: 47.223640842618835
Para todo el estado: R-squared: 0.7945912225641965
