In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import joblib


main_data = pd.read_csv(r"C:\Users\torre\Documents\Machine learning\eventos adversos\eventos.csv")

### Exploramos el Dataset

In [13]:
main_data

Unnamed: 0,Year,County,PSI,PSIDescription,Count,Population,ObsRate
0,2005,STATEWIDE,21,Retained Surgical Item or Unretrieved Device F...,321,26447738,1.21
1,2006,STATEWIDE,21,Retained Surgical Item or Unretrieved Device F...,341,26679904,1.28
2,2007,STATEWIDE,21,Retained Surgical Item or Unretrieved Device F...,353,26935982,1.31
3,2008,STATEWIDE,21,Retained Surgical Item or Unretrieved Device F...,346,27272108,1.27
4,2009,STATEWIDE,21,Retained Surgical Item or Unretrieved Device F...,308,27630962,1.11
...,...,...,...,...,...,...,...
4538,2011,Yuba,27,Perioperative Hemorrhage or Hematoma,16,51791,30.89
4539,2012,Yuba,27,Perioperative Hemorrhage or Hematoma,22,52301,42.06
4540,2013,Yuba,27,Perioperative Hemorrhage or Hematoma,24,52770,45.48
4541,2014,Yuba,27,Perioperative Hemorrhage or Hematoma,15,54942,27.30


In [15]:
main_data.info

<bound method DataFrame.info of       Year     County  PSI  ... Count  Population  ObsRate
0     2005  STATEWIDE   21  ...   321    26447738     1.21
1     2006  STATEWIDE   21  ...   341    26679904     1.28
2     2007  STATEWIDE   21  ...   353    26935982     1.31
3     2008  STATEWIDE   21  ...   346    27272108     1.27
4     2009  STATEWIDE   21  ...   308    27630962     1.11
...    ...        ...  ...  ...   ...         ...      ...
4538  2011       Yuba   27  ...    16       51791    30.89
4539  2012       Yuba   27  ...    22       52301    42.06
4540  2013       Yuba   27  ...    24       52770    45.48
4541  2014       Yuba   27  ...    15       54942    27.30
4542  2015       Yuba   27  ...    16       40517    39.49

[4543 rows x 7 columns]>

## Preparamos los datos
### Cambiamos STATEWIDE por Statewide
### Eliminamos los Countys de Alpine y Sierra ya que no cuentan con datos registrados, se toma esta decision ya que son los menos poblados

In [17]:
main_data['County'] = main_data['County'].replace('STATEWIDE', 'Statewide')

In [19]:
main_data = main_data[~main_data.County.isin(['Alpine', 'Sierra'])]

In [21]:
main_data_statewide = main_data[main_data.County == 'Statewide']
main_data_t = main_data_statewide.groupby(['Year', 'PSIDescription', 'PSI'])[['ObsRate']] \
       .sum() \
       .reset_index() \
       .sort_values(['PSIDescription', 'Year'])


fig = px.line(main_data_t, x='Year', y='ObsRate', color='PSIDescription', 
              color_discrete_sequence=px.colors.qualitative.Dark24)

# Layout adjustments.

fig.update_layout(title='Adverse Events Timeline by Description', xaxis_title ='', 
                  yaxis_title = 'Count per 100K Population')
                         
fig.for_each_trace(lambda t: t.update(name=t.name.replace("PSIDescription=", "")))

fig.show()

In [23]:

# Separar características y etiqueta
X = main_data[['Year', 'County', 'PSIDescription']]
y = main_data['ObsRate']

# Dividir los datos en Train/Val y Test
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Dividir nuevamente Train/Val en Train y Validation
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)


In [36]:
# Definir preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['County', 'PSIDescription']),
        ('num', StandardScaler(), ['Year'])
    ]
)

# Crear el pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', SVR())  
])


In [27]:
# Guardar el pipeline
joblib.dump(pipeline, 'SVM-RL.sav')

['SVM-RL.sav']

### Mejoramos los parametros utilizando Gridsearch

In [29]:
from sklearn.model_selection import GridSearchCV

# Definir parámetros para GridSearch
param_grid = {
    'model__C': [0.1, 1, 10],
    'model__gamma': ['scale', 'auto'],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)


In [30]:
# Realizar predicciones
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluar el modelo
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}, R²: {r2}')


MSE: 32.5631736153505, R²: 0.8583598900379061


In [31]:
joblib.dump(best_model, 'bestmodelSVM.sav')


['bestmodelSVM.sav']