In [55]:
# Mount Google Drive at /content/gdrive so the project files stored there are reachable from this Colab runtime.
from google.colab import drive 
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [56]:
# Absolute path to the project folder inside the Drive mounted at /content/gdrive.
# An absolute path keeps file access working even when the working directory is not /content.
filepath = "/content/gdrive/MyDrive/DVA-project-covid-nursing-home/"

To access files in a shared folder, add the shared folder as a shortcut to your Google Drive.

see: https://stackoverflow.com/questions/54351852/accessing-shared-with-me-with-colab

In [57]:
import pandas as pd 
import numpy as np
import random as rd
# Seeds only Python's built-in `random` module; NumPy's random generator is not seeded here.
rd.seed(1887)
from datetime import datetime
import lightgbm as lgb

In [58]:
# Lift pandas' display limits so wide frames and long sequences are shown in full.
# NOTE: with max_rows unset, displaying a whole large DataFrame prints every row; use .head().
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.set_option('display.max_seq_items', None)

# 1. Load the model and pre-processed input data

In [59]:
# Load the LightGBM booster from the saved model file under the project folder.
model = lgb.Booster(model_file=filepath + "model/output_model/LGB_classification.txt")

In [60]:
# Load the pre-processed nursing-home data.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
df_nh_data = pd.read_pickle(filepath+"model/output_data/df_nh_data_lightGBM_classification.pkl")

# 2. Prepare input data for forecasting

In [61]:
# Base model input columns. The order is preserved exactly, because the
# selected columns are passed to the model in this order.
features = [
    'pct_comfirm',
    'pct_avg_confirm',
    'log_n_occupied_beds',
    'log_n_resi',
    'log_n_staff',
    'pct_vaccine_resi',
    'pct_vaccine_staff',
    'log_num_treatment',
    'log_county_weekly_confirmed',
    'log_county_weekly_death',
    'shortage_staff_True',
    'able_to_test_N',
    'able_to_test_Y',
    'ownership_For Profit',
    'nh_rating',
    'turnover_nursing_staff',
    'nurse_staff_hrs',
    'nh_health_survey_score',
    'pct_death',
    'pct_avg_confirm_staff',
    'amt_fine',
    'acuindex2',
    'paymcaid',
    'paymcare',
    'shortage_supply_Y',
    'agghighcfs',
    'pctfem',
    'pctunder65',
    'pctblack_mds3',
    'pcthisp_mds3',
    'pctwhite_mds3',
]

In [62]:
def prepare_input_data_forecast(data, target,features,window):
  """Build the latest-observation feature matrix used for forecasting.

  For every nursing home (`nh_id`) the rows are ordered by `date`, lagged
  copies of `pct_comfirm` are added (`pct_comfirm_1` .. `pct_comfirm_{window-1}`),
  rows containing any missing value are dropped, and only the most recent
  remaining row per nursing home is kept.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain `nh_id`, `date`, `pct_comfirm` and every column in `features`.
      The input frame is not modified.
  target : str
      Unused; kept so existing callers do not break.
  features : list of str
      Base feature columns to return.
  window : int
      Number of time steps in the window; `window - 1` lag columns are created.
      A window of 1 or less adds no lag columns.

  Returns
  -------
  (pandas.DataFrame, numpy.ndarray)
      The feature frame (base features followed by lag columns) and the
      `nh_id` of each returned row, in the same row order.
  """
  lag_steps = range(1, window)
  lag_columns = ["pct_comfirm_" + str(step) for step in lag_steps]
  # Built outside the loop so it is defined even when no lag is requested.
  features_final = list(features) + lag_columns

  # Sort BEFORE shifting: shift() and tail() work on row order, so the lags and
  # the "latest" row are only meaningful on date-ordered data. sort_values
  # returns a new frame, which also leaves the caller's data untouched.
  data_copy = data.sort_values(['nh_id', 'date'])
  for step, column in zip(lag_steps, lag_columns):
    data_copy[column] = data_copy.groupby(['nh_id'])['pct_comfirm'].shift(step)

  # Lagging creates NAs at the start of each group; rows with an NA in any
  # column are dropped.
  data_copy = data_copy.dropna()
  # keep latest data for forecasting
  data_copy = data_copy.groupby('nh_id').tail(1)

  return data_copy[features_final],np.array(data_copy["nh_id"])

In [63]:
# window=4 adds the lag columns pct_comfirm_1 .. pct_comfirm_3.
# The "risk_level" target argument is accepted but not used by the function.
X_pred,nh_ids = prepare_input_data_forecast(df_nh_data,"risk_level",features,4)

In [64]:
# Sanity check: one row per nursing home kept for forecasting; columns = base features + lag columns.
X_pred.shape

(14536, 34)

In [65]:
# Preview the first rows of the forecasting inputs.
X_pred.head()

Unnamed: 0,pct_comfirm,pct_avg_confirm,log_n_occupied_beds,log_n_resi,log_n_staff,pct_vaccine_resi,pct_vaccine_staff,log_num_treatment,log_county_weekly_confirmed,log_county_weekly_death,shortage_staff_True,able_to_test_N,able_to_test_Y,ownership_For Profit,nh_rating,turnover_nursing_staff,nurse_staff_hrs,nh_health_survey_score,pct_death,pct_avg_confirm_staff,amt_fine,acuindex2,paymcaid,paymcare,shortage_supply_Y,agghighcfs,pctfem,pctunder65,pctblack_mds3,pcthisp_mds3,pctwhite_mds3,pct_comfirm_1,pct_comfirm_2,pct_comfirm_3
120,0.0,0.007511,4.007333,3.988984,4.406719,92.453,0.0,0.0,11.265515,6.900731,0,0,1,1,5.0,57.375144,4.62192,5.333,0.087719,0.007511,655.14,14.041,71.43,12.5,0,6.765772,79.59,22.45,18.455932,0.0,89.8,0.052632,0.017544,0.0
244,0.0,0.005001,4.369448,4.369448,5.273,91.026,0.0,0.0,12.131333,7.901007,0,0,1,1,4.0,59.0,4.37246,6.667,0.023529,0.005001,650.0,12.984,77.6,3.83,0,5.47,61.84,25.0,28.95,0.0,71.05,0.011765,0.0,0.011765
340,0.0,0.002361,3.583519,3.610918,4.369448,100.0,0.0,0.0,11.548264,7.330405,0,0,0,0,5.0,58.320768,4.81546,13.333,0.06,0.002361,0.0,11.585809,64.628817,10.728229,0,5.135609,81.58,27.224041,21.132182,0.0,97.37,0.02,0.0,0.0
487,0.0,0.009066,4.343805,4.369448,4.564348,97.436,0.0,0.0,14.151911,9.725974,0,0,1,1,3.0,58.815477,3.528801,12.667,0.022222,0.009066,0.0,14.107,18.82,2.35,0,9.148196,56.1,35.37,79.27,0.0,20.73,0.011111,0.0,0.0
616,0.0,0.004343,4.406719,4.454347,4.94876,94.118,0.0,0.0,14.185002,9.728956,0,0,1,1,3.0,47.9,3.39029,16.0,0.0,0.004343,1625.0,13.014,69.47,4.21,0,9.148196,75.61,26.379183,30.40475,0.0,91.36,0.009709,0.009709,0.029126


# 3. Prediction

In [66]:
# Score every forecasting row, then take the column index with the highest
# score in each row as the predicted class.
y_pred_probs = model.predict(X_pred)
y_pred = y_pred_probs.argmax(axis=1)

Combine the predictions with their nursing-home IDs (`nh_id`).

In [67]:
# One row per nursing home: its id, the predicted class, and the score in
# column 2 of the model output.
# NOTE(review): column 2 is treated as the "high risk" class; confirm the label encoding.
prediction = pd.DataFrame(
    {
        'nh_id': nh_ids,
        'predicted_risk_level': y_pred,
        'prob_high_risk': y_pred_probs[:, 2],
    }
)

In [68]:
# Preview the first rows of the prediction table.
prediction.head()

Unnamed: 0,nh_id,predicted_risk_level,prob_high_risk
0,15009,1,0.182354
1,15010,1,0.14549
2,15012,1,0.207691
3,15014,1,0.159184
4,15015,0,0.1537


Save the predictions to a pickle file.

In [69]:
# Persist the prediction table as a pickle file under the project folder.
prediction.to_pickle(filepath+"model/output_model/prediction.pkl")

In [70]:
# Show the rows with the highest prob_high_risk first (top of the descending sort).
prediction.sort_values('prob_high_risk', ascending=False).head()

Unnamed: 0,nh_id,predicted_risk_level,prob_high_risk
6718,245593,2,0.802107
9843,366202,2,0.795073
12938,525579,2,0.782288
3720,155330,2,0.781803
3892,155657,2,0.773688


In [71]:
# Columns of the nursing-home data carried into the visualisation export.
df_nh_data_visual = df_nh_data[
    [
        "nh_id", "date", "nh_nm", "nh_county", "Address", "nh_state",
        "Lat", "Long", "resi_week_confirm", "resi_week_death",
        "pct_vaccine_resi", "pctunder65",
    ]
]
# assign() returns a NEW frame. A plain `prediction_with_date = prediction` would
# only alias the same object, so adding the column would also change `prediction`
# (and make the saved pickle differ if the save cell were re-run).
prediction_with_date = prediction.assign(prediction_start_date="2022-10-31")


In [72]:
# Left join on nh_id: every prediction row is kept and repeated once for each
# matching row of the nursing-home data.
final_visual = prediction_with_date.merge(df_nh_data_visual, how='left', on='nh_id')

In [73]:
# Keep only the rows dated on or after 2022-01-01.
final_visual = final_visual.loc[lambda frame: frame['date'] >= '2022-01-01']

In [74]:
# Export the merged table to CSV, without writing the DataFrame index as a column.
final_visual.to_csv(filepath+'model/output_data/final_visual.csv',index = False)

In [75]:
# Row and column count of the exported table.
final_visual.shape

(635645, 15)

In [76]:
# Count missing values per column of the exported table.
final_visual.isnull().sum()

nh_id                    0
predicted_risk_level     0
prob_high_risk           0
prediction_start_date    0
date                     0
nh_nm                    0
nh_county                0
Address                  0
nh_state                 0
Lat                      0
Long                     0
resi_week_confirm        0
resi_week_death          0
pct_vaccine_resi         0
pctunder65               0
dtype: int64