# Nursing Homes EDA

In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import plotly as py
import plotly.figure_factory as ff
import pandas as pd
from tqdm import tqdm
import os
from os.path import join as oj
import numpy as np
from bokeh.plotting import figure, show, output_notebook, output_file, save
import sys
import copy
import re
from datetime import date

## Load in raw HIFLD and NYT nursing homes data

In [2]:
# Locations of the raw and processed nursing-home data, relative to this notebook.
data_dir = "../../data/nursinghome_level/raw/"
data_dir_clean = "../../data/nursinghome_level/processed/"
# The loader modules are imported from inside the raw data directory, so it must be
# put on sys.path before the two imports below can resolve.
sys.path.append(data_dir)
from hifld_nursinghomes.load import load_hifld_nursinghomes
from nyt_nursinghomes.load import load_nyt_nursinghomes

In [3]:
# Load both sources through the project loaders, each pointed at its own sub-directory
# of the raw data directory.
hifld = load_hifld_nursinghomes(oj(data_dir, "hifld_nursinghomes"))
#nyt = load_nyt_nursinghomes(oj(data_dir, "nyt_nursinghomes"))
nyt = load_nyt_nursinghomes(oj(data_dir, "nyt_nursinghomes"))
# Alternative kept for reference: read one date-stamped NYT snapshot directly.
#nyt = pd.read_csv(oj(data_dir, "nyt_nursinghomes", "nyt_nursinghomes_2020-05-11.csv"))

In [4]:
hifld.shape

(39762, 33)

In [5]:
nyt.shape

(543, 5)

In [6]:
nyt.columns

Index(['Name', 'Cases_2020-05-11', 'Deaths_2020-05-11', 'City', 'State'], dtype='object')

In [7]:
hifld.columns

Index(['FID', 'ID', 'NAME', 'ADDRESS', 'CITY', 'STATE', 'ZIP', 'ZIP4',
       'TELEPHONE', 'TYPE', 'STATUS', 'POPULATION', 'COUNTY', 'COUNTYFIPS',
       'COUNTRY', 'LATITUDE', 'LONGITUDE', 'NAICS_CODE', 'NAICS_DESC',
       'SOURCE', 'SOURCEDATE', 'VAL_METHOD', 'VAL_DATE', 'WEBSITE', 'TOT_RES',
       'TOT_STAFF', 'BEDS', 'EXCESS_BED', 'OWNERSHIP', 'MEDICAIDID',
       'MEDICAREID', 'STATE_LIC', 'SOURCETYPE'],
      dtype='object')

In [8]:
hifld.isna().sum()

FID               0
ID                0
NAME              0
ADDRESS           0
CITY              0
STATE             0
ZIP               0
ZIP4          33752
TELEPHONE      2101
TYPE              0
STATUS            0
POPULATION     4741
COUNTY            0
COUNTYFIPS        0
COUNTRY           0
LATITUDE          0
LONGITUDE         0
NAICS_CODE        0
NAICS_DESC        0
SOURCE            0
SOURCEDATE        0
VAL_METHOD        0
VAL_DATE          0
WEBSITE       35265
TOT_RES       38826
TOT_STAFF     39268
BEDS           4527
EXCESS_BED    39449
OWNERSHIP     19819
MEDICAIDID    39356
MEDICAREID    39726
STATE_LIC     11626
SOURCETYPE        1
dtype: int64

In [9]:
nyt.isna().sum()

Name                 0
Cases_2020-05-11     0
Deaths_2020-05-11    0
City                 0
State                0
dtype: int64

## Clean HIFLD

In [10]:
hifld_raw = copy.deepcopy(hifld)

In [11]:
hifld_raw.shape

(39762, 33)

In [12]:
# drop those with all nas
#hifld = hifld.dropna(subset = ["POPULATION", "TOT_RES", "TOT_STAFF", "BEDS", "EXCESS_BED"],
#                     how = "all")

In [13]:
# Drop facilities that are closed: keep every row whose STATUS is not "CLOSED".
still_open = hifld["STATUS"] != "CLOSED"
hifld = hifld.loc[still_open]

In [14]:
hifld.shape

(38938, 33)

In [15]:
# Rows that agree on all of these identifying and capacity columns are treated as the
# same record; drop_duplicates keeps the first occurrence of each group.
keys = ["NAME", "ADDRESS", "CITY", "STATE", "ZIP", "TYPE", "POPULATION",
        "COUNTYFIPS", "TOT_RES", "TOT_STAFF", "BEDS", "EXCESS_BED"]
hifld = hifld.drop_duplicates(subset = keys)

In [16]:
hifld.shape

(38893, 33)

In [17]:
hifld.drop_duplicates(subset = ["NAME", "ADDRESS", "CITY", "STATE", "TYPE"], keep  = False).shape

(38841, 33)

In [18]:
# manually edit those with duplicated key: ["NAME", "ADDRESS", "CITY", "STATE", "TYPE"]
# keep = False removes every row of a duplicated group (not just the extras); the
# hand-curated replacements are then appended from the manual-edits CSV.
hifld = hifld.drop_duplicates(subset = ["NAME", "ADDRESS", "CITY", "STATE", "TYPE"], keep = False)
hifld_edits = pd.read_csv(oj(data_dir_clean, "hifld_nursinghomes", "hifld_nursinghomes_manualedits.csv"))
# Note: for duplicates in MO: used https://health.mo.gov/information/boards/certificateofneed/pdf/rcfcty.pdf for validation
# Nursing homes in other states were googled manually
# The concat keeps each frame's own index labels, so labels may repeat in the result.
hifld = pd.concat([hifld, hifld_edits], axis = 0, sort = False)

In [19]:
hifld.shape

(38867, 33)

In [20]:
hifld.head()

Unnamed: 0,FID,ID,NAME,ADDRESS,CITY,STATE,ZIP,ZIP4,TELEPHONE,TYPE,...,WEBSITE,TOT_RES,TOT_STAFF,BEDS,EXCESS_BED,OWNERSHIP,MEDICAIDID,MEDICAREID,STATE_LIC,SOURCETYPE
0,1,5449,"SUMMIT AT HIDDEN VALLEY, THE",438 23RD STREET,OAK HILL,WV,25901,,(304) 469-8255,NURSING HOME,...,,,,58.0,,,,,,NURSING HOME
1,2,5156,BROOKDALE LAKE RIDGE,3940 PRINCE WILLIAM PARKWAY,WOODBRIDGE,VA,22192,4513.0,(703) 680-0600,ASSISTED CARE,...,,,,107.0,,,,,,ASSISTED LIVING FACILITY
2,3,37345,VILLAGE GREEN HEALTH AND REHABILITATION,1601 PURDUE DRIVE,FAYETTEVILLE,NC,28304,3674.0,(910) 486-5000,NURSING HOME,...,,,,170.0,,,,,NH0502,NURSING FACILITY
3,4,25342,CARE ONE AT LEXINGTON,178 LOWELL STREET,LEXINGTON,MA,2420,2719.0,(781) 862-7400,NURSING HOME,...,www.lexingtonhcc.com,,,,,PROFIT,,,,NURSING FACILITY
4,5,5501389,TIMBER VIEW,S8560 BALSAM ROAD,EAU CLAIRE,WI,54701,,(715) 878-4699,ASSISTED LIVING,...,,,,6.0,,BROTOLOC INC DBA BROTOLOC NORTH,,,510256,ASSISTED LIVING


In [21]:
hifld.tail()

Unnamed: 0,FID,ID,NAME,ADDRESS,CITY,STATE,ZIP,ZIP4,TELEPHONE,TYPE,...,WEBSITE,TOT_RES,TOT_STAFF,BEDS,EXCESS_BED,OWNERSHIP,MEDICAIDID,MEDICAREID,STATE_LIC,SOURCETYPE
21,38377,291386,WESTPHALIA HILLS - A STONEBRIDGE COMMUNITY,1899 HIGHWAY 63,WESTPHALIA,MO,65085,2215.0,(573) 455-2280,NURSING HOME,...,https://stonebridgeseniorliving.com/westphalia...,,,28.0,,PROFIT CORPORATION,,,18653,RESIDENTIAL CARE FACILITY
22,11221,48285,FRANKLIN HEIGHTS NURSING & REHABILITATION,223 S RESLER,EL PASO,TX,79912,,(915) 584-9417,NURSING HOME,...,http://franklinheightsnursing.com/,,,132.0,,,,,4985,SNF/NF
23,25111,48481,KNOPP NURSING & REHAB CENTER INC,202 BILLIE DR,FREDERICKSBURG,TX,78624,,(830) 997-8840,NURSING HOME,...,https://www.knopphealthcare.com/,,,60.0,,,,,5025,SNF/NF
24,7978,482809,THE HEIGHTS OF BULVERDE,384 HARMONY HILLS,SPRING BRANCH,TX,78070,,(830) 438-1276,NURSING HOME,...,https://www.touchstone-communities.com/communi...,,,124.0,,,,,106817,SKILLED NURSING FACILITY
25,13327,3619,ALICE HYDE MEDICAL CENTER,45 SIXTH STREET,MALONE,NY,12953,,(518) 481-8000,NURSING HOME,...,http://alicehyde.com/services/nursinghome.asp,,,135.0,,VOLUNTARY--NOT FOR PROFIT CORPORATION,,,1624000N,RESIDENTIAL HEALTH CARE


In [22]:
keys = ["NAME", "ADDRESS", "CITY", "STATE", "TYPE"]
hifld.drop_duplicates(subset = keys).shape

(38867, 33)

## Merge data

In [23]:
# NOTE(review): this rebinds `hifld` to the raw CSV, so the closed-facility filter,
# de-duplication and manual edits applied to `hifld` in the cleaning section above are
# not carried into the merge. The raw file also encodes missing values as -999 /
# "NOT AVAILABLE" rather than NaN. Confirm this reload is intentional.
hifld = pd.read_csv(oj(data_dir, "hifld_nursinghomes", "hifld_nursinghomes.csv"))
hifld.shape

(39762, 33)

In [24]:
hifld.head()

Unnamed: 0,FID,ID,NAME,ADDRESS,CITY,STATE,ZIP,ZIP4,TELEPHONE,TYPE,...,WEBSITE,TOT_RES,TOT_STAFF,BEDS,EXCESS_BED,OWNERSHIP,MEDICAIDID,MEDICAREID,STATE_LIC,SOURCETYPE
0,1,5449,"SUMMIT AT HIDDEN VALLEY, THE",438 23RD STREET,OAK HILL,WV,25901,NOT AVAILABLE,(304) 469-8255,NURSING HOME,...,NOT AVAILABLE,-999,-999,58,-999,NOT AVAILABLE,NOT AVAILABLE,NOT AVAILABLE,NOT AVAILABLE,NURSING HOME
1,2,5156,BROOKDALE LAKE RIDGE,3940 PRINCE WILLIAM PARKWAY,WOODBRIDGE,VA,22192,4513,(703) 680-0600,ASSISTED CARE,...,NOT AVAILABLE,-999,-999,107,-999,NOT AVAILABLE,NOT AVAILABLE,NOT AVAILABLE,NOT AVAILABLE,ASSISTED LIVING FACILITY
2,3,37345,VILLAGE GREEN HEALTH AND REHABILITATION,1601 PURDUE DRIVE,FAYETTEVILLE,NC,28304,3674,(910) 486-5000,NURSING HOME,...,NOT AVAILABLE,-999,-999,170,-999,NOT AVAILABLE,NOT AVAILABLE,NOT AVAILABLE,NH0502,NURSING FACILITY
3,4,25342,CARE ONE AT LEXINGTON,178 LOWELL STREET,LEXINGTON,MA,2420,2719,(781) 862-7400,NURSING HOME,...,www.lexingtonhcc.com,-999,-999,-999,-999,PROFIT,NOT AVAILABLE,NOT AVAILABLE,NOT AVAILABLE,NURSING FACILITY
4,5,5501389,TIMBER VIEW,S8560 BALSAM ROAD,EAU CLAIRE,WI,54701,NOT AVAILABLE,(715) 878-4699,ASSISTED LIVING,...,NOT AVAILABLE,-999,-999,6,-999,BROTOLOC INC DBA BROTOLOC NORTH,NOT AVAILABLE,NOT AVAILABLE,510256,ASSISTED LIVING


In [25]:
nyt.head()

Unnamed: 0,Name,Cases_2020-05-11,Deaths_2020-05-11,City,State
0,PARAMUS VETERANS MEMORIAL HOME,264,71,PARAMUS,NJ
1,BERGEN NEW BRIDGE MEDICAL CENTER NURSING HOME,233,26,PARAMUS,NJ
2,FUTURECARE LOCHEARN NURSING HOME,231,21,BALTIMORE,MD
3,ANDOVER SUBACUTE AND REHABILITATION CENTER II,189,55,ANDOVER,NJ
4,WOODBRIDGE NURSING PAVILION,183,17,CHICAGO,IL


In [26]:
nyt.shape

(543, 5)

In [27]:
# exact matching
# Left-join NYT rows to HIFLD on (name, city, state) before any name cleaning.
# A left merge can return more rows than `nyt` when one NYT key matches several
# HIFLD rows; unmatched NYT rows get NaN in every HIFLD column.
df = pd.merge(nyt, hifld, how = "left", 
              left_on = ["Name", "City", "State"], right_on = ["NAME", "CITY", "STATE"])
df.shape

(544, 38)

In [28]:
# unmatched
df["FID"].isna().sum()

441

In [29]:
def clean_names(df, col):
    """Normalize facility names in ``df[col]`` so that spelling variants compare equal.

    Removes commas, periods, apostrophes and the suffixes " LTD", " LLC", " INC";
    rewrites "&" as "AND", "HEALTHCARE" as "HEALTH CARE" and "REHABILITATION" as
    "REHAB"; then collapses runs of spaces and strips leading/trailing whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the name column. It is modified in place.
    col : str
        Name of the (string-valued) column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    All replacements are literal substring replacements (``regex=False``), so "."
    never acts as a regex wildcard regardless of the pandas version. Missing values
    are passed through unchanged instead of raising.
    """
    # (old, new) pairs, applied in this order; order matters (e.g. "." is removed
    # before " INC" so that "INC." is fully stripped).
    replacements = (
        (",", ""),
        (".", ""),
        (" LTD", ""),
        (" LLC", ""),
        (" INC", ""),
        ("&", "AND"),
        ("'", ""),
        ("HEALTHCARE", "HEALTH CARE"),
        ("REHABILITATION", "REHAB"),
    )
    names = df[col]
    for old, new in replacements:
        names = names.str.replace(old, new, regex=False)
    # Collapse duplicated spaces with the vectorized string method, which skips NaN.
    names = names.str.replace(r" +", " ", regex=True)
    df[col] = names.str.strip()
    return df

def clean_cities(df, col):
    """Normalize city names in ``df[col]`` so that spelling variants compare equal.

    Removes commas, periods and apostrophes, abbreviates "SAINT" to "ST" and
    "MOUNT " to "MT ", drops the word "TOWNSHIP", and strips surrounding whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the city column. It is modified in place.
    col : str
        Name of the (string-valued) column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    Replacements are literal substrings (``regex=False``), so they also apply
    inside longer words. Interior double spaces left by a removed word are not
    collapsed. Removing "CITY" is deliberately not part of the rules.
    """
    # (old, new) pairs, applied in this order.
    replacements = (
        (",", ""),
        (".", ""),
        ("SAINT", "ST"),
        ("MOUNT ", "MT "),
        ("TOWNSHIP", ""),
        ("'", ""),
    )
    cities = df[col]
    for old, new in replacements:
        cities = cities.str.replace(old, new, regex=False)
    df[col] = cities.str.strip()
    return df

def clean_names2(df, col):
    """Aggressively shorten facility names in ``df[col]`` for a second fuzzy-matching pass.

    Applies the same punctuation/suffix normalization as the basic name cleaning
    and then deletes generic facility words (REHAB, CENTER, NURSING HOME, SENIOR,
    LIVING, AND, HEALTH, ...), so mostly the distinctive part of the name is left.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the name column. It is modified in place.
    col : str
        Name of the (string-valued) column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    Every rule is a literal substring replacement (``regex=False``), not a
    whole-word match: "ANDOVER" becomes "OVER" and "PRUITTHEALTH" becomes
    "PRUITT". Both data sets go through the same rules, so this is kept as is.
    "THE", "ASSISTED" and "CARE" are deliberately not removed. Missing values are
    passed through unchanged instead of raising.
    """
    # Normalization rules, applied first and in this order.
    normalizations = (
        (",", ""),
        (".", ""),
        (" LTD", ""),
        (" LLC", ""),
        (" INC", ""),
        ("&", "AND"),
        ("'", ""),
        ("HEALTHCARE", "HEALTH CARE"),
    )
    # Generic words deleted afterwards. Order matters: longer phrases come before
    # the shorter words they contain (REHABILITATION before REHAB, NURSING HOME
    # before NURSING).
    dropped_words = (
        "REHABILITATION",
        "REHAB",
        "CENTER",
        "CENTRE",
        "FACILITY",
        "NURSING HOME",
        "CONVALESCENT",
        "NURSING",
        "WELLNESS SUITES",
        "RETIREMENT COMMUNITY",
        "SENIOR",
        "LIVING",
        "AND",
        "HEALTH",
    )
    names = df[col]
    for old, new in normalizations:
        names = names.str.replace(old, new, regex=False)
    for word in dropped_words:
        names = names.str.replace(word, "", regex=False)
    # Collapse duplicated spaces with the vectorized string method, which skips NaN.
    names = names.str.replace(r" +", " ", regex=True)
    df[col] = names.str.strip()
    return df

In [30]:
# clean names and cities
# The clean_* helpers modify the frame they are given and return it, so `nyt` and
# `hifld` are normalized in place here.
nyt = clean_names(nyt, "Name")
nyt = clean_cities(nyt, "City")
hifld = clean_names(hifld, "NAME")
hifld = clean_cities(hifld, "CITY")
# Deep copies are taken first so the more aggressive clean_names2 pass only affects
# the "2" variants and leaves the frames above untouched.
nyt2 = copy.deepcopy(nyt)
nyt2 = clean_names2(nyt2, "Name")
hifld2 = copy.deepcopy(hifld)
hifld2 = clean_names2(hifld2, "NAME")

In [31]:
nyt["Name"][:15]

0                     PARAMUS VETERANS MEMORIAL HOME
1      BERGEN NEW BRIDGE MEDICAL CENTER NURSING HOME
2                   FUTURECARE LOCHEARN NURSING HOME
3               ANDOVER SUBACUTE AND REHAB CENTER II
4                        WOODBRIDGE NURSING PAVILION
5                         THE HARBORAGE NURSING HOME
6                   CANTERBURY CARE AND REHAB CENTER
7                       CHRISTIAN HEALTH CARE CENTER
8                       REDWOOD SPRINGS NURSING HOME
9                   MEADOWBROOK MANOR OF BOLINGBROOK
10                 PRUITTHEALTH PALMYRA NURSING HOME
11                  CLINTON NURSING AND REHAB CENTER
12    SYMPHONY OF MORGAN PARK REHABILLITATION CENTER
13      HACKENSACK MERIDIAN HEALTH NURSING AND REHAB
14               CAREONE AT NEW MILFORD NURSING HOME
Name: Name, dtype: object

In [32]:
nyt2["Name"][:15]

0         PARAMUS VETERANS MEMORIAL HOME
1              BERGEN NEW BRIDGE MEDICAL
2                    FUTURECARE LOCHEARN
3                       OVER SUBACUTE II
4                    WOODBRIDGE PAVILION
5                          THE HARBORAGE
6                        CANTERBURY CARE
7                         CHRISTIAN CARE
8                        REDWOOD SPRINGS
9       MEADOWBROOK MANOR OF BOLINGBROOK
10                        PRUITT PALMYRA
11                               CLINTON
12    SYMPHONY OF MORGAN PARK ILLITATION
13                   HACKENSACK MERIDIAN
14                CAREONE AT NEW MILFORD
Name: Name, dtype: object

In [33]:
# exact matching
# Same left join as before, repeated on the cleaned names and cities to see how many
# more NYT rows now find an exact HIFLD match.
df = pd.merge(nyt, hifld, how = "left", 
              left_on = ["Name", "City", "State"], right_on = ["NAME", "CITY", "STATE"])
df.shape

(544, 38)

In [34]:
# unmatched
df["FID"].isna().sum()

398

In [35]:
# cities that aren't in hifld
# Build the HIFLD city lookup once; recomputing .unique() and scanning the array for
# every NYT city is needlessly quadratic.
hifld_cities = set(hifld.CITY.unique())
[city for city in nyt.City.unique() if city not in hifld_cities]

['LOPATCONG',
 'FAIR LAWN',
 'MIDDLEBROOK',
 'CALVERTON',
 'BLAIRSTOWN',
 'QUEENS',
 'EAST BRADENTON']

In [36]:
from fuzzywuzzy import process, fuzz

In [37]:
# Match every NYT facility to one HIFLD FID. Exactly one entry is appended to
# `matched_fid` and `matched_level` per NYT row so both lists stay aligned with `nyt`.
# Matched Level:
#   0 = several exact matches, resolved by a hard-coded FID
#   1 = unique exact match on name, city and state
#   2 = fuzzy match (WRatio >= threshold) on cleaned names within the same city/state
#   3 = fuzzy match (WRatio >= threshold) on the shortened names from clean_names2
#   4 = best fuzz.ratio candidate on the shortened names, accepted whatever its score
#   5 = no HIFLD facility in that city/state (FID is NaN)
#   6 = several exact matches and no hard-coded rule (FID is NaN, needs manual fix)
FUZZY_MIN_SCORE = 87  # minimum WRatio score for a fuzzy name match to be accepted

matched_fid = []
matched_level = []

# NOTE: nyt.loc[i, ...] with i from range() relies on `nyt` (and `nyt2`) having the
# default 0..n-1 index labels.
for i in range(nyt.shape[0]):
    name = nyt.loc[i, "Name"]
    name2 = nyt2.loc[i, "Name"]
    city = nyt.loc[i, "City"]
    state = nyt.loc[i, "State"]

    matched_all = hifld.loc[(hifld["NAME"] == name) & (hifld["CITY"] == city) & (hifld["STATE"] == state)]

    if matched_all.shape[0] == 1:  # one exact match
        matched_fid.append(matched_all.iloc[0]["FID"])
        matched_level.append(1)
    elif matched_all.shape[0] > 1:
        if matched_all.NAME.iloc[0] == "CHRISTIAN HEALTH CARE CENTER":
            # known ambiguous facility: FID chosen by hand
            matched_fid.append(6658)
            matched_level.append(0)
        else:
            print("Multiple exact matches for: " + name)
            # Still record a placeholder, otherwise the result lists fall out of
            # step with the rows of `nyt`.
            matched_fid.append(np.nan)
            matched_level.append(6)
    else:  # if no exact match, do fuzzy matching
        hifld_matched = hifld.loc[(hifld["CITY"] == city) & (hifld["STATE"] == state)]  # exact matching on city and state
        if hifld_matched.shape[0] > 0:
            matched = process.extractOne(name, hifld_matched["NAME"], scorer=fuzz.WRatio)
            if matched[1] >= FUZZY_MIN_SCORE:
                matched_fids = hifld_matched.loc[hifld_matched["NAME"] == matched[0]]
                matched_level.append(2)
            else:
                hifld2_matched = hifld2.loc[(hifld2["CITY"] == city) & (hifld2["STATE"] == state)]  # use abbreviated names
                matched = process.extractOne(name2, hifld2_matched["NAME"], scorer=fuzz.WRatio)
                if matched[1] >= FUZZY_MIN_SCORE:
                    matched_fids = hifld2_matched.loc[hifld2_matched["NAME"] == matched[0]]
                    matched_level.append(3)
                else:
                    # last resort: take the closest candidate by plain edit ratio
                    matched = process.extractOne(name2, hifld2_matched["NAME"], scorer=fuzz.ratio)
                    matched_fids = hifld2_matched.loc[hifld2_matched["NAME"] == matched[0]]
                    matched_level.append(4)

            if matched_fids.shape[0] == 1:
                matched_fid.append(matched_fids["FID"].iloc[0])
            else:
                # Several HIFLD rows share the matched name: prefer the one with the
                # highest population, ignoring NaNs; if every population is NaN,
                # take the first row.
                population = matched_fids["POPULATION"]
                if population.isna().all():
                    matched_fid.append(matched_fids["FID"].iloc[0])
                else:
                    largest = matched_fids.loc[population == population.max()]
                    matched_fid.append(largest["FID"].iloc[0])
        else:
            matched_fid.append(np.nan)
            matched_level.append(5)

In [38]:
len(matched_fid)

543

In [39]:
nyt["Matched FID"] = matched_fid
nyt["Matched Level"] = matched_level

In [40]:
nyt["Matched Level"].value_counts()

2    188
1    144
3    102
4     94
5     14
0      1
Name: Matched Level, dtype: int64

In [41]:
nyt.head()

Unnamed: 0,Name,Cases_2020-05-11,Deaths_2020-05-11,City,State,Matched FID,Matched Level
0,PARAMUS VETERANS MEMORIAL HOME,264,71,PARAMUS,NJ,26380.0,2
1,BERGEN NEW BRIDGE MEDICAL CENTER NURSING HOME,233,26,PARAMUS,NJ,2677.0,4
2,FUTURECARE LOCHEARN NURSING HOME,231,21,BALTIMORE,MD,22399.0,2
3,ANDOVER SUBACUTE AND REHAB CENTER II,189,55,ANDOVER,NJ,17465.0,2
4,WOODBRIDGE NURSING PAVILION,183,17,CHICAGO,IL,25193.0,1


In [42]:
manual_merge_tab = pd.read_csv("../../data/nursinghome_level/manual_merge_table.csv")
manual_merge_tab.dtypes

Name     object
FID       int64
City     object
State    object
dtype: object

In [43]:
manual_merge_tab.head()

Unnamed: 0,Name,FID,City,State
0,VIRTUA HEALTH AND REHAB CENTER AT MT HOLLY,11525,MT HOLLY,NJ
1,FOREST MANOR HEALTH CARE CENTER,8624,BLAIRSTOWN,NJ
2,PARKER JEWISH INSTITUTE FOR HEALTH CARE AND REHAB,6497,QUEENS,NY
3,RIDERWOOD ASSISTED LIVING FACILITY,21060,CALVERTON,MD
4,MANATEE SPRINGS REHAB AND NURSING CENTER,28526,EAST BRADENTON,FL


In [44]:
# fix some nursing home merge FIDs manually
# Each row of the manual table overrides the matched FID of the NYT facility with the
# same (cleaned) name, city and state.
manual_rows = manual_merge_tab[["Name", "FID", "City", "State"]].itertuples(index=False, name=None)
for name, fid, city, state in manual_rows:
    idx = (nyt.Name == name) & (nyt.City == city) & (nyt.State == state)
    # Single .loc call on the frame: the chained form nyt["Matched FID"].loc[idx] = ...
    # may write to a temporary copy and leave `nyt` unchanged.
    nyt.loc[idx, "Matched FID"] = fid

In [45]:
nyt.head()

Unnamed: 0,Name,Cases_2020-05-11,Deaths_2020-05-11,City,State,Matched FID,Matched Level
0,PARAMUS VETERANS MEMORIAL HOME,264,71,PARAMUS,NJ,26380.0,2
1,BERGEN NEW BRIDGE MEDICAL CENTER NURSING HOME,233,26,PARAMUS,NJ,2677.0,4
2,FUTURECARE LOCHEARN NURSING HOME,231,21,BALTIMORE,MD,22399.0,2
3,ANDOVER SUBACUTE AND REHAB CENTER II,189,55,ANDOVER,NJ,17465.0,2
4,WOODBRIDGE NURSING PAVILION,183,17,CHICAGO,IL,25193.0,1


In [46]:
nyt["Matched FID"].nunique()

530

In [47]:
# need to fix these duplicates (manual check)
# Show every NYT row whose matched FID is shared with another row (keep = False flags
# all members of a group). Rows with FID -999 are excluded; -999 appears to be the
# placeholder for "no HIFLD counterpart" -- TODO confirm against the manual merge table.
nyt.loc[nyt.duplicated(subset = "Matched FID", keep = False) & (nyt["Matched FID"] != -999)].sort_values("Matched FID")

Unnamed: 0,Name,Cases_2020-05-11,Deaths_2020-05-11,City,State,Matched FID,Matched Level
414,CAREONE AT THE CUPOLA LONG-TERM CARE,60,14,PARAMUS,NJ,6656.0,4
455,CAREONE AT THE CUPOLA ASSISTED LIVING,57,14,PARAMUS,NJ,6656.0,4
66,WINDSOR PARK MANOR RETIREMENT COMMUNITY,118,21,CAROL STREAM,IL,13272.0,2
182,COVENANT LIVING AT WINDSOR PARK LONG-TERM CARE,84,19,CAROL STREAM,IL,13272.0,4
13,HACKENSACK MERIDIAN HEALTH NURSING AND REHAB,167,19,HACKENSACK,NJ,20489.0,4
46,HACKENSACK MERIDIAN HEALTH PROSPECT HEIGHTS CA...,128,10,HACKENSACK,NJ,20489.0,2


In [48]:
# need to fix merge in raw/nyt_nursinghomes/
# after manual inspection, decide to take max
# For every FID matched by more than one NYT row, keep only the row with the most
# cases. The surviving rows are re-appended at the end of `nyt`.
cases_col = "Cases_2020-05-11"  # date-stamped case-count column of the NYT snapshot

nyt_duplicated_ls = []
for fid in nyt["Matched FID"].loc[nyt["Matched FID"].duplicated()].unique():
    if fid == -999:  # placeholder FID, not a real facility: leave those rows alone
        continue
    nyt_duplicated = nyt.loc[nyt["Matched FID"] == fid]
    if nyt_duplicated.empty:  # NaN FIDs never compare equal; nothing to resolve
        continue
    # Keep exactly one row (the first with the maximum case count) so a tie cannot
    # leave the FID duplicated.
    best_pos = nyt_duplicated[cases_col].to_numpy().argmax()
    nyt_duplicated = nyt_duplicated.iloc[[best_pos]]
    nyt_duplicated_ls.append(nyt_duplicated)
    nyt = nyt.loc[nyt["Matched FID"] != fid]

# pd.concat raises on an empty list, which would break a re-run of this cell once
# the duplicates are already resolved.
if nyt_duplicated_ls:
    nyt_duplicated = pd.concat(nyt_duplicated_ls, axis = 0, sort = False)
    nyt = pd.concat([nyt, nyt_duplicated], axis = 0, sort = False)

In [49]:
nyt["Matched FID"].loc[nyt["Matched FID"] != -999].nunique()

529

In [50]:
nyt["Matched FID"].loc[nyt["Matched FID"] != -999].shape

(529,)

In [51]:
nyt.shape

(540, 7)

In [52]:
nyt.to_csv("full_merge_table.csv", index=False)

In [53]:
# Cast the matched FIDs to integers so they can be joined against the FID column.
# astype(int) raises if any NaN is left, so every unmatched row must have been given
# an FID (or the -999 placeholder) before this point.
nyt["Matched FID"] = nyt["Matched FID"].astype(int)

In [54]:
# merge with hifld
# Left join keeps every NYT row; rows whose matched FID has no counterpart in hifld
# get NaN in all HIFLD columns.
data = pd.merge(nyt, hifld, left_on = "Matched FID", right_on = "FID", how = "left")

In [55]:
data.shape

(540, 40)

In [56]:
data.head()

Unnamed: 0,Name,Cases_2020-05-11,Deaths_2020-05-11,City,State,Matched FID,Matched Level,FID,ID,NAME,...,WEBSITE,TOT_RES,TOT_STAFF,BEDS,EXCESS_BED,OWNERSHIP,MEDICAIDID,MEDICAREID,STATE_LIC,SOURCETYPE
0,PARAMUS VETERANS MEMORIAL HOME,264,71,PARAMUS,NJ,26380,2,26380.0,3400363.0,N J VETERANS MEMORIAL HOME - PARAMUS,...,NOT AVAILABLE,-999.0,-999.0,336.0,-999.0,GOVT - STATE,NOT AVAILABLE,NOT AVAILABLE,60228,LONG TERM CARE FACILITY
1,BERGEN NEW BRIDGE MEDICAL CENTER NURSING HOME,233,26,PARAMUS,NJ,2677,4,2677.0,3400079.0,BERGEN REGIONAL MEDICAL CENTER,...,NOT AVAILABLE,-999.0,-999.0,574.0,-999.0,GOVT - COUNTY,NOT AVAILABLE,NOT AVAILABLE,10201L,LONG TERM CARE FACILITY
2,FUTURECARE LOCHEARN NURSING HOME,231,21,BALTIMORE,MD,22399,2,22399.0,24647.0,LOCHEARN NURSING HOME,...,NOT AVAILABLE,-999.0,-999.0,200.0,-999.0,NOT AVAILABLE,NOT AVAILABLE,NOT AVAILABLE,NOT AVAILABLE,COMPREHENSIVE CARE FACILITIES AND EXTENDED CAR...
3,ANDOVER SUBACUTE AND REHAB CENTER II,189,55,ANDOVER,NJ,17465,2,17465.0,3400029.0,ANDOVER SUBACUTE AND REHAB II,...,NOT AVAILABLE,-999.0,-999.0,543.0,-999.0,PROFIT - CORPORATION,NOT AVAILABLE,NOT AVAILABLE,61901,LONG TERM CARE FACILITY
4,WOODBRIDGE NURSING PAVILION,183,17,CHICAGO,IL,25193,1,25193.0,171190.0,WOODBRIDGE NURSING PAVILION,...,NOT AVAILABLE,-999.0,-999.0,222.0,-999.0,FOR-PROF CORPORATION,NOT AVAILABLE,NOT AVAILABLE,6007074,NURSING HOME


In [57]:
data.isna().sum()

Name                  0
Cases_2020-05-11      0
Deaths_2020-05-11     0
City                  0
State                 0
Matched FID           0
Matched Level         0
FID                  11
ID                   11
NAME                 11
ADDRESS              11
CITY                 11
STATE                11
ZIP                  11
ZIP4                 11
TELEPHONE            11
TYPE                 11
STATUS               11
POPULATION           11
COUNTY               11
COUNTYFIPS           11
COUNTRY              11
LATITUDE             11
LONGITUDE            11
NAICS_CODE           11
NAICS_DESC           11
SOURCE               11
SOURCEDATE           11
VAL_METHOD           11
VAL_DATE             11
WEBSITE              11
TOT_RES              11
TOT_STAFF            11
BEDS                 11
EXCESS_BED           11
OWNERSHIP            11
MEDICAIDID           11
MEDICAREID           11
STATE_LIC            11
SOURCETYPE           11
dtype: int64

In [58]:
(data["Matched FID"] == -999).sum()

11

In [60]:
# Turn the numeric missing-value sentinel -999 into NaN across the whole frame.
# This also blanks "Matched FID" for rows carrying the -999 placeholder; the string
# sentinel "NOT AVAILABLE" is left untouched.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data = data.replace(-999, np.nan)

In [61]:
data.isna().sum()

Name                   0
Cases_2020-05-11       0
Deaths_2020-05-11      0
City                   0
State                  0
Matched FID           11
Matched Level          0
FID                   11
ID                    11
NAME                  11
ADDRESS               11
CITY                  11
STATE                 11
ZIP                   11
ZIP4                  11
TELEPHONE             11
TYPE                  11
STATUS                11
POPULATION            88
COUNTY                11
COUNTYFIPS            11
COUNTRY               11
LATITUDE              11
LONGITUDE             11
NAICS_CODE            11
NAICS_DESC            11
SOURCE                11
SOURCEDATE            11
VAL_METHOD            11
VAL_DATE              11
WEBSITE               11
TOT_RES              522
TOT_STAFF            538
BEDS                  88
EXCESS_BED           522
OWNERSHIP             11
MEDICAIDID            11
MEDICAREID            11
STATE_LIC             11
SOURCETYPE            11


In [62]:
data.to_csv("nursinghomes_data.csv", index=False)