In [1]:
import pandas as pd
import numpy as np
import os
import datetime

In [49]:
neighborcounties = pd.read_csv('neighborcounties.csv', encoding='latin1', index_col=0)
neighborstats = pd.read_csv('neighborstats.csv', encoding='latin1')
google_neighbor = pd.read_csv('google_neighbor.csv', encoding='latin1').drop(columns=['Unnamed: 0'])
Age_Race = pd.read_csv('Age_Race.csv', encoding='latin1',index_col=0)
Pop_60 = pd.read_csv('Pop_60.csv')
Density = pd.read_csv('Density.csv')
JHU = pd.read_csv('aggregate_jhu-copy.csv')       

In [3]:
def fillframe_date(fips, date, value, colname, neighborcounties, neighborstats, min_neighbors=2):
    #Is able to fill a dataframe that has a date column by pivoting by date
    #And filling in NaNs by factor & date for each county, using fillcol method
    
    #Making Dataframe to sort values by dates, pivoting
    frame = pd.DataFrame(data = [fips,date,value])
    frame = frame.T
    frame.columns = ['FIPS', 'Date', colname]
    frame = frame.pivot(index = 'FIPS', columns='Date', values=[colname]).reset_index()
    
    #fixing up the frame to haev proper columns
    frame.to_csv('frame.csv')
    frame = pd.read_csv('frame.csv', encoding='latin1')
    frame = frame.rename(columns=frame.iloc[0]).drop(frame.index[0])
    frame.columns = frame.columns.fillna('FIPS')
    frame = frame.drop(columns = 'Date')
    
    #Iterating through each date to fill NaNs
    datelist = frame.columns    
    datelist = datelist[1:]
    for d in datelist:
        frame[d] = fillcol(frame['FIPS'], frame[d], neighborcounties, neighborstats, min_neighbors)
    #Unpivoting the frame so it can be returned
    frame = frame.melt(id_vars=['FIPS']).sort_values(['FIPS','variable']).reset_index(drop=True)
    frame.columns = ['FIPS', 'Date', colname]
    
    return frame

In [4]:
def fillcol(fips, value, neighborcounties, neighborstats, min_neighbors=2):
    #Takes in a column of fips codes, and any type of datafield with some NaNs,
    #Computes distance-weighted average of the value across all neighbors of NaN counties
    
    #Loading in the fips and value into proper dataframes
    df = pd.DataFrame(data = [fips,value]).T
    df.columns = ['FIPS', 'Values']
    df.Values = df.Values.astype(float)
    df = df.set_index('FIPS')
    
    #creating new column to set to the current dataframe values
    newcol = []
    for ind in df.index:
        #for any entries with NaNs
        if np.isnan(df['Values'][ind]):
            #list of neighbors for NaN county
            neighbors = list(neighborcounties[neighborcounties['orgfips'] == ind]['adjfips'])
            nonzero = 0
            weightedval = 0
            totalinvdist = 0
            totaldist = 0
            vals = 0
            #iterates though neighbors of NaN county with non-NaN entires
            for n in neighbors:
                if n in df.index:
                    if ~np.isnan(df['Values'][n]):
                        #Getting weighted values, using 1/dist as a scalar to show closer distance counts more
                        nonzero += 1
                        dist = list(neighborcounties.query('orgfips == ' + str(ind) + ' and adjfips == ' + str(n))['Pop_10'])[0]
                        totalinvdist += (1/dist)**1
                        weightedval += ((1/dist)**1)*df['Values'][n]
            #If there are at least 2 neighbors (this can be adjusted)
            if nonzero >= min_neighbors:
                newcol.append(weightedval/(totalinvdist))
            else:
                newcol.append(np.nan)
        else:
            newcol.append(df['Values'][ind])
    return newcol

In [5]:
def fillgoogle(colname, google_neighbor):
    #Method to fill up the google mobility data
    #Uses colname to designate which column to fill
    numnans = len(google_neighbor[np.isnan(google_neighbor[colname])])
    while numnans > 0:
        tempnum = numnans
        #Creating the filled column from method
        temp = fillframe_date(google_neighbor['FIPS'],google_neighbor['Date'],google_neighbor[colname], colname, neighborcounties, neighborstats)
        google_neighbor[colname] = temp[colname]
        numnans = len(google_neighbor[np.isnan(google_neighbor[colname])])
        #Checking if the number of nans changes
        if tempnum == numnans:
            #if number doesnt change, try again with only 1 neighbor, otherwise quit
            temp = fillframe_date(google_neighbor['FIPS'],google_neighbor['Date'],google_neighbor[colname], colname, neighborcounties, neighborstats, 1)
            google_neighbor[colname] = temp[colname]
            numnans = len(google_neighbor[np.isnan(google_neighbor[colname])])
            if tempnum == numnans:
                numnans = 0     
        
    return google_neighbor

In [6]:
def fillfixed(colname, data, code, neighborcounties, neighborstats):
    #Method to fill up the google mobility data
    #Uses colname to designate which column to fill
    numnans = len(data[np.isnan(data[colname])])
    while numnans > 0:
        tempnum = numnans
        #Creating the filled column from method
        newcol = fillcol(data[code], data[colname], neighborcounties, neighborstats)
        data[colname] = newcol
        numnans = len(data[np.isnan(data[colname])])
        #Checking if the number of nans changes
        if tempnum == numnans:
            #if number doesnt change, try again with only 1 neighbor, otherwise quit
            newcol = fillcol(data[code], data[colname], neighborcounties, neighborstats)
            data[colname] = newcol
            numnans = len(data[np.isnan(data[colname])])
            if tempnum == numnans:
                numnans = 0     
    return data    

In [7]:
def popfill(total,estimate):
    #Fills in the population of each county, merging density with Pop_60
    if np.isnan(total):
        return estimate
    else:
        return total

In [8]:
#List of all the counties with neighbor data
county_list = pd.DataFrame(neighborcounties.orgfips.unique(),columns = ['fips']).set_index('fips')
#Calculating the Total population for each county
Total_Pop = county_list.join(Pop_60.set_index('FIPS'), how='left')
#Adding in extra approximated rows for NaN values, using Density
Density['Pop'] = Density['2010 Density per square mile of land area - Population'] * Density['Area in square miles - Land area']
Density = Density[['FIPS', 'Pop']]
Total_Pop = Total_Pop.join(Density.set_index('FIPS'), how='left')
Total_Pop['Population'] = Total_Pop.apply(lambda x: popfill(x['total_pop'],x['Pop']), axis=1)
Total_Pop = Total_Pop['Population']
Total_Pop.to_csv('Total_Pop')

In [9]:
#Adding extra rows to demographic info to include all counties
Age_Race = county_list.join(Age_Race, how='left')
#Setting the total population for each county
Age_Race['Total population'] = list(Total_Pop)
#Dividing the entires by population
Age_Race = Age_Race.astype(str).astype(float)
Age_Race = Age_Race.divide(Age_Race['Total population'], axis=0).reset_index()

#Going through each column of Age_Race to fill NaNs
cols = list(Age_Race.columns)[2:]
for col in cols:
    Age_Race = fillfixed(col, Age_Race, 'fips', neighborcounties, neighborstats)
Age_Race = Age_Race.dropna()

In [10]:
Age_Race.to_csv('Age_Race_Filled.csv')

In [11]:
cols = list(google_neighbor.columns)[1:]
for col in cols:
    google_neighbor = fillgoogle(col, google_neighbor)
google_neighbor = google_neighbor.fillna(0)

In [12]:
google_neighbor.to_csv('google_mobility_filled.csv')

In [53]:
JHU = JHU[JHU['FIPS'] % 1000 != 0]
JHU = JHU.drop(columns = ['State', 'Area_Name'])
JHU.head

<bound method NDFrame.head of        FIPS  Rural-urban_Continuum Code_2013  Urban_Influence_Code_2013  \
2      1001                              2.0                        2.0   
3      1003                              3.0                        2.0   
4      1005                              6.0                        6.0   
5      1007                              1.0                        1.0   
6      1009                              1.0                        1.0   
...     ...                              ...                        ...   
3268  72145                              1.0                        1.0   
3269  72147                              7.0                       12.0   
3270  72149                              2.0                        2.0   
3271  72151                              1.0                        1.0   
3272  72153                              2.0                        2.0   

      Economic_typology_2015  POP_ESTIMATE_2018  N_POP_CHG_2018  Birt

In [55]:
#Going through each column of Aggregate JHU to fill NaNs
cols = list(JHU.columns)[2:]
for col in cols:
    JHU = fillfixed(col, JHU, 'FIPS', neighborcounties, neighborstats)
JHU = JHU.dropna()

In [None]:
JHU.to_csv('aggregate_jhu_filled.csv')

In [None]:
Age_Race.head(20)

In [56]:
google_neighbor.head(20)

Unnamed: 0,Date,Retail & recreation,Grocery & pharmacy,Parks,Transit stations,Workplaces,Residential,FIPS
0,2020-03-29,-35.0,7.0,,,-30.0,,45001
1,2020-04-05,-30.0,7.0,,,-39.0,,45001
2,2020-04-11,-30.0,7.0,,,-25.0,,45001
3,2020-03-29,-24.0,-5.0,,-36.0,-21.0,15.0,22001
4,2020-04-05,-27.0,11.0,,-46.0,-30.0,16.0,22001
5,2020-04-11,-31.0,23.0,,-52.0,-28.0,12.0,22001
6,2020-03-29,-30.0,-18.0,,,-26.0,13.0,51001
7,2020-04-05,-30.0,-19.0,,,-25.0,14.0,51001
8,2020-04-11,-24.0,-4.0,,,-19.0,14.0,51001
9,2020-03-29,-49.0,-23.0,33.0,-54.0,-43.0,11.0,16001


In [None]:
JHU.head(20)