In [1]:
import pandas as pd
import numpy as np
import os
import datetime
import time

In [2]:
import git
import sys
repo = git.Repo("./", search_parent_directories=True)
homedir = repo.working_dir

In [3]:
#Neighbor Data
neighborcounties = pd.read_csv(f"{homedir}/models/processing/USA/County_Based/neighborcounties.csv", index_col = 0)
#Google Data
Google = pd.read_csv(f"{homedir}/models/processing/USA/County_Based/google_new.csv", index_col = 0)
#Fixed Data
Age_Race = pd.read_csv(f"{homedir}/models/processing/USA/County_Based/Age_Race.csv", index_col = 0)
Pop_60 = pd.read_csv(f"{homedir}/models/processing/USA/County_Based/Pop_60.csv")
Density = pd.read_csv(f"{homedir}/models/processing/USA/County_Based/Density.csv")

In [4]:
def fillframe_date(fips, date, value, colname, neighborcounties, min_neighbors=2):
    #Is able to fill a dataframe that has a date column by pivoting by date
    #And filling in NaNs by factor & date for each county, using fillcol method
    #Making Dataframe to sort values by dates, pivoting
    tic = time.time()
    frame = pd.DataFrame(data = [fips,date,value])
    frame = frame.T
    frame.columns = ['FIPS', 'Date', colname]
    frame = frame.pivot(index = 'FIPS', columns='Date', values=[colname]).reset_index()
    
    #fixing up the frame to haev proper columns
    frame.to_csv('frame.csv')
    frame = pd.read_csv('frame.csv', encoding='latin1')
    frame = frame.rename(columns=frame.iloc[0]).drop(frame.index[0])
    frame.columns = frame.columns.fillna('FIPS')
    frame = frame.drop(columns = 'Date')
    toc = time.time()
    print(toc - tic)
    #Iterating through each date to fill NaNs
    datelist = frame.columns    
    datelist = datelist[1:]
    for d in datelist:
        print(d)
        tic = time.time()
        frame[d] = fillcol(frame['FIPS'], frame[d], neighborcounties, min_neighbors)
        toc = time.time()
        print(toc - tic)
    #Unpivoting the frame so it can be returned
    frame = frame.melt(id_vars=['FIPS']).sort_values(['FIPS','variable']).reset_index(drop=True)
    frame.columns = ['FIPS', 'Date', colname]
    
    return frame

In [5]:
def fillcol(fips, value,neighborcounties, min_neighbors=2):
    #Takes in a column of fips codes, and any type of datafield with some NaNs,
    #Computes distance-weighted average of the value across all neighbors of NaN counties
    tic1 = time.time()
    #Loading in the fips and value into proper dataframes
    #This is the df with only nan values
    df = pd.DataFrame(data = [fips,value]).T
    df.columns = ['FIPS', 'Values']
    df.Values = df.Values.astype(float)
    df = df.set_index('FIPS')
    
    #creating new column to set to the current dataframe values
    newcol = []
    for ind in df.index:
        #for any entries with NaNs
        if np.isnan(df['Values'][ind]):
            #list of neighbors for NaN county
            neighbors = list(neighborcounties[neighborcounties['orgfips'] == ind]['adjfips'])
            nonzero = 0
            weightedval = 0
            totalinvdist = 0
            totaldist = 0
            vals = 0
            #iterates though neighbors of NaN county with non-NaN entires
            for n in neighbors:
                if n in df.index:
                    if ~np.isnan(df['Values'][n]):
                        #Getting weighted values, using 1/dist as a scalar to show closer distance counts more
                        nonzero += 1
                        dist = list(neighborcounties.query('orgfips == ' + str(ind) + ' and adjfips == ' + str(n))['Pop_10'])[0]
                        totalinvdist += (1/dist)**1
                        weightedval += ((1/dist)**1)*df['Values'][n]
            #If there are at least 2 neighbors (this can be adjusted)
            if nonzero >= min_neighbors:
                newcol.append(weightedval/(totalinvdist))
            else:
                newcol.append(np.nan)
        else:
            newcol.append(df['Values'][ind])
    toc1 = time.time()
    print(toc1 - tic1)
    return newcol

In [6]:
def fillgoogle(colname, google_neighbor):
    #Method to fill up the google mobility data
    #Uses colname to designate which column to fill
    #rows that have NaN values at the value of interest
    numnans = len(google_neighbor[np.isnan(google_neighbor[colname])])
    while numnans > 0:
        print('1: ' + str(numnans))
        tempnum = numnans
        #Creating the filled column from method
        temp = fillframe_date(google_neighbor['FIPS'],google_neighbor['Date'],google_neighbor[colname], colname, neighborcounties)
        google_neighbor[colname] = temp[colname]
        numnans = len(google_neighbor[np.isnan(google_neighbor[colname])])
        print('2: ' + str(numnans))
        #Checking if the number of nans changes
        if tempnum == numnans:
            print(numnans)
            print('3: ' + str(numnans))
            #if number doesnt change, try again with only 1 neighbor, otherwise quit
            temp = fillframe_date(google_neighbor['FIPS'],google_neighbor['Date'],google_neighbor[colname], colname, neighborcounties, 1)
            google_neighbor[colname] = temp[colname]
            numnans = len(google_neighbor[np.isnan(google_neighbor[colname])])
            print('4: ' + str(numnans))
            if tempnum <= numnans:
                numnans = 0
        elif tempnum < numnans:
            numnans = 0
        
    return google_neighbor

In [7]:
def fillfixed(colname, data, code, neighborcounties):
    #Method to fill up the google mobility data
    #Uses colname to designate which column to fill
    numnans = len(data[np.isnan(data[colname])])
    while numnans > 0:
        print(numnans)
        tempnum = numnans
        #Creating the filled column from method
        newcol = fillcol(data[code], data[colname], neighborcounties)
        data[colname] = newcol
        numnans = len(data[np.isnan(data[colname])])
        #Checking if the number of nans changes
        if tempnum == numnans:
            #if number doesnt change, try again with only 1 neighbor, otherwise quit
            newcol = fillcol(data[code], data[colname], neighborcounties)
            data[colname] = newcol
            numnans = len(data[np.isnan(data[colname])])
            if tempnum == numnans:
                numnans = 0     
    return data

In [8]:
def popfill(total,estimate):
    #Fills in the population of each county, merging density with Pop_60
    if np.isnan(total):
        return estimate
    else:
        return total

In [9]:
#List of all the counties with neighbor data
county_list = pd.DataFrame(neighborcounties.orgfips.unique(),columns = ['fips']).set_index('fips')
#Calculating the Total population for each county
Total_Pop = county_list.join(Pop_60.set_index('FIPS'), how='left')
#Adding in extra approximated rows for NaN values, using Density
Density['Pop'] = Density['2010 Density per square mile of land area - Population'] * Density['Area in square miles - Land area']
Density = Density[['FIPS', 'Pop']]
Total_Pop = Total_Pop.join(Density.set_index('FIPS'), how='left')
Total_Pop['Population'] = Total_Pop.apply(lambda x: popfill(x['total_pop'],x['Pop']), axis=1)
Total_Pop = Total_Pop['Population']
Total_Pop.to_csv('Total_Pop')

In [10]:
#Adding extra rows to demographic info to include all counties
Age_Race = county_list.join(Age_Race, how='left')
#Setting the total population for each county
Age_Race['Total population'] = list(Total_Pop)
#Dividing the entires by population
Age_Race = Age_Race.astype(str).astype(float)
Age_Race = Age_Race.divide(Age_Race['Total population'], axis=0).reset_index()

#Going through each column of Age_Race to fill NaNs
cols = list(Age_Race.columns)[2:]
for col in cols:
    Age_Race = fillfixed(col, Age_Race, 'fips', neighborcounties)
Age_Race = Age_Race.dropna()

#I Ran this halfway and then stopped

2383
9.236740350723267
1758
6.656149625778198
1242


KeyboardInterrupt: 

In [11]:
Age_Race.head

<bound method NDFrame.head of        fips  Total population      Male    Female  Under 5 years  \
0      1001               1.0  0.482249       NaN            NaN   
1      1003               1.0  0.518475  0.553624       0.053491   
2      1005               1.0       NaN       NaN            NaN   
3      1007               1.0  0.489091       NaN            NaN   
4      1009               1.0  0.492673       NaN            NaN   
...     ...               ...       ...       ...            ...   
3213  72145               1.0       NaN       NaN            NaN   
3214  72147               1.0       NaN       NaN            NaN   
3215  72149               1.0       NaN       NaN            NaN   
3216  72151               1.0       NaN       NaN            NaN   
3217  72153               1.0       NaN       NaN            NaN   

      5 to 9 years  20 to 24 years  25 to 34 years  35 to 44 years  \
0              NaN             NaN             NaN             NaN   
1         0.0

In [None]:
Age_Race.to_csv('Age_Race_Filled.csv')

In [None]:
cols = list(Google.columns)[2:]

In [None]:
Google = fillgoogle(cols[0], Google)
Google

In [None]:
Google = fillgoogle(cols[1], Google)
Google

In [None]:
Google = fillgoogle(cols[2], Google)
Google

In [None]:
Google = fillgoogle(cols[3], Google)
Google

In [None]:
Google = fillgoogle(cols[4], Google)
Google

In [None]:
Google = fillgoogle(cols[5], Google)
Google

In [None]:
Google = Google.fillna(0)
Google.to_csv('google_new_filled.csv')