In [1]:
import os
import pandas as pd
import math
import numpy as np
import us
import uszipcode as uszc
import difflib
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
geolocator = Nominatim(user_agent='trusch')

# find county functions:
- from zip-code
- from nearest town and US-state

In [2]:
def find_counties_zc(zipcode):
    search = uszc.SearchEngine()
    counties = ""

    loc_data = search.by_zipcode(zipcode)

    if bool(loc_data) == False:
        return counties
    else:
        counties += loc_data.county
    
    return counties

def find_counties_cs(city, state):
    search = uszc.SearchEngine(simple_zipcode=False)
    counties = ""
    
    loc_data = search.by_city_and_state(city, state)
    
    if len(loc_data) > 1:
        for loc in loc_data:
            if loc.county not in counties:
                counties+= loc.county + ", "
    elif len(loc_data) == 1:
        counties = loc_data[0].county
    counties = counties.rstrip(", ")
    
    return counties

# set path, load data

In [11]:
# box dir
in_path = os.path.expanduser(os.path.join('~', 'Box', 'COVID-19 Adolphs Lab', 'core_analysis', 'raw_data'))
out_path = os.path.expanduser(os.path.join('~', 'Box', 'COVID-19 Adolphs Lab', 'core_analysis', 'processed_data_tr'))

#importing location data
location_data = pd.read_csv(os.path.join(in_path, "location_data_w1-16_A-M.csv"))

# select waves -> drop Conte and 15b
include_w = []
for w in range(1,17): 
    include_w.append(str(w))
location_data = location_data.loc[location_data.wave.isin(include_w),: ]


# check and clean free lkeyboard responses (nearest town & zip-code)

In [12]:
location_data.loc[location_data.PROLIFIC_PID == '55bb74f1fdf99b1519ef4762',:]

Unnamed: 0.1,Unnamed: 0,PROLIFIC_PID,wave,zip_code,moved,state,nearest_town,loc_description
16,16,55bb74f1fdf99b1519ef4762,1,,,Minnesota,,countryside_near_town
3555,3555,55bb74f1fdf99b1519ef4762,3,,No,,grand rapids,
6184,6184,55bb74f1fdf99b1519ef4762,5,,Yes,Minnesota,Grand rapids,countryside_remote
7466,7466,55bb74f1fdf99b1519ef4762,6,,No,,"Grand Rapids, MN",
9974,9974,55bb74f1fdf99b1519ef4762,8,55744.0,No,,"grand rapids, MN",
11202,11439,55bb74f1fdf99b1519ef4762,9,55744.0,No,,"gRAND rAPIDS, mn",
12783,13020,55bb74f1fdf99b1519ef4762,10,,No,,"grand rapids, mn",
14344,14706,55bb74f1fdf99b1519ef4762,12,,No,,grand rapids mn,
15139,15536,55bb74f1fdf99b1519ef4762,13,,No,,"grand rapids, mn",
15837,16277,55bb74f1fdf99b1519ef4762,14,,No,,"Grand Rapids, MN",


zipcodes: 5 digits?

In [4]:
# validity of provided zip-codes -> do they have 5 digits?
bad_zip = []
zip_codes = list(location_data.zip_code.dropna().unique())
for i_zip in zip_codes:
    if len(i_zip) != 5:
        bad_zip.append(i_zip)
location_data.loc[location_data.zip_code.isin(bad_zip), 'zip_code'] = np.nan
bad_zip = pd.DataFrame(bad_zip, columns = ['bad_zip'])
bad_zip.to_csv(os.path.join(out_path, 'bad_zip_codes.csv'), index = False)

nearest town names: lowercase and strip state from string

In [5]:
# lowercase twon and state
location_data['nearest_town'] = location_data['nearest_town'].str.lower()
location_data['nearest_town'] = location_data['nearest_town'].str.strip()
location_data['state'] = location_data['state'].str.lower()

# drop if state information for nearest_town add with ",", e.g., "Atlanta, GA"
nearest_town_df = location_data.dropna(subset=['nearest_town'])
for idx in nearest_town_df.index:
    tmp_str = nearest_town_df.loc[idx, ['nearest_town']].str.split(',')
    location_data.loc[idx, ['nearest_town']] = tmp_str[0][0]


replace manually identified typos/inaccuracies in town responses and zip-codes

In [6]:
# likely typos/ spelling variations/ comments/ etc. in town names
location_data.nearest_town = location_data.nearest_town.replace({'la': 'los angeles',
                                                                'sf': 'san francisco',
                                                                'nyc': 'new york city',
                                                                'ny': 'new york',
                                                                'new york city': 'new york',
                                                                'manhattan': 'new york',                              
                                                                'sparta township': 'sparta',
                                                                'rock island illinois': 'rock island',
                                                                'sedro-woolley': 'sedro woolley',
                                                                'denhver': 'denver',
                                                                'upper allen': 'harrisburg',
                                                                'traverse city': 'fife lake',
                                                                'north hollywood': 'los angeles',
                                                                'phoenix arizona': 'phoenix',
                                                                'omaha ne': 'omaha',
                                                                'sorrento/mt plymouth': 'sorrento',
                                                                'mt plymouth/sorrento': 'sorrento',
                                                                'sorrento fl': 'sorrento',
                                                                'sorrent0/mt plymouth': 'sorrento',
                                                                'fort collins (colorado)': 'fort collins',
                                                                'barre': 'barre city',
                                                                'lake orion': 'oxford',
                                                                'plymouth meeting': 'plymouth',
                                                                'everett washington': 'everett',
                                                                'cranberry twp': 'cranberry township',
                                                                'glendale arizona': 'glendale',
                                                                'beverly (boston)': 'beverly',
                                                                'calera/birmingham': 'calera',
                                                                'charolotte': 'charlotte',
                                                                'glendale/phoenix': 'glendale',
                                                                'central point or medford': 'medford',
                                                                'lewistown/moore': 'moore',
                                                                'i live in green bay': 'green bay',
                                                                'ventura?': 'ventura',
                                                                'arlington ma': 'arlington',
                                                                "san jose (still! - since i said i hadn't moved since last week": 'san jose',
                                                                'campell':'campbell',
                                                                'salem oregon':'salem',
                                                                'salem or':'salem',
                                                                'phila': 'philadelphia',
                                                                'chicopee / springfield': 'chicopee',
                                                                'prescott az': 'prescott',
                                                                'coral springs': 'pompano',
                                                                'grand rapids mn': 'grand rapids',
                                                                'saint louis city': 'saint louis',
                                                                'yazoo city': 'yazoo',
                                                                'louisvillr': 'louisville',
                                                                ' spokane':'spokane',
                                                                'shallote':'shallotte',
                                                                'albany ny':'albany',
                                                                'geneva ny':'geneva',
                                                                'fairbaks':'fairbanks',
                                                                'cincinnati':'cincinnatti',
                                                                'mechanicsburg/upper allen':'mechanicsburg',
                                                                'cleveland oh':'cleveland',
                                                                'mumee':'maumee',
                                                                'boca raton and delray': 'delray',
                                                                'delray and boca raton': 'delray',
                                                                'delray and boca': 'delray',
                                                                'boca raton': 'delray',
                                                                'graham wa': 'graham',
                                                                'panama city fl':'panama city',
                                                                'south beloit':'beloit',
                                                                'rockford il':'rockford',
                                                                'graham tx':'graham',
                                                                ' carlsbad':'carlsbad', 
                                                                'covington ky':'covington', 
                                                                'kigston':'kingston',
                                                                'coconut creek':'pompano',                             
                                                                'ben hill': 'fitzgerald',
                                                                'margate':'pompano',
                                                                'parma': 'parma heights',
                                                                'parma heights oh': 'parma heights',
                                                                'parma height ohio': 'parma heights',
                                                                'parma hts': 'parma heights',
                                                                'allenton': 'Lumberton'})
# likely typo: -/0
location_data.zip_code = location_data.zip_code.replace({'75-63': '75063'})

# likely mistake in state-drop down menu
location_data.loc[location_data.PROLIFIC_PID == '5deb05909f631c23193bbffd', 'state'] = 'washington'


1. further town spell checks: 
- additional spell checks using string-sequence matching (difflib.SequenceMatcher >= 0.7)

2. location-description and state expansion
- state of residence and location-description where only collected in wave 1 and if subjects indicated a move, thus needs to be copied from w1 or "move" waves 


In [7]:
county_data = pd.DataFrame()
# loop subjects
for pid in location_data.PROLIFIC_PID.unique():
    pid_idx = location_data.index[location_data.PROLIFIC_PID == pid]
    tmp_df = location_data.loc[pid_idx].copy().reset_index(drop = True)
    location_data.drop(pid_idx, inplace = True)
    location_data.reset_index(drop = True, inplace = True)

    # if more than 1 wave
    if max(tmp_df.wave.astype(int))> 1:
    
    # no move indicated -> continue
        if all(tmp_df.moved != 'Yes'):
            tmp_df['state'] = tmp_df.loc[0, 'state'] # copy W1 state
            tmp_df['loc_description']= tmp_df.loc[0, 'loc_description'] # copy W1 location description 
            tmp_df.loc[0,'nearest_town'] = tmp_df.loc[1,'nearest_town'] # copy W2 town to W1 


            # nearest town info. not identical across all waves
            if len(tmp_df['nearest_town'].dropna().unique()) > 1:
                ### SPELL CORRECTION ####
                # check for small typos (2 and 3 diffent spellings); if more than 3 --> needs more cleaning (manual)
                nearest_towns = tmp_df['nearest_town'].dropna().unique()  
                # two unique resp.
                if len(nearest_towns) == 2: 
                    # between string var small --> same response
                    if difflib.SequenceMatcher(None, nearest_towns[0], nearest_towns[1]).ratio()> 0.7:
                        # take most frequent spelling
                        if sum(tmp_df['nearest_town'] ==  nearest_towns[0])> sum(tmp_df['nearest_town'] ==  nearest_towns[1]):
                            tmp_df['nearest_town'] = nearest_towns[0]
                        else:
                            tmp_df['nearest_town'] = nearest_towns[1]

                # three unique resp.
                elif len(nearest_towns) == 3:
                    diff_r1 = difflib.SequenceMatcher(None, nearest_towns[0], nearest_towns[1]).ratio()> 0.7
                    diff_r2 = difflib.SequenceMatcher(None, nearest_towns[0], nearest_towns[2]).ratio()> 0.7
                    diff_r3 = difflib.SequenceMatcher(None, nearest_towns[1], nearest_towns[2]).ratio()> 0.7
                    # between string var small --> same response
                    if diff_r1 & diff_r2 & diff_r3:
                        sum_t1 = sum(tmp_df['nearest_town'] ==  nearest_towns[0])
                        sum_t2 = sum(tmp_df['nearest_town'] ==  nearest_towns[1])
                        sum_t3 = sum(tmp_df['nearest_town'] ==  nearest_towns[2])
                        # take most frequent spelling
                        if (sum_t1 > sum_t2) & (sum_t1 > sum_t3):
                            tmp_df['nearest_town'] = nearest_towns[0]
                        elif (sum_t2 > sum_t1) & (sum_t2 > sum_t3):
                            tmp_df['nearest_town'] = nearest_towns[1]
                        elif (sum_t3 > sum_t1) & (sum_t3 > sum_t2):
                            tmp_df['nearest_town'] = nearest_towns[2]
                            
                            
        if any(tmp_df.moved == 'Yes'):
            
            # copy nearest town to W1
            if tmp_df.loc[1,'moved'] == 'No':          
                tmp_df.loc[0,'nearest_town'] = tmp_df.loc[1,'nearest_town'] # copy W2 town to W1 

            # copy state and location description within no-move-period    
            start_idx = 0
            for moved_idx in tmp_df.index[tmp_df.moved == 'Yes']:
                tmp_df.loc[start_idx:moved_idx-1,'state'] = tmp_df.loc[start_idx,'state']
                tmp_df.loc[start_idx:moved_idx-1,'loc_description'] = tmp_df.loc[start_idx,'loc_description']
                start_idx = moved_idx
            tmp_df.loc[start_idx:,'state'] = tmp_df.loc[start_idx,'state']
            tmp_df.loc[start_idx:,'loc_description'] = tmp_df.loc[start_idx,'loc_description']
                
                
    county_data = county_data.append(tmp_df)
    county_data.reset_index(drop = True, inplace = True)
    


# define counties (from zip; from town & state)

In [8]:
from tqdm import tqdm
####  ZIP COUNTIES ####
for zip_code in tqdm(county_data.zip_code.unique()):
    county_zip = find_counties_zc(zip_code)
    county_data.loc[county_data.zip_code == zip_code, 'county_by_zip'] = county_zip

#### STATE_TOWN COUNTIES ####
county_data['state_town'] = county_data['state'] +'__'+ county_data['nearest_town']
for st in tqdm(county_data['state_town'].unique()):
    if type(st) == str:
        state = st.split('__')[0]
        town = st.split('__')[1]
        county_town = find_counties_cs(town, state)
        county_data.loc[county_data.state_town == st, 'county_by_city_and_state'] = county_town
# drop town-state-string combination
county_data.drop(['state_town'], axis=1 ,inplace=True)

#### COMBINE ZIP AND STATE_TOWN COUNTIES ####
county_data['county_combined'] = county_data['county_by_city_and_state']
zip_county_idx = county_data.index[~county_data.county_by_zip.isna()]
county_data.loc[zip_county_idx, 'county_combined'] = county_data.loc[zip_county_idx, 'county_by_zip']

county_data.to_csv(os.path.join(out_path,'county_data_raw.csv'))

100%|██████████| 1130/1130 [00:05<00:00, 206.68it/s]
100%|██████████| 1287/1287 [11:05<00:00,  1.93it/s]


In [9]:
county_data.loc[county_data.PROLIFIC_PID == '5e6db3f02649e03834220ef0',:]

Unnamed: 0.1,Unnamed: 0,PROLIFIC_PID,wave,zip_code,moved,state,nearest_town,loc_description,county_by_zip,county_by_city_and_state,county_combined
11458,1087,5e6db3f02649e03834220ef0,1,,,virginia,roanoke,suburb_small_city,,Roanoke city,Roanoke city
11459,1949,5e6db3f02649e03834220ef0,2,,No,virginia,roanoke,suburb_small_city,,Roanoke city,Roanoke city
11460,4048,5e6db3f02649e03834220ef0,3,,Yes,virginia,blacksburg,middle_small_city,,Montgomery County,Montgomery County
11461,5249,5e6db3f02649e03834220ef0,4,,No,virginia,blacksburg,middle_small_city,,Montgomery County,Montgomery County
11462,7322,5e6db3f02649e03834220ef0,5,,No,virginia,roanoke,middle_small_city,,Roanoke city,Roanoke city
11463,9689,5e6db3f02649e03834220ef0,7,,No,virginia,blacksburg,middle_small_city,,Montgomery County,Montgomery County


In [10]:
##############INSPECT######################
# county_data.loc[(county_data.PROLIFIC_PID == '5ca17355f1e809001169e3c8')& (county_data.wave>'8'), 'county_combined'] = 'Forsyth County'
# county_data.loc[(county_data.PROLIFIC_PID == '5e6db3f02649e03834220ef0')& (county_data.wave>'2'), 'county_combined'] = 'Montgomery County'
#county_data.loc[(county_data.PROLIFIC_PID == '5e6db3f02649e03834220ef0')& (county_data.wave>'1'), 'county_combined'] = 'Palm Beach County'

# county_data.loc[(county_data.nearest_town == 'woodbury') & (county_data.state == 'minnesota'), 'county_by_city_and_state'] = 'Washington County'      
# county_data.loc[(county_data.nearest_town == 'mountain view') & (county_data.state == 'wyoming'), 'county_by_city_and_state'] = 'Uinta County'      
# county_data.loc[(county_data.nearest_town == 'ontario') & (county_data.state == 'ohio'), 'county_by_city_and_state'] = 'Richland County'

# pid_idx = county_data.index[county_data.PROLIFIC_PID == set_man_9[0]]
# tmp_df = county_data.loc[pid_idx].copy().reset_index(drop = True)
# tmp_df.flag = np.nan
# tmp_df
list(tmp_df['county_combined'])

KeyError: 'county_combined'

In [None]:
# county_data.loc[(county_data.PROLIFIC_PID == '5e706891c396cc64388ef760')& (county_data.wave<'11'), 'county_combined'] = 'Washington County'
county_data.loc[county_data.PROLIFIC_PID == '55b3120cfdf99b6ef6263a71', 'county_combined'] = 'Pasco County'
# county_data.loc[(county_data.PROLIFIC_PID == '5706fb93914b7100093b7925')& (county_data.wave=='16'), 'county_combined'] = 'Will County'
# county_data.loc[(county_data.PROLIFIC_PID == '572d55e1109ab600105176ba')& (county_data.wave=='17'), 'moved'] = 'Yes'
county_data.loc[county_data.PROLIFIC_PID == '586372cae16d530001b345b8', 'county_combined'] = 'Bannock County'
county_data.loc[county_data.PROLIFIC_PID == '59235471e937bd0001ca0245', 'county_combined'] = 'Liberty County'
# county_data.loc[(county_data.PROLIFIC_PID == '594425f07ccfd00001d3f462')& (county_data.wave>'2'), 'county_combined'] = 'Cobb County'
# county_data.loc[county_data.PROLIFIC_PID == '5aa117356475f90001a02fd6', 'county_combined'] = 'Bannock County'
# county_data.loc[(county_data.PROLIFIC_PID == '5acc2d21436f550001003111')& (county_data.wave>'3'), 'county_combined'] = 'Arapahoe County'
# county_data.loc[county_data.PROLIFIC_PID == '5ae2641f5cd05500015ece71', 'county_combined'] = 'Delaware County'
# county_data.loc[(county_data.PROLIFIC_PID == '5b5b715192d8320001510a11')& (county_data.wave=='17'), 'moved'] = 'Yes'
# county_data.loc[county_data.PROLIFIC_PID == '5b9efb73d1ff7a00013bfe84', 'county_combined'] = 'Cumberland County'   
# county_data.loc[(county_data.PROLIFIC_PID == '5bd7ef36e4d5b00001a9b99e')& (county_data.wave=='14'), 'moved'] = 'Yes'
# county_data.loc[county_data.PROLIFIC_PID == '5bdc96a7ba6cc10001854ac8', 'county_combined'] = 'Orange County'
county_data.loc[county_data.PROLIFIC_PID == '5c6fb7c3c114eb00018b3154', 'county_combined'] = 'Bladen County'
# county_data.loc[(county_data.PROLIFIC_PID == '5c71bdec87f8cd0001b458f5')& (county_data.wave=='17'), 'moved'] = 'Yes'
# county_data.loc[(county_data.PROLIFIC_PID == '5caf69d015e92c00131700a8')& (county_data.wave>'7'), 'county_combined'] = 'Tarrant County'
# county_data.loc[county_data.PROLIFIC_PID == '5cbc071f2edc3b00018ab097', 'county_combined'] = 'Alameda County'
# county_data.loc[county_data.PROLIFIC_PID == '5cbcfce214b3cb0015f03471', 'county_combined'] = 'Shelby County'
county_data.loc[(county_data.PROLIFIC_PID == '5cbe04b4f429ff00159de30e')& (county_data.wave=='16'), 'moved'] = 'Yes'
county_data.loc[(county_data.PROLIFIC_PID == '5cbf5b488be75a0016b49758')& (county_data.wave=='15'), 'moved'] = 'Yes'
# county_data.loc[county_data.PROLIFIC_PID == '5cfea2427b91e70001423ffb', 'county_combined'] = 'Muskegon County'
# county_data.loc[(county_data.PROLIFIC_PID == '5d26222d2cbf2c00170c5ff9')& (county_data.wave>'1'), 'county_combined'] = 'Contra Costa County'
# county_data.loc[county_data.PROLIFIC_PID == '5d277c0e28429200011c4fda', 'county_combined'] = 'Williamson County'
# county_data.loc[(county_data.PROLIFIC_PID == '5d4a1c61be2a040001beaedb')& (county_data.wave>'7'), 'county_combined'] = 'Clark County'
# county_data.loc[county_data.PROLIFIC_PID == '5d8934dc4c680f00181616f8', 'county_combined'] = 'Raleigh County'   
# county_data.loc[(county_data.PROLIFIC_PID == '5d9e363c25ceaa0012574295')& (county_data.wave=='17'), 'moved'] = 'Yes'
# county_data.loc[county_data.PROLIFIC_PID == '5db741cc04b632000a9a9800', 'county_combined'] = 'Cuyahoga County'
# county_data.loc[county_data.PROLIFIC_PID == '5dcca2db88381e92c96b9ac5', 'county_combined'] = 'Kane County'
# county_data.loc[(county_data.PROLIFIC_PID == '5dd0c43acaee3e187643d32b')& (county_data.wave>'13'), 'county_combined'] = 'Mahoning County'
# county_data.loc[county_data.PROLIFIC_PID == '5de17df32cf2041dc1c2d8ad', 'county_combined'] = 'Queens County'
# county_data.loc[county_data.PROLIFIC_PID == '5de59f97e448c65315ec4a37', 'county_combined'] = 'Denali Borough'
# county_data.loc[county_data.PROLIFIC_PID == '5e0378a7b199e4e7e6ccd8e9', 'county_combined'] = 'Kings County'   
# county_data.loc[(county_data.PROLIFIC_PID == '5e0583957dd477fb29fae7dd')& (county_data.wave=='16'), 'moved'] = 'Yes'
# county_data.loc[(county_data.PROLIFIC_PID == '5e0cd34376f45046333a5b87')& (county_data.wave=='17'), 'moved'] = 'Yes'
# county_data.loc[county_data.PROLIFIC_PID == '5e67b8c9dfa1b705a81d59f5', 'county_combined'] = 'Sevier County'
# county_data.loc[county_data.PROLIFIC_PID == '5e78d6a0bc70c94664aae415', 'county_combined'] = 'Milwaukee County'
county_data.loc[county_data.PROLIFIC_PID == '5bdd368719b9ba000157fb12', 'county_combined'] = 'Tulare County'
county_data.loc[county_data.PROLIFIC_PID == '5db4a409d27e05000f0eaa00', 'county_combined'] = 'Wilcox County'
county_data.loc[county_data.PROLIFIC_PID == '5dd406d9c23e0d3e6dacc034', 'county_combined'] = 'Delaware County'
county_data.loc[county_data.PROLIFIC_PID == '5e2298499138e055cfd97ae0', 'county_combined'] = 'Hampden County'
county_data.loc[county_data.PROLIFIC_PID == '5dd35aea9b6104366fef1a90', 'county_combined'] = 'Ross County'
county_data.loc[county_data.PROLIFIC_PID == '5e6a418620befe00096983d2', 'county_combined'] = 'Monmouth County'
county_data.loc[county_data.PROLIFIC_PID == '5c1c137b0739430001693cf5', 'county_combined'] = 'St. Johns County'
county_data.loc[county_data.PROLIFIC_PID == '5e33272d78e97401e4944570', 'county_combined'] = 'Leflore County'
county_data.loc[county_data.PROLIFIC_PID == '5c365b5e8821900001b3876c', 'county_combined'] = 'Orange County'
county_data.loc[county_data.PROLIFIC_PID == '5dd43fb9572604418c5fb05c', 'county_combined'] = 'Christian County'
county_data.loc[county_data.PROLIFIC_PID == '5bd45c509afb5b00015c48f4', 'county_combined'] = 'Bronx County'
county_data.loc[county_data.PROLIFIC_PID == '5c27b2157e42a600015bf8ef', 'county_combined'] = 'Bronx County'
county_data.loc[county_data.PROLIFIC_PID == '5dd9ebd239bca1962e17e64c', 'county_combined'] = 'Kaufman County'
county_data.loc[county_data.PROLIFIC_PID == '5e2721b01160079464059b5c', 'county_combined'] = 'Robeson County'
county_data.loc[county_data.PROLIFIC_PID == '5e52148e9a302e245eb71caf', 'county_combined'] = 'Los Angeles County'
county_data.loc[county_data.PROLIFIC_PID == '5c567e2acc53750001751cb0', 'county_combined'] = 'Sacramento County'
county_data.loc[county_data.PROLIFIC_PID == '5d391aa474651f0017882b6b', 'county_combined'] = 'Pottawattamie County'
county_data.loc[county_data.PROLIFIC_PID == '5ca6bbf13b5fcf00100996e9', 'county_combined'] = 'Polk County'
county_data.loc[county_data.PROLIFIC_PID == '5e470b9563f714033f4b7a7f', 'county_combined'] = 'Bristol County'
county_data.loc[county_data.PROLIFIC_PID == '5a8b61fcf1408d000176b02f', 'county_combined'] = 'Coryell County'
# county_data.loc[county_data.PROLIFIC_PID == '5c756f32c3c75a0001334269', 'county_combined'] = 'Brown County'
county_data.loc[county_data.PROLIFIC_PID == '5758a211cb13810006479a0d', 'county_combined'] = 'Washington County'
county_data.loc[county_data.PROLIFIC_PID == '5d26aaf4665b0a00174b8f03', 'county_combined'] = 'Lake County'
# county_data.loc[county_data.PROLIFIC_PID == '5e7691c0629b77207b16964a', 'county_combined'] = 'Alameda County'
# county_data.loc[county_data.PROLIFIC_PID == '5be369b28bc5d60001da2c52', 'county_combined'] = 'Montgomery County'
county_data.loc[county_data.PROLIFIC_PID == '5cfb286697fb270001192074', 'county_combined'] = 'Allegheny County'
county_data.loc[county_data.PROLIFIC_PID == '5e35759f1c0a77521b3e4e7a', 'county_combined'] = 'Dallas County'   
county_data.loc[county_data.PROLIFIC_PID == '5b9b4cc406114100010660f0', 'county_combined'] = 'Philadelphia County'   
county_data.loc[county_data.PROLIFIC_PID == '5e7e46bfa10d910843e02743', 'county_combined'] = 'Kings County'   
county_data.loc[county_data.PROLIFIC_PID == '5c2d2d6cc8ebae0001a41397', 'county_combined'] = 'Kings County'   
county_data.loc[county_data.PROLIFIC_PID == '5c1c74700036a80001193c24', 'county_combined'] = 'Riverside County'   
county_data.loc[county_data.PROLIFIC_PID == '5bf5aa89d944c300012634cc', 'county_combined'] = 'Bronx County'   
county_data.loc[county_data.PROLIFIC_PID == '57dd766bbcd7150001ded5de', 'county_combined'] = 'Montgomery County'   
county_data.loc[county_data.PROLIFIC_PID == '562e4469733ea00005163785', 'county_combined'] = 'Hawaii County'   
county_data.loc[county_data.PROLIFIC_PID == '5e7fbf90e06daa01664bc717', 'county_combined'] = 'Ingham County'
county_data.loc[county_data.PROLIFIC_PID == '5c670a430d80fd00014264f9', 'county_combined'] = 'Kings County'   
county_data.loc[county_data.PROLIFIC_PID == '59c843b646f72100019067ce', 'county_combined'] = 'Dallas County'   
county_data.loc[county_data.PROLIFIC_PID == '5dbdbc9da319ab2ecf2a0887', 'county_combined'] = 'Elmore County'   
county_data.loc[county_data.PROLIFIC_PID == '5bf9bd81549c180001dab194', 'county_combined'] = 'Orange County'   
county_data.loc[county_data.PROLIFIC_PID == '5e415a5f9dd805382f0b6af9', 'county_combined'] = 'Queens County'   
county_data.loc[county_data.PROLIFIC_PID == '5e498a4fd2a6e53fb7b1840e', 'county_combined'] = 'New York County'   
county_data.loc[county_data.PROLIFIC_PID == '5dc8d871031ed662f4e63291', 'county_combined'] = 'Bernalillo County'   
county_data.loc[county_data.PROLIFIC_PID == '5e5ee89299b03c22a97ff062', 'county_combined'] = 'Queens County'   
county_data.loc[county_data.PROLIFIC_PID == '5d62239454f7a50001c70f03', 'county_combined'] = 'Tarrant County'   
county_data.loc[county_data.PROLIFIC_PID == '5e0d6dee4f40fb4f60fc0494', 'county_combined'] = "Prince George's County"
county_data.loc[county_data.PROLIFIC_PID == '5dd31d3216d10c341d2d2c7e', 'county_combined'] = 'Rankin County'
county_data.loc[county_data.PROLIFIC_PID == '5e7d280b351b5c0d4a41d376', 'county_combined'] = 'Kings County'
county_data.loc[county_data.PROLIFIC_PID == '5e77dd22a02ca535d408b61f', 'county_combined'] = 'Middlesex County'
county_data.loc[county_data.PROLIFIC_PID == '5afdde7e59ae1e00017e6737', 'county_combined'] = 'Monmouth County'
county_data.loc[county_data.PROLIFIC_PID == '5d0eb8f40c09d600174dca28', 'county_combined'] = 'Hamilton County'
county_data.loc[county_data.PROLIFIC_PID == '5c313f27867f660001b0601c', 'county_combined'] = 'San Diego County'
county_data.loc[county_data.PROLIFIC_PID == '5e770f11a9d3b527c662aa23', 'county_combined'] = 'Bergen County'
county_data.loc[county_data.PROLIFIC_PID == '55883b3efdf99b4021921cb9', 'county_combined'] = 'Monroe County'
county_data.loc[county_data.PROLIFIC_PID == '549ea92efdf99b4ffa5fa208', 'county_combined'] = 'Emmet County'
county_data.loc[county_data.PROLIFIC_PID == '559b1e92fdf99b426abb6227', 'county_combined'] = 'Norfolk County'
county_data.loc[(county_data.PROLIFIC_PID == '5d620e2c81e64c001a580927')& (county_data.wave=='15'), 'moved'] = 'Yes'
county_data.loc[(county_data.PROLIFIC_PID == '5e519c1b7243ea1d0aa361d6')& (county_data.wave=='16'), 'moved'] = 'Yes'
# county_data.loc[(county_data.PROLIFIC_PID == '5dd0c43acaee3e187643d32b')& (county_data.wave=='15'), 'moved'] = 'Yes'
county_data.loc[county_data.PROLIFIC_PID == '5daa18ed95be5f00133609e8', 'county_combined'] =  'Knox County'
county_data.loc[county_data.PROLIFIC_PID == '5cb912a2e254c8001754eabd', 'county_combined'] =  'Erie County'
county_data.loc[county_data.PROLIFIC_PID == '5d0eecdd348afe00015865aa', 'county_combined'] = 'Kenosha County'
county_data.loc[county_data.PROLIFIC_PID == '5e5c56b36175cb43f89fe34e', 'county_combined'] = 'Door County'
county_data.loc[county_data.PROLIFIC_PID == '59c0782e5364260001dc4740', 'county_combined'] = 'Yazoo County'
county_data.loc[county_data.PROLIFIC_PID == '5e782c5d99df853a6f1315f4', 'county_combined'] = 'Queens County'
county_data.loc[county_data.PROLIFIC_PID == '5e3d8ef3b9b9e3043d928529', 'county_combined'] = 'Kendall County'
county_data.loc[county_data.PROLIFIC_PID == '5dfc674a9195c6974e597c58', 'county_combined'] = 'Pasquotank County'
county_data.loc[county_data.PROLIFIC_PID == '5e5b190a0f5cbf314bd4ee5d', 'county_combined'] = 'Monroe County'

man_edit_pids = ['55b3120cfdf99b6ef6263a71','586372cae16d530001b345b8','59235471e937bd0001ca0245','5c6fb7c3c114eb00018b3154','5cbe04b4f429ff00159de30e','5cbf5b488be75a0016b49758',
                '5bdd368719b9ba000157fb12','5db4a409d27e05000f0eaa00','5dd406d9c23e0d3e6dacc034','5e2298499138e055cfd97ae0','5dd35aea9b6104366fef1a90','5e6a418620befe00096983d2',
                '5c1c137b0739430001693cf5','5e33272d78e97401e4944570','5c365b5e8821900001b3876c','5dd43fb9572604418c5fb05c','5bd45c509afb5b00015c48f4','5c27b2157e42a600015bf8ef',
                '5dd9ebd239bca1962e17e64c','5e2721b01160079464059b5c','5e52148e9a302e245eb71caf','5c567e2acc53750001751cb0','5d391aa474651f0017882b6b','5ca6bbf13b5fcf00100996e9',
                '5e470b9563f714033f4b7a7f','5a8b61fcf1408d000176b02f','5758a211cb13810006479a0d','5d26aaf4665b0a00174b8f03','5cfb286697fb270001192074','5e35759f1c0a77521b3e4e7a',
                '5b9b4cc406114100010660f0','5e7e46bfa10d910843e02743','5c2d2d6cc8ebae0001a41397','5c1c74700036a80001193c24','5bf5aa89d944c300012634cc','5bf5aa89d944c300012634cc',
                '57dd766bbcd7150001ded5de','562e4469733ea00005163785','5e7fbf90e06daa01664bc717','5c670a430d80fd00014264f9','59c843b646f72100019067ce','5dbdbc9da319ab2ecf2a0887',
                '5bf9bd81549c180001dab194','5e415a5f9dd805382f0b6af9','5e498a4fd2a6e53fb7b1840e','5dc8d871031ed662f4e63291','5e5ee89299b03c22a97ff062','5d62239454f7a50001c70f03',
                '5e0d6dee4f40fb4f60fc0494','5dd31d3216d10c341d2d2c7e','5e7d280b351b5c0d4a41d376','5e77dd22a02ca535d408b61f','5afdde7e59ae1e00017e6737','5d0eb8f40c09d600174dca28',
                '5c313f27867f660001b0601c','5e770f11a9d3b527c662aa23','55883b3efdf99b4021921cb9','549ea92efdf99b4ffa5fa208','559b1e92fdf99b426abb6227','5d620e2c81e64c001a580927',
                '5e519c1b7243ea1d0aa361d6','5daa18ed95be5f00133609e8','5cb912a2e254c8001754eabd','5d0eecdd348afe00015865aa','5e5c56b36175cb43f89fe34e','59c0782e5364260001dc4740',
                '5e782c5d99df853a6f1315f4','5e3d8ef3b9b9e3043d928529','5dfc674a9195c6974e597c58','5e5b190a0f5cbf314bd4ee5d']


# manual county assignments follwing visual inspections of county data:
commen issues:
- uszc.SearchEngine cound not identify a county based on town and state (town very small, closest town in different state, ...)
- mismatch in counties due to differnt town listed but listed towns are in close proximity to each other, contain small (likely closer towns) and colsest big city
- move question ('have you moved since teh last wave') likely answered incorrectly, e.g., sudden but consistent change in closest town but no move indicated
- ....

# final data flags:
- flag = 0 --> no isssues
- flag = 1 --> inconststent counties but listed towns towns less than 50km appart
- flag = 2 --> manually adapted after visual inspection 
- flag = 9 --> noisy/ incomplete/ indistinct data, use with caution!

In [None]:

    print('yep')
else:
    print('nope')


In [None]:
for pid in ['5e605d713d60473af936cc65']:
    pid_idx = county_data.index[county_data.PROLIFIC_PID == pid]
    tmp_df = county_data.loc[pid_idx].copy().reset_index(drop = True)
    tmp_df.flag = np.nan
    
start_idx = 0
for moved_idx in tmp_df.index[tmp_df.moved == 'Yes']:
            
        if:
            # all counties identical in move period
            if len(tmp_df.loc[15,'county_by_city_and_state'].unique())==1: 
                tmp_df.loc[start_idx:moved_idx-1,'flag'] = 0
            elif len(tmp_df.loc[start_idx:moved_idx-1,'county_combined'].unique())==1:
                tmp_df.loc[start_idx:moved_idx-1,'flag'] = 0
            
            # more than one county in move period
            # w2-moved = yes --> no town data in w1, considered clean
            elif (len(tmp_df.loc[start_idx:moved_idx-1,'county_combined'])==1) & len(tmp_df.loc[start_idx:moved_idx-1,'wave'] == '1'):
                tmp_df.loc[start_idx:moved_idx-1,'flag'] = 0
            # more than one county and two differnt nearest towns:
            # compute distance between the towns, if the distance is less than 50km --> likely response noise, considered clean
            elif len(tmp_df.loc[start_idx:moved_idx-1,'nearest_town'].unique()) == 2:
                state = tmp_df.loc[start_idx:moved_idx-1,'state'].unique()
                towns = tmp_df.loc[start_idx:moved_idx-1,'nearest_town'].unique()
                long_lat = []
                town_dist = np.inf 
                for town in towns: 
                    try:
                        tmp_lang_lat = geolocator.geocode(state + ' ' + town)
                    except:
                        print(pid)
                    if tmp_lang_lat:
                        long_lat.append(tmp_lang_lat[-1])
                if len(long_lat) == 2:  
                    town_dist = geodesic(long_lat[0], long_lat[1])
                if (town_dist <=100) & (len(tmp_df.loc[start_idx:moved_idx-1,'loc_description'].unique()) == 1):
                    tmp_df.loc[start_idx:moved_idx-1,'flag'] = 1
            else:
                tmp_df.loc[start_idx:moved_idx-1,'flag'] = 9
            start_idx = moved_idx
            
tmp_df

In [None]:
# loop subjects
for pid in tqdm(county_data.PROLIFIC_PID.unique()):
#for pid in ['5e605d713d60473af936cc65']:
    pid_idx = county_data.index[county_data.PROLIFIC_PID == pid]
    tmp_df = county_data.loc[pid_idx].copy().reset_index(drop = True)
    tmp_df.flag = np.nan
    
    # not moved 
    if all(tmp_df.moved !='Yes'):
        # if county string isn't empty
        if tmp_df.county_by_city_and_state.unique()[0]: 
            # all counties identical         
            if len(tmp_df.county_by_city_and_state.unique()) == 1:
                county_data.loc[county_data.PROLIFIC_PID == pid, 'flag'] = 0
            elif len(tmp_df.county_combined.unique()) == 1:
                county_data.loc[county_data.PROLIFIC_PID == pid, 'flag'] = 0

            # more than one county and two differnt nearest towns:
            # compute distance between the towns, if the distance is less than 50km --> likely response noise, considered clean
            elif len(tmp_df.nearest_town.unique()) == 2:
                state = tmp_df.state.unique()
                towns = tmp_df.nearest_town.unique()
                long_lat = []
                town_dist = np.inf 
                for town in towns: 
                    try:
                        tmp_lang_lat = geolocator.geocode(state + ' ' + town)
                    except:
                        print(pid)
                    if tmp_lang_lat:
                        long_lat.append(tmp_lang_lat[-1])
                if len(long_lat) == 2:  
                    town_dist = geodesic(long_lat[0], long_lat[1])
                if (town_dist <=50) & (len(tmp_df.loc_description.unique()) == 1):
                    county_data.loc[county_data.PROLIFIC_PID == pid, 'flag'] = 1
                else:
                    county_data.loc[county_data.PROLIFIC_PID == pid, 'flag'] = 9

            else: 
                county_data.loc[county_data.PROLIFIC_PID == pid, 'flag'] = 9

            # if data is clan,copy zip-code county to all waves
            if all(county_data.loc[county_data.PROLIFIC_PID == pid, 'flag'] == 0):
                zip_county = list(county_data.loc[county_data.PROLIFIC_PID == pid, 'county_by_zip'].dropna())
                if len(zip_county)==1:
                    county_data.loc[county_data.PROLIFIC_PID == pid, 'county_combined'] = zip_county[0]
        # county string is empty
        else:
            county_data.loc[county_data.PROLIFIC_PID == pid, 'flag'] = 9
            

    # moved 
    elif any(tmp_df.moved =='Yes'):
        start_idx = 0
        for moved_idx in tmp_df.index[tmp_df.moved == 'Yes']:
            # county string not empty
            if tmp_df.loc[start_idx:moved_idx-1,'county_by_city_and_state'].unique()[0]: 
                # all counties identical in move period
                if len(tmp_df.loc[start_idx:moved_idx-1,'county_by_city_and_state'].unique())==1:
                    tmp_df.loc[start_idx:moved_idx-1,'flag'] = 0
                elif len(tmp_df.loc[start_idx:moved_idx-1,'county_combined'].unique())==1:
                    tmp_df.loc[start_idx:moved_idx-1,'flag'] = 0

                # more than one county in move period
                # w2-moved = yes --> no town data in w1, considered clean
                elif (len(tmp_df.loc[start_idx:moved_idx-1,'county_combined'])==1) & len(tmp_df.loc[start_idx:moved_idx-1,'wave'] == '1'):
                    tmp_df.loc[start_idx:moved_idx-1,'flag'] = 0
                # more than one county and two differnt nearest towns:
                # compute distance between the towns, if the distance is less than 50km --> likely response noise, considered clean
                elif len(tmp_df.loc[start_idx:moved_idx-1,'nearest_town'].unique()) == 2:
                    state = tmp_df.loc[start_idx:moved_idx-1,'state'].unique()
                    towns = tmp_df.loc[start_idx:moved_idx-1,'nearest_town'].unique()
                    long_lat = []
                    town_dist = np.inf 
                    for town in towns: 
                        try:
                            tmp_lang_lat = geolocator.geocode(state + ' ' + town)
                        except:
                            print(pid)
                        if tmp_lang_lat:
                            long_lat.append(tmp_lang_lat[-1])
                    if len(long_lat) == 2:  
                        town_dist = geodesic(long_lat[0], long_lat[1])
                    if (town_dist <=100) & (len(tmp_df.loc[start_idx:moved_idx-1,'loc_description'].unique()) == 1):
                        tmp_df.loc[start_idx:moved_idx-1,'flag'] = 1
                else:
                    tmp_df.loc[start_idx:moved_idx-1,'flag'] = 9

                if all(tmp_df.loc[start_idx:moved_idx-1,'flag'] == 0) | (all(tmp_df.loc[start_idx:moved_idx-1,'flag'] == 1)):
                    zip_county = list(tmp_df.loc[start_idx:moved_idx-1, 'county_by_zip'].dropna())
                    if len(zip_county)==1:
                        tmp_df.loc[start_idx:moved_idx-1, 'county_combined'] = zip_county[0]
                        
            # county string empty 
            else:
                 tmp_df.loc[start_idx:moved_idx-1,'flag'] = 9
            start_idx = moved_idx
        
        # all counties identical in last move period
        if tmp_df.loc[start_idx:,'county_by_city_and_state'].unique()[0]: 
            if len(tmp_df.loc[start_idx:,'county_by_city_and_state'].dropna().unique())==1:
                tmp_df.loc[start_idx:,'flag'] = 0
            elif len(tmp_df.loc[start_idx:,'county_combined'].dropna().unique())==1:
                tmp_df.loc[start_idx:,'flag'] = 0

            # more than one county in last move period
            # more than one county and two differnt nearest towns:
            # compute distance between the towns, if the distance is less than 50km --> likely response noise, considered clean
            elif len(tmp_df.loc[start_idx:,'nearest_town'].unique()) == 2:
                state = tmp_df.loc[start_idx:,'state'].unique()
                towns = tmp_df.loc[start_idx:,'nearest_town'].unique()
                long_lat = []
                town_dist = np.inf 
                for town in towns: 
                    try:
                        tmp_lang_lat = geolocator.geocode(state + ' ' + town)
                    except:
                        print(pid)
                    if tmp_lang_lat:
                        long_lat.append(tmp_lang_lat[-1])
                if len(long_lat) == 2:  
                    town_dist = geodesic(long_lat[0], long_lat[1])
                if (town_dist <=100) & (len(tmp_df.loc[start_idx:,'loc_description'].unique()) == 1):
                    tmp_df.loc[start_idx:,'flag'] = 1

            else: 
                tmp_df.loc[start_idx:,'flag'] = 9
        # counyt string empty         
        else: 
                tmp_df.loc[start_idx:,'flag'] = 9

        if all(tmp_df.loc[start_idx:moved_idx-1,'flag'] == 0) | (all(tmp_df.loc[start_idx:moved_idx-1,'flag'] == 1)):
                zip_county = list(tmp_df.loc[start_idx:, 'county_by_zip'].dropna())
                if len(zip_county)==1:
                    tmp_df.loc[start_idx:, 'county_combined'] = zip_county[0]
        
        county_data.loc[county_data.PROLIFIC_PID == pid, 'county_combined'] = list(tmp_df['county_combined'])
                               
        # if any move period is flagged the subject is flagged
        if any(tmp_df.flag == 9):
            county_data.loc[county_data.PROLIFIC_PID == pid, 'flag'] = 9
        if any(tmp_df.flag == 1):
            county_data.loc[county_data.PROLIFIC_PID == pid, 'flag'] = 1
        elif all(tmp_df.flag == 0):
            county_data.loc[county_data.PROLIFIC_PID == pid, 'flag'] = 0

# flag manual edits
county_data.loc[county_data.PROLIFIC_PID.isin(man_edit_pids), 'flag'] = 2

# save output
county_data.to_csv(os.path.join(out_path,'county_assigned.csv'))


In [None]:
bad_pids = ['5ced335ede266200161ea979',
'5dd6830d57260460919d9715',
'5c4534332b1bda0001f5791f',
'5dfb87ac265c808aeb82de9a',
'5e865ed591a08928b8616e62',
'5e84f6b23b1cee0ccbec287c',
'5db3b031efa9f6000cf02400',
'5e7f2688ac2c275f03b6ea22',
'5dcec32a367c6d000a4282c2',
'5e80f90be6882f01c5d585f6',
'5a0a4aa0df3fa800015c398e',
'5e217a34e8ff3a469b5dfebc',
'5951c8505f42c10001ff3f93',
'5e6a139a0b0cc30726a72dda',
'583b397f54686200010f1d20',
'5e289d54a06ebe0464cfbe58',
'5de80e0b521f95791f912e49',
'5df82e8739bcde5dd044f741',
'5e784ba802c79b3c2e227bd3',
'5e65dca1d639df34800674a8',
'5c18703fbab95400015bd42c',
'5e703830ffc1a960e852b107',
'5e528b3520410b292bb84dca',
'5e0e632e194d295c13ee5b81',
'5e83b43765a5720e30b71b37',
'5b8375c216aa4400016added',
'5aa637861eda41000136d5d2',
'5df58b28ecb3ea3ed1ceea37',
'5cbf5b488be75a0016b49758',
'567dd32b4f0ef30006dbb718',
'5d3cfeabbaf11000151ed4cd',
'5e77fdfd5369a0382cdaeaa0',
'5e617588e8165506fc39b363',
'5e6155e6f169c0046b6d6518',
'5d3a2d1da46f450019cdf5d3', 
'5c343793ca23620001b2a460',
'5dd5e27fe25acf59990e1adf',
'5d354c9f1c1e8a001e521262',
'5a6f57166923df0001ef8d02',
'5e6be840c715c71aa2645e81',
'5e846b1a3e3e9c0008cb278a',
'5dcf4fb1181abc09aa499cd8',
'5c79a1f254c5d600015fc7af',
'5e25dfb0c4edc4831d8c0e11']

# len(county_data.loc[county_data.flag == 9, 'PROLIFIC_PID'].unique())