In [1]:
import pandas as pd

pii_file = "../original_data/vsv_county_noPII.csv"
df = pd.read_csv(pii_file)

# Keep only the records whose onset year is 1979 or later.
from_1979 = df["ONSET_YEAR"] >= 1979
df_1979_plus = df.loc[from_1979]

df_1979_plus.head()

Unnamed: 0,ONSET_YEAR,ONSET_MONTH,ONSET_DAY,COUNTRY,STATE,COUNTY_MUNI,SEROTYPE,SPECIES
246,1981,9.0,,MEX,JAL,acatlan de juarez,,
247,1981,2.0,,MEX,NAY,ahuacatlan,,
248,1981,11.0,,MEX,JAL,amacueca,,
249,1981,4.0,,MEX,VER,angel r. cabada,,
250,1981,4.0,,MEX,VER,angel r. cabada,,


In [2]:
# Size of the 1979+ subset as (rows, columns).
df_1979_plus.shape

(6530, 8)

In [3]:
# drop the rows in which ONSET_MONTH is nan (rows missing only ONSET_DAY are kept)
df_1979_plus = df_1979_plus[df_1979_plus.ONSET_MONTH.notna()]
df_1979_plus.shape

(6179, 8)

In [4]:
# Keep only the records that name a county/municipality; that name is half of
# the "STATE$county" key used for the code lookup later on.
has_county = df_1979_plus["COUNTY_MUNI"].notna()
df_1979_plus = df_1979_plus.loc[has_county]
df_1979_plus.shape

(5981, 8)

In [5]:
# Show the filtered frame (the rendered output abbreviates the middle rows).
df_1979_plus

Unnamed: 0,ONSET_YEAR,ONSET_MONTH,ONSET_DAY,COUNTRY,STATE,COUNTY_MUNI,SEROTYPE,SPECIES
246,1981,9.0,,MEX,JAL,acatlan de juarez,,
247,1981,2.0,,MEX,NAY,ahuacatlan,,
248,1981,11.0,,MEX,JAL,amacueca,,
249,1981,4.0,,MEX,VER,angel r. cabada,,
250,1981,4.0,,MEX,VER,angel r. cabada,,
...,...,...,...,...,...,...,...,...
6771,2024,2.0,6.0,MEX,VER,tecolutla,,bovine
6772,2024,1.0,23.0,MEX,VER,tihuatlan,,bovine
6773,2024,3.0,5.0,MEX,VER,tihuatlan,,bovine
6774,2024,3.0,5.0,MEX,VER,tlapacoyan,,bovine


# Get n2c (county/municipality name to code lookup)

In [6]:
import json

# Lookup table loaded from JSON. Later cells index it with "STATE$county"
# keys to obtain a county/municipality code.
n2c_file = "../supporting_files/county2municodes_ultimate_dict.json"
# JSON text is UTF-8 by specification; naming the encoding keeps decoding
# independent of the machine's locale default.
with open(n2c_file, "r", encoding="utf-8") as f:
    n2c = json.load(f)


## Plain run - 1

In [7]:
# First pass: look every record up in n2c using its key exactly as recorded,
# counting hits and misses and collecting the keys that are missing.
states = df_1979_plus.STATE.tolist()
counties = df_1979_plus.COUNTY_MUNI.tolist()

na_nums = 0  # records whose key is absent from n2c
no_nans = 0  # records whose key is present in n2c
error_names = []  # missing keys, one entry per failing record (duplicates kept)
for state, county in zip(states, counties):
    name = f"{state}${county}"
    # Explicit membership test rather than a bare `except:`, which would also
    # hide unrelated errors and swallow KeyboardInterrupt.
    if name in n2c:
        no_nans += 1
    else:
        na_nums += 1
        error_names.append(name)

In [8]:
# (lookup misses, lookup hits) from the preceding lookup loop
na_nums, no_nans

(601, 5380)

In [10]:
import os

# Record the distinct keys that failed the lookup, one per line.
os.makedirs("../observations", exist_ok=True)

with open("../observations/41_error_counties.txt", "w") as f:
    # sorted() gives a stable line order; iterating a bare set of strings can
    # change order from one interpreter run to the next, which makes the file
    # impossible to diff between runs.
    for name in sorted(set(error_names)):
        f.write(name + "\n")


## Run2 - Wrongly coded states

#### In the original file
COA-oliana changed to GRO-oliana

NAY-guadalupe y calvo changed to CHH-guadalupe y calvo

COL-uman changed to YUC-uman

CA-shackelford changed to TX-shackelford

#### In ultimate list
"SD$oglala lakota": "US46102"

"OAX$san juan mixtepec -dto. 26 -": "MX20209"

"OAX$san pedro mixtepec -dto. 22 -" : "MX20318"

Many COA and CHS entries referred to in the PII file are actually CHP. Ref: ../supplimentary_files/muni_list_mx_v2.csv

In [11]:
# Second pass: look each record up in n2c by its "STATE$county" key. A record
# whose raw key is missing gets exactly one retry with a substituted state
# code and/or a corrected county spelling.

# Replacement state code to try when the recorded one does not resolve.
# Each code is substituted at most once (no chaining: "COA" -> "CHP" does not
# continue on to "GRO").
STATE_FIXES = {
    "COA": "CHP",
    "CHS": "CHP",
    "CHH": "COL",
    "GRO": "SON",
    "HID": "GRO",
    "COL": "CHH",
    "NLE": "CHP",
    "DIF": "JAL",
    "CHI": "CHH",
    "CHP": "GRO",
    "AGU": "MEX",
}

# Corrected county/municipality spellings to try on the retry.
COUNTY_FIXES = {
    "meverick": "maverick",
    "temapache": "alamo temapache",
    "san bernadino": "san bernardino",
    "escuintala": "escuintla",
}

states = df_1979_plus.STATE.tolist()
counties = df_1979_plus.COUNTY_MUNI.tolist()

converted = []  # one entry per row: the resolved code, or None if unresolved
na_nums = 0  # rows still unresolved after the retry
no_nans = 0  # rows resolved (directly or via the retry)
error_names = []  # keys (after substitution) that are still missing from n2c
for state, county in zip(states, counties):
    name = f"{state}${county}"

    if name not in n2c:
        # Raw key is unknown: apply the substitutions and rebuild the key.
        # Values without an entry in the tables are left unchanged.
        state = STATE_FIXES.get(state, state)
        county = COUNTY_FIXES.get(county, county)
        name = f"{state}${county}"

    # Explicit membership tests replace the nested bare `except:` clauses,
    # which would also have hidden unrelated errors.
    if name in n2c:
        ccode = n2c[name]
        converted.append(ccode)
        no_nans += 1
    else:
        converted.append(None)
        na_nums += 1
        error_names.append(name)

In [12]:
# Attach the resolved codes as a new column. df_1979_plus was produced by
# boolean filtering, and setting a column on such a frame in place can raise
# SettingWithCopyWarning; assign() builds a new frame instead.
df_1979_plus = df_1979_plus.assign(COUNTY_MUNI_CODE=converted)
df_1979_plus.head()

Unnamed: 0,ONSET_YEAR,ONSET_MONTH,ONSET_DAY,COUNTRY,STATE,COUNTY_MUNI,SEROTYPE,SPECIES,COUNTY_MUNI_CODE
246,1981,9.0,,MEX,JAL,acatlan de juarez,,,MX14002
247,1981,2.0,,MEX,NAY,ahuacatlan,,,MX18002
248,1981,11.0,,MEX,JAL,amacueca,,,MX14004
249,1981,4.0,,MEX,VER,angel r. cabada,,,MX30015
250,1981,4.0,,MEX,VER,angel r. cabada,,,MX30015


In [13]:
# Missing values per column; a null COUNTY_MUNI_CODE marks a row whose lookup
# failed even after the fallback substitutions.
df_1979_plus.isnull().sum()

ONSET_YEAR             0
ONSET_MONTH            0
ONSET_DAY           1794
COUNTRY                0
STATE                  0
COUNTY_MUNI            0
SEROTYPE            1330
SPECIES              963
COUNTY_MUNI_CODE       1
dtype: int64

In [14]:
import os

# to_csv does not create missing parent directories, so make sure the output
# folder exists first (same approach the notebook uses for ../observations).
os.makedirs("../generated_files", exist_ok=True)
df_1979_plus.to_csv("../generated_files/vsv_county_noPII_coded.csv", index=False)

In [15]:
# Record the distinct keys that are still unresolved, one per line.
with open("../observations/100_error_counties.txt", "w") as f:
    # sorted() gives a stable line order; iterating a bare set of strings can
    # change order from one interpreter run to the next.
    for name in sorted(set(error_names)):
        f.write(name + "\n")
