In [1]:
# ===================================================================
#  Library
# ===================================================================
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from geopy.geocoders import Nominatim

In [2]:
# ===================================================================
#  Example
# ===================================================================
geolocator = Nominatim(user_agent="test_desu")
location = geolocator.geocode("Florida")
location

Location(Florida, United States, (27.7567667, -81.4639835, 0.0))

In [3]:
# ===================================================================
#  Utils
# ===================================================================
def get_latlng(row):
    geolocator = Nominatim(user_agent="test_desu")
    try:
        location = geolocator.geocode(row[0])
        row[['latitude', 'longitude']] = location.latitude, location.longitude
    except:
        print(f"Could not obtain {row[0]} information")
    return row

In [4]:
# ===================================================================
#  Data Loading
# ===================================================================
df = pd.read_csv("data/train.csv")[["region", "state"]]
df_region = pd.DataFrame(df["region"].unique())
df_state = pd.DataFrame(df["state"].unique())

display(df_region.head(2))
display(df_state.head(2))

Unnamed: 0,0
0,nashville
1,state college


Unnamed: 0,0
0,
1,pa


In [5]:
# ===================================================================
#  region
# ===================================================================

df_region[1] = df_region[0]
## 一番前が都市名
df_region[0] = df_region[0].str.split(pat="/", expand=True)[0]
## ","の前の都市の情報を取得する
df_region[0] = df_region[0].str.split(pat=",", expand=True)[0]
## 複数の都市名が"-"で繋がれているので一番前の都市の情報を取得する
df_region[0] = df_region[0].str.split(pat="-", expand=True)[0]
## 空白消去
df_region[0] = df_region[0].str.strip()

replace_map = {
    "east oregon":"oregon",
    "eastern NC":"Eastern North Carolina",
    "southern illinois":"illinois",
    "north central FL":"Florida"
}
df_region[0].replace(replace_map, inplace=True)


tqdm.pandas()
df_region[['latitude', 'longitude']] = 0
df_region = df_region.progress_apply(get_latlng, axis=1)
display(df_region)
print(df_region.isnull().sum())

  0%|          | 0/372 [00:00<?, ?it/s]

Unnamed: 0,0,1,latitude,longitude
0,nashville,nashville,36.162277,-86.774298
1,state college,state college,40.794450,-77.861639
2,wichita,wichita,37.692236,-97.337545
3,albany,albany,41.000028,19.999962
4,redding,redding,40.586356,-122.391675
...,...,...,...,...
367,southern WV,southern WV,37.846345,-81.991798
368,cumberland valley,cumberland valley,42.662050,-83.545806
369,oneonta,oneonta,33.947131,-86.471493
370,southern maryland,southern maryland,14.470892,120.999971


0            0
1            0
latitude     0
longitude    0
dtype: int64


In [6]:
# ===================================================================
#  state
# ===================================================================
state_dict = {
    'pa': 'Pennsylvania',
    'ks': 'Kansas',
    'ny': 'New York',
    'ca': 'California',
    'al': 'Alabama',
    'or': 'Oregon',
    'va': 'Virginia',
    'mt': 'Montana',
    'nj': 'New Jersey',
    'ma': 'Massachusetts',
    'wi': 'Wisconsin',
    'sc': 'South Carolina',
    'wa': 'Washington',
    'dc': 'Washington D.C.',
    'oh': 'Ohio',
    'in': 'Indiana',
    'de': 'Delaware',
    'fl': 'Florida',
    'nm': 'New Mexico',
    'az': 'Arizona',
    'ok': 'Oklahoma',
    'mn': 'Minnesota',
    'co': 'Colorado',
    'nv': 'Nevada',
    'wv': 'West Virginia',
    'tn': 'Tennessee',
    'mi': 'Michigan',
    'ri': 'Rhode Island',
    'il': 'Illinois',
    'tx': 'Texas',
    'ut': 'Utah',
    'ia': 'Iowa',
    'ga': 'Georgia',
    'md': 'Maryland',
    'mo': 'Missouri',
    'ky': 'Kentucky',
    'nc': 'North Carolina',
    'ak': 'Alaska',
    'id': 'Idaho',
    'ct': 'Connecticut',
    'wy': 'Wyoming',
    'nd': 'North Dakota',
    'me': 'Maine',
    'ar': 'Arkansas',
    'hi': 'Hawaii',
    'sd': 'South Dakota',
    'ne': 'Nebraska',
    'nh': 'New Hampshire',
    'vt': 'Vermont',
    'la': 'Louisiana',
    'ms': 'Mississippi'
}
df_state[1] = df_state[0]
df_state[0].replace(state_dict, inplace=True)
df_state.drop(0, axis=0, inplace=True) # 欠損値削除

tqdm.pandas()
df_state[['latitude', 'longitude']] = 0
df_state = df_state.progress_apply(get_latlng, axis=1)
display(df_state)
print(df_state.isnull().sum())

  0%|          | 0/51 [00:00<?, ?it/s]

Unnamed: 0,0,1,latitude,longitude
1,Pennsylvania,pa,40.969989,-77.727883
2,Kansas,ks,38.27312,-98.582187
3,New York,ny,40.712728,-74.006015
4,California,ca,36.701463,-118.755997
5,Alabama,al,33.258882,-86.829534
6,Oregon,or,43.97928,-120.737257
7,Virginia,va,37.123224,-78.492772
8,Montana,mt,47.375267,-109.638757
9,New Jersey,nj,40.075738,-74.404162
10,Massachusetts,ma,42.378877,-72.032366


0            0
1            0
latitude     0
longitude    0
dtype: int64


In [8]:
# ===================================================================
#  save data
# ===================================================================
df_state[[1, "latitude", "longitude"]].rename(
    columns={1:"state", "latitude":"state_latitude", "longitude":"state_longitude"}
    ).to_csv("data/state_coordinate.csv", index=False)
df_region[[1, "latitude", "longitude"]].rename(
    columns={1:"region", "latitude":"region_latitude", "longitude":"region_longitude"}
    ).to_csv("data/region_coordinate.csv", index=False)