In [None]:
# --- Standard library ---
import warnings

# --- Third-party ---
import mapclassify
import numpy as np
import pandas as pd
import requests
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from IPython.display import HTML, display

# Widen the notebook container to 90% of the browser window.
display(HTML("<style>.container { width:90% !important; }</style>"))

# Lift pandas' display limits so every column and row is rendered.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Suppress all warnings emitted from this point on.
warnings.filterwarnings("ignore")

In [None]:
# Read the raw contracts CSV into a DataFrame and preview its first rows.
raw_contracts_csv = '/Users/alia/Documents/Github/DoDContractApp/Raw Data/Service Contracts/FY2022.csv'
df = pd.read_csv(raw_contracts_csv)
df.head()

In [None]:
# Columns retained for the rest of the notebook; everything else is discarded.
columns_of_interest = [
    # Award identity and amount
    'contract_award_unique_key',
    'total_obligated_amount',
    'award_base_action_date',
    # Awarding organisation
    'awarding_agency_code',
    'awarding_agency_name',
    'awarding_sub_agency_code',
    'awarding_sub_agency_name',
    'awarding_office_code',
    'awarding_office_name',
    # Funding organisation
    'funding_agency_code',
    'funding_agency_name',
    'funding_sub_agency_code',
    'funding_sub_agency_name',
    'funding_office_code',
    'funding_office_name',
    # Recipient and place of performance
    'recipient_name',
    'primary_place_of_performance_city_name',
    'primary_place_of_performance_state_code',
    'primary_place_of_performance_state_name',
    'primary_place_of_performance_zip_4',
    # Classification codes
    'product_or_service_code',
    'product_or_service_code_description',
    'dod_claimant_program_code',
    'dod_claimant_program_description',
    'naics_code',
    'naics_description',
    'last_modified_date',
]
df = df[columns_of_interest]

In [None]:
# Drop rows where city and zip code for place of performance are blank
df = df.dropna(subset=['primary_place_of_performance_city_name', 'primary_place_of_performance_zip_4'])

# Convert zip code to a digit string.
# The integer cast removes any leading zeros (e.g. 021341234 -> "21341234"), which
# would make the first five characters the wrong ZIP. Restore the padding:
# values of up to 5 digits are treated as a 5-digit ZIP, longer values as ZIP+4.
zip_digits = df['primary_place_of_performance_zip_4'].astype(int).astype(str)
zip_digits = zip_digits.str.zfill(5).where(zip_digits.str.len() <= 5, zip_digits.str.zfill(9))

# Create a full address string for place of performance (POP): "CITY, ST 12345"
df['geopy_pop'] = (
    df['primary_place_of_performance_city_name']
    + ', ' + df['primary_place_of_performance_state_code']
    + ' ' + zip_digits.str[:5]
)

# The individual address components are no longer needed once geopy_pop exists
df = df.drop(columns=['primary_place_of_performance_city_name',
                      'primary_place_of_performance_state_code',
                      'primary_place_of_performance_state_name',
                      'primary_place_of_performance_zip_4'])

# Manually edit abbreviations in addresses based on a manual inspection.
# Applied in this order; regex=False makes each a literal substring replacement
# regardless of the installed pandas version's default.
pop_abbreviations = {
    'AFB': 'AIR FORCE BASE',
    'M C B H KANEOHE BAY': 'MARINE CORPS BASE HAWAII',
    'JBSA': 'JOINT BASE SAN ANTONIO',
    ' FT ': ' FORT ',
    'JBPHH': 'JOINT BASE PEARL HARBOR-HICKAM',
}
for abbreviation, expansion in pop_abbreviations.items():
    df['geopy_pop'] = df['geopy_pop'].str.replace(abbreviation, expansion, regex=False)

In [None]:
def _shared_geocoder():
    """Return the single rate-limited Nominatim geocode callable, creating it on first use."""
    if not hasattr(_shared_geocoder, "geocode"):
        geolocator = Nominatim(user_agent="aliakader", timeout=5)
        _shared_geocoder.geocode = RateLimiter(geolocator.geocode, min_delay_seconds=2)
    return _shared_geocoder.geocode


def lat_long(address, geocode=None):
    """Return the latitude and longitude coordinates for an address string.

    Lookups go through one shared RateLimiter so that the 2 second minimum delay
    is applied between successive calls. (A RateLimiter built fresh inside every
    call has no previous call to wait on, so it never actually delays.)

    Input:
        address (string): The address to look up.
        geocode (callable, optional): Callable taking the address and returning
            an object with ``latitude`` / ``longitude`` attributes, or None when
            nothing is found. Defaults to the shared rate-limited Nominatim geocoder.
    Output:
        Coordinates (tuple) as (latitude, longitude), or '' (a blank string)
        if the location is not found.
    """
    if geocode is None:
        geocode = _shared_geocoder()

    # Find the location of an address string
    location = geocode(address)

    # Not found: keep the blank-string sentinel that callers already expect
    if location is None:
        return ''
    return location.latitude, location.longitude

In [None]:
# Pull every POP address out of the frame, then collapse to the distinct values
addresses = df['geopy_pop'].tolist()
unique_addresses = [*set(addresses)]

In [None]:
# Accumulator for geocoding results (starts empty)
coords = list()

In [None]:
# Use lat_long function to generate coordinates for all addresses (this takes a few hours)
# coords is appended to in the same order as unique_addresses, so len(coords) is the
# number of addresses already geocoded. Starting from that offset means re-running this
# cell (e.g. after an interruption) resumes where it stopped instead of appending
# duplicates, which would leave coords longer than unique_addresses.
# NOTE: assumes unique_addresses has not been rebuilt since coords was started.
for address in unique_addresses[len(coords):]:
    coord = lat_long(address)
    print(coord)
    coords.append(coord)

In [None]:
# Pair each unique address with the coordinates found for it
address_coords = pd.DataFrame({
    'Address': unique_addresses,
    'Coordinates': coords,
})

# Write the lookup table to CSV so the lat_long function does not have to be run again
address_coords.to_csv(
    '/Users/alia/Documents/Github/DoDContractApp/Clean Data/address_coordinates.csv',
    index=False,
)

In [None]:
# Attach the geocoded coordinates to every contract row via its address string
df = df.rename({'geopy_pop':'Address'}, axis=1)
df = df.merge(address_coords,how='left',on='Address')
df = df.drop(columns='Address')

# Split the (lat, long) tuples into two columns. Entries that are not tuples
# (the '' returned by lat_long for unknown addresses, or NaN for unmatched rows)
# become NaN. Unpacking the .str accessor directly is not used because iterating
# it is no longer supported by current pandas.
df['lat'] = df['Coordinates'].map(lambda c: c[0] if isinstance(c, tuple) else float('nan'))
df['long'] = df['Coordinates'].map(lambda c: c[1] if isinstance(c, tuple) else float('nan'))
df = df.drop(columns='Coordinates')

# Save the enriched frame (the original referenced an undefined name `dat` here)
df.to_csv('/Users/alia/Documents/Github/DoDContractApp/Clean Data/data_coords.csv')