# Web Scraping of state polices departments from www.50states.com

In [222]:
import requests
from lxml import html
import pandas as pd
from requests.structures import CaseInsensitiveDict
import os


### Define List of States

In [223]:

states_names ={
               "Alabama":"AL","Alaska":"AK",
               "Arizona":"AZ",
               "Arkansas":"AR","California":"CA","Colorado":"CO",
               "Connecticut":"CT","Delaware":"DE","Florida":"FL","Georgia":"GA","Hawaii":"HI",
               "Idaho":"ID","Illinois":"IL","Indiana":"IN","Iowa":"IA","Kansas":"KS","Kentucky":"KY",
               "Louisiana":"LA","Maine":"ME",
               "Maryland":"MD","Massachusetts":"MA","Michigan":"MI","Minnesota":"MN","Mississippi":"MS",
               "Missouri":"MO","Montana":"MT",
               "Nebraska":"NE","Nevada":"NV","New_Hampshire":"NH","New_Jersey":"NJ","New_Mexico":"NM",
               "New_York":"NY","North_Carolina":"NC",
               "North_Dakota":"ND","Oregon":"OR","Pennsylvania":"PA","Rhode_Island":"RI",
               "South_Carolina":"SC","South_Dakota":"SD","Tennessee":"TN","Texas":"TX","Utah":"UT","Vermont":"VT",
               "Virginia":"VA","Washington":"WA","West Virginia":"WV","Wisconsin":"WI","Wyoming":"WY"
               }

### Define list variables


In [249]:
name_list = []
address_list = []
state_list = []

## Define functions

### get_50states_static_page
 - Get 50states.com html page content.
 - Input - None
 - Output - Dictionary 
     - state_name - list of states names
     - states_list - list of content of html page
     - elements_list - count of elements to extract(acount of all police departments per state)

In [250]:
def get_50states_static_page (statesNames):

    states_dict = dict()
    states_list = []
    elements_list = []
    
    for state_name,state_abbr in statesNames.items():
        
        page = requests.get('https://www.50states.com/'+state_name+'/police_departments.htm')
        tree = html.fromstring(page.content)
        count_elements = tree.xpath('count(//strong)')
        
        #print(state_name+" "+str(count_elements))
        states_list.append(tree)
        elements_list.append(int(count_elements))
        states_dict[state_abbr]=state_name,states_list,elements_list
        #print(type(states_dict))
        
    return states_dict

### get_names_addresses
 - extract names and addresses from html object
 - Input - Dictionary
 - Output - lists 
     - name_list - list of states names
     - address_list - list of departments addresses
     - state_list - list of state abbreviations

In [287]:
def get_names_addresses(states):
    
    i=0
    for key, value in states.items():
        try:
    
            #print(key)
            elements = value[2][i]
            html = value[1][i]
            #print(elements)
            i=i+1

            for x in  range(1,elements):
                #print(x)
                depName = str(html.xpath('/html/body/div[3]/div[3]/div/div[1]/div[2]/main/div/article/div/ul/li['+str(x)+']/strong/text()')).replace('[\'', '').replace('\']', '').replace('\"', '').replace('[', '').replace(']', '')
                address = html.xpath('/html/body/div[3]/div[3]/div/div[1]/div[2]/main/div/article/div/ul/li['+str(x)+']/text()')
                if len(depName)>3:
                    name_list.append(depName)
                    address_list.append(address[3]+','+address[4])
                    state_list.append(state_abbr)
        except IndexError:
            pass
    return name_list,address_list,state_list


## Add data to daframe

In [289]:
names,address,states = get_names_addresses(get_50states_static_page (states_names))


In [290]:
df = pd.DataFrame(columns = ['name','address','state'])

df["name"] = names
df["address"] = address
df["state"] = states

df.head(6)

Unnamed: 0,name,address,state
0,Abbeville Police Dept,"101 E Washington St, Abbeville, 36310",WY
1,Albertville City Police Dept,"116 W Main St, Albertville, 35950",WY
2,Aliceville Police Dept,"215 1st St NE, Aliceville, 35442",WY
3,Altoona Police Dept,"2844 Main St, Altoona, 35952",WY
4,Animal Control,"934 N Ripley St, Montgomery, 36104",WY
5,Arab Police Dept,"1447 Hog Jaw Rd, Arab, 35016",WY


### Write to file

In [302]:
path = r"C:\Users\Alina\NIBRS_target\facts"

df.to_csv(os.path.join(path,"agencies_addresses"),index=True) 