In [79]:
import requests  #library used to download web pages.
from bs4 import BeautifulSoup  #import Beautiful Soup functions to parse data returned from the website
import pandas as pd  #import pandas to convert list of table rows to a data frame

In [80]:
#specify url
URL = "https://en.wikipedia.org/wiki/List_of_counties_in_Maryland#List_of_counties"

In [81]:
#connecting to website using variable page and using .get() function from the requests library to retrieve data from the URL
page = requests.get(URL)

In [82]:
#inspecting the results of the request
type(page)

requests.models.Response

In [83]:
#verify successful connection
#a 200 OK status means the request was successful, and the server responded with the data requested
page.status_code

200

In [84]:
#save string format of website to a variable HTMLstr
HTMLstr = page.text

In [85]:
#look at the first several hundred characters in the website string
print(HTMLstr[:300])


<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>List of counties in Maryland - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""]


In [86]:
#parse (analyze a string or text into logical syntactic components) html using Beautiful soup and store to variable soup
#first argument: raw HTML content; second argument: specify the HTML parser to use
soup = BeautifulSoup(HTMLstr, "html.parser")

In [87]:
#look at contents of page - wall of text
#soup

In [88]:
#format page to include indentation giving same visual presentation as the parse tree created from the raw HTML content
#print(soup.prettify())

In [90]:
#find all attribute <a> tags on the page and set it to the variale all_links
#all_links = soup.find_all("a")

#and then set all hyperlink references (href's) in the <a> tags to a variabe to list and print using .get() function in a for loop
#for link in all_links:
#    print(link.get("href"))

In [91]:
#find all table <table> tags and set to variable all_tables
all_tables = soup.find_all("table")
#all_tables

In [92]:
#There is more than one table so get <table> tag for data in list of counties table - 'wikitable sortable'
correct_table = soup.find('table', class_='wikitable sortable')
#correct_table

In [93]:
#set empty lists to hold data for each column (8 cols, 24 rows)
A=[] #county      0
B=[] #FIPS code   0
C=[] #County seat 1
D=[] #Est.        2
E=[] #Origin      3
F=[] #Etymology   4
G=[] #Population  7
H=[] #Area        8

#find all the table row <tr> tags; using a for loop go thru each one (row)
    #and then, get all the table data <td> tags for each <tr> tag and set them to a temp. var cells
    #if the number of <td> cells is 11 then append the empty lists to fill in the data
for row in correct_table.findAll("tr"):
    heads = row.findAll("th")
    cells = row.findAll("td")
    
    if len(cells) == 10:
        
        A.append(heads[0].find(text=True))  #gets info in county column and adds it to list A
        B.append(cells[0].find(text=True))  #gets info in FIPS code column and adds it to list B
        C.append(cells[1].find(text=True))  #gets info in County seat column and adds it to list C
        D.append(cells[2].find(text=True))  #gets info in Est. column and adds it to list D
        E.append(cells[3].find(text=True))  #gets info in Origin column and adds it to list E
        F.append(cells[4].find(text=True))  #gets info in Etymology column and adds it to list F
        G.append(cells[7].find(text=True))  #gets info in Poplulation column and adds it to list G
        H.append(cells[8].find(text=True))  #gets info in Area column and adds it to list H

In [94]:
print(len(A))
print(len(B))
print(len(C))
print(len(D))
print(len(E))
print(len(F))
print(len(G))
print(len(H))

24
24
24
24
24
24
24
24


In [95]:
#verify data in list A
A

['Allegany County',
 'Anne Arundel County',
 'Baltimore County',
 'Baltimore City',
 'Calvert County',
 'Caroline County',
 'Carroll County',
 'Cecil County',
 'Charles County',
 'Dorchester County',
 'Frederick County',
 'Garrett County',
 'Harford County',
 'Howard County',
 'Kent County',
 'Montgomery County',
 "Prince George's County",
 "Queen Anne's County",
 'Somerset County',
 "St. Mary's County",
 'Talbot County',
 'Washington County',
 'Wicomico County',
 'Worcester County']

In [96]:
#turn list A into a dataframe first
df = pd.DataFrame(A, columns=['County'])  

#add lists B-H as new columns in the new dataframe
df['FIPS code'] = B
df['County seat'] = C
df['Est.'] = D
df['Origin'] = E
df['Etymology'] = F
df['Population'] = G
df['Area'] = H

df.head() 

Unnamed: 0,County,FIPS code,County seat,Est.,Origin,Etymology,Population,Area
0,Allegany County,1,Cumberland,1789,Formed from part of Washington County.,From the Lenape Indian word,74012,430
1,Anne Arundel County,3,Annapolis,1650,Formed from part of St. Mary's County.,Anne Arundell,550488,588
2,Baltimore County,5,Towson,1659,Formed from unorganized territory,"Cecil Calvert, 2nd Baron Baltimore",817455,682
3,Baltimore City,510,Baltimore City,1851,Founded in 1729. Detached in 1851 from Baltimo...,"Cecil Calvert, 2nd Baron Baltimore",621342,92
4,Calvert County,9,Prince Frederick,1654,Formed as Patuxent County from unorganized ter...,The,89628,345


In [97]:
#export scraped data, now dataframe to a csv file
df.to_csv("MD_Counties.csv")