In [1]:
import urllib.request
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re

## Web scraping with BeautifulSoup

In [2]:
# Yearly archive pages to scrape, most recent year first.
urls = [
    "http://killedbypolice.net/kbp2018",
    "http://www.killedbypolice.net/kbp2017",
    "http://www.killedbypolice.net/kbp2016",
    "http://www.killedbypolice.net/kbp2015",
    "http://www.killedbypolice.net/kbp2014",
    "http://www.killedbypolice.net/kbp2013",
]

In [3]:
# Empty accumulator frame; the scraping cell adds one batch of rows per page.
killedbypolice = pd.DataFrame(
    columns=[
        "date",
        "state",
        "gender",
        "race",
        "name",
        "age",
        "manner of death",
    ]
)

In [4]:
# Single-letter manner-of-death codes read from the scraped rows, mapped to
# readable labels. The last four keys are stray characters that can land in the
# code position; they are deliberately mapped to an empty label.
deaths = {
    "G": "shot",
    "T": "tasered",
    "R": "physical force",
    "C": "chemical",
    "V": "vehicle",
    "O": "other",
    "f": "",
    "\t": "",
    " ": "",
    '"': "",
}


def _parse_row(row):
    """Split one line of the flattened table text into its fields.

    Returns a ``(date, state, gender, race, name, age, manner_of_death)``
    tuple, or ``None`` when the text between the first comma and the first tab
    is not an integer age (such lines are skipped by the caller).
    """
    # Drop the parenthesised id prefix; the id itself is not used anywhere.
    if "(" in row:
        row = row[row.find(")") + 2:]

    # The date runs up to the next upper-case letter or tab after position 0.
    date = ""
    for i in range(1, len(row)):
        if row[i].isupper() or row[i] == "\t":
            date = row[:i]
            row = row[i:]
            break

    # The name is taken to start one character before the first lower-case
    # letter. What precedes it is the two-character state followed by
    # gender[/race].
    state = ""
    genderrace = ""
    for i in range(len(row)):
        if row[i].islower():
            attrs = row[:i - 1].strip()
            state = attrs[:2]
            genderrace = attrs[2:]
            row = row[i - 1:]
            break

    name = row[:row.find(",")]
    try:
        age = int(row[row.find(",") + 2:row.find("\t")])
    except ValueError:
        # Not a data row (or no parsable age): skip it. Only ValueError is
        # caught so that interrupts and genuine bugs are not swallowed.
        return None

    # Slices instead of single indexes so short/empty fields yield "" rather
    # than raising IndexError.
    mannerdeath = row[row.find("\t") + 1:][:1]
    gender = genderrace[:1]
    race = genderrace[2:3] if "/" in genderrace else ""

    # Unknown codes become a blank label, like the stray characters above.
    return date, state, gender, race, name, age, deaths.get(mannerdeath, "")


columns = ["date", "state", "gender", "race", "name", "age", "manner of death"]
frames = []
for url in urls:
    # turn page into beautiful soup HTML; the response is closed once parsed
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page, "html.parser")

    # all of the information is in one "row" (the fourth <tr>), so split it up
    table = soup.find_all("tr")[3].text
    rows = table.split("\n")

    # parse information into columns, dropping lines that are not data rows
    records = [record for record in map(_parse_row, rows) if record is not None]
    onedf = pd.DataFrame(records, columns=columns)
    frames.append(onedf)

# DataFrame.append was removed in pandas 2.0; concatenate once after all pages
# have been fetched, so a failure part-way through leaves killedbypolice untouched.
killedbypolice = pd.concat([killedbypolice] + frames, ignore_index=True)

In [5]:
# Preview the first ten scraped rows.
killedbypolice.head(n=10)

Unnamed: 0,age,date,gender,manner of death,name,race,state
0,23,"July 31, 2018",M,shot,Skyler Martin,,AZ
1,35,"July 30, 2018",M,shot,James Edward Blackmon,B,AR
2,73,"July 30, 2018",M,shot,Richard Black,W,CO
3,32,"July 28, 2018",M,shot,Michael Neal,,LA
4,44,"July 28, 2018",M,shot,Joseph Santos,L,PA
5,33,"July 28, 2018",M,shot,Arthur Kenzie Garner,,NC
6,60,"July 27, 2018",F,shot,Cynthia Fields,,GA
7,25,"July 27, 2018",M,shot,Lamar C. Richardson Jr.,B,OH
8,30,"July 26, 2018",M,shot,Iman Joseph Buford,,AR
9,43,"July 26, 2018",M,shot,William Earnest Brooks,,GA


## Cleaning up the data so it matches the format of other datasets

In [6]:
# Derive "month" and "year" columns from the "Month D, YYYY" date strings:
# month is the text before the first space, year is the text after ", ".
dates = killedbypolice["date"]
months = dates.map(lambda text: text[: text.find(" ")]).tolist()
years = dates.map(lambda text: text[text.find(",") + 2 :]).tolist()
killedbypolice["month"] = months
killedbypolice["year"] = years

In [7]:
# Preview the first ten rows again to check the new month and year columns.
killedbypolice.head(n=10)

Unnamed: 0,age,date,gender,manner of death,name,race,state,month,year
0,23,"July 31, 2018",M,shot,Skyler Martin,,AZ,July,2018
1,35,"July 30, 2018",M,shot,James Edward Blackmon,B,AR,July,2018
2,73,"July 30, 2018",M,shot,Richard Black,W,CO,July,2018
3,32,"July 28, 2018",M,shot,Michael Neal,,LA,July,2018
4,44,"July 28, 2018",M,shot,Joseph Santos,L,PA,July,2018
5,33,"July 28, 2018",M,shot,Arthur Kenzie Garner,,NC,July,2018
6,60,"July 27, 2018",F,shot,Cynthia Fields,,GA,July,2018
7,25,"July 27, 2018",M,shot,Lamar C. Richardson Jr.,B,OH,July,2018
8,30,"July 26, 2018",M,shot,Iman Joseph Buford,,AR,July,2018
9,43,"July 26, 2018",M,shot,William Earnest Brooks,,GA,July,2018


In [8]:
# Write the cleaned table to disk; index=False leaves the row index out of the file.
killedbypolice.to_csv(path_or_buf="killedbypolice.csv", index=False)