# Comparing Names of People Killed Across Different Datasets

The first part of the analysis is to see if the people killed are consistently recorded by all 3 datasets. To do that, we will see how many names match between them.

In [1]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import Levenshtein as lev

In [59]:
# Load the three datasets compared in this notebook:
#   kbp  - Killed By Police (local CSV)
#   mpv  - Mapping Police Violence (local CSV)
#   wapo - Washington Post fatal police shootings (fetched from GitHub)
# NOTE: the Washington Post URL points at the repository's `master` branch, so
# the downloaded rows (and every count below) can change between runs.
KBP_CSV = "data/killedbypolice.csv"
MPV_CSV = "data/MPVDataset.csv"
WAPO_CSV_URL = "https://raw.githubusercontent.com/washingtonpost/data-police-shootings/master/fatal-police-shootings-data.csv"

kbp, mpv, wapo = (pd.read_csv(source) for source in (KBP_CSV, MPV_CSV, WAPO_CSV_URL))

In [3]:
# Last rows of the Killed By Police table: shows its columns and how far back the records go.
kbp.tail()

Unnamed: 0,age,date,gender,manner of death,name,race,state,month,year
4723,34,"September 26, 2013",M,shot,Tavaris Gulley,,AL,September,2013
4724,22,"September 25, 2013",M,shot,Luke Castello,W,FL,September,2013
4725,27,"September 25, 2013",M,,Eric Poore,W,IN,September,2013
4726,32,"September 25, 2013",M,shot,Erick Balint,L,CA,September,2013
4727,21,"September 24, 2013",M,shot,Connor Bishop Zion,W,CA,September,2013


In [4]:
# Last rows of the Mapping Police Violence table: shows its columns and how far back the records go.
mpv.tail()

Unnamed: 0,Victim's name,Victim's age,Victim's gender,Victim's race,URL of image of victim,Date of Incident (month/day/year),Street Address of Incident,City,State,Zipcode,...,Criminal Charges?,Link to news article or photo of official document,Symptoms of mental illness?,Unarmed,Alleged Weapon (Source: WaPo),Alleged Threat Level (Source: WaPo),Fleeing (Source: WaPo),Body Camera (Source: WaPo),WaPo ID (If included in WaPo database),Unnamed: 24
6314,Andrew L. Closson,21,Male,White,http://www.superiortelegram.com/sites/default/...,1/1/13,U.S. Highway 53,Gordon,WI,54838.0,...,No Known Charges,http://www.superiortelegram.com/content/deputy...,Drug or alcohol use,Allegedly Armed,,,,,,
6315,Mark Chavez,49,Male,Hispanic,http://www.tricitytribuneusa.com/wp-content/up...,1/1/13,912 Loma Linda Ave.,Farmington,NM,87401.0,...,No Known Charges,http://www.daily-times.com/farmington-news/ci_...,No,Allegedly Armed,,,,,,
6316,Andrew Layton,26,Male,White,http://bloximages.chicago2.vip.townnews.com/ma...,1/1/13,410 S Riverfront Drive,Mankato,MN,56001.0,...,No Known Charges,http://www.tmcnet.com/usubmit/2013/02/21/69388...,No,Allegedly Armed,,,,,,
6317,Tyree Bell,31,Male,Black,http://content.omaha.com/media/maps/ps/2013/ja...,1/1/13,3727 N. 42nd St.,Omaha,NE,68111.0,...,No Known Charges,http://www.ketv.com/news/Police-chief-details-...,Yes,Allegedly Armed,,,,,,
6318,Christopher Tavares,21,Male,Hispanic,http://www.krdo.com/image/view/-/17980228/medR...,1/1/13,Highway 50 and North Elizabeth Street,Pueblo,CO,81008.0,...,No Known Charges,http://www.krdo.com/news/Pueblo-Police-shoot-k...,No,Allegedly Armed,,,,,,


In [5]:
# First rows of the Washington Post table: shows its columns and its earliest records.
wapo.head()

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,2015-01-02,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,2015-01-02,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,2015-01-03,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,2015-01-04,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,2015-01-04,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


## Different choices of comparing names

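Matching names is not as simple as a substring check with `in`: the same person can be recorded with or without a middle name, nickname, or suffix. The examples below compare exact containment with two fuzzy similarity scores.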

In [7]:
# Plain substring test: it fails here because the middle name separates "Joseph" from "Johnson".
"Joseph Johnson" in "Joseph Walden Johnson Jr."

False

In [18]:
# Similarity score from the Levenshtein package; the matching cells below treat > 0.8 as a match.
# NOTE(review): the second string is "Joseph Johnson Jr.", not the "Joseph Johnson" used in the
# neighbouring examples, so the scores are not directly comparable. Confirm which was intended.
lev.ratio("Joseph Walden Johnson Jr.", "Joseph Johnson Jr.")

0.8372093023255814

In [9]:
# Standard-library alternative: difflib's SequenceMatcher ratio for the two spellings
# (first argument None = no "junk" filter function).
SequenceMatcher(None, "Joseph Walden Johnson Jr.", "Joseph Johnson").ratio()

0.717948717948718

## Dates After 2015

From checking the head and tail of the datasets, Mapping Police Violence and Killed By Police data go back to 2013, but Washington Post data only starts in 2015. We'll start the comparison when all 3 datasets record data, 2015.

In [32]:
# Cleaning data so the format is standardized.
# The Mapping Police Violence date column holds month/day/two-digit-year strings
# (e.g. "1/1/13"). Parsing with an explicit format replaces manual slicing with a
# hard-coded "20" prefix, which would silently turn a four-digit year such as
# "2013" into 202013 (and let it through the >= 2015 filter). A value that does
# not match the format now raises, and a missing date becomes NaT/NaN.
mpv_dates = mpv["Date of Incident (month/day/year)"].values
mpv_parsed_dates = pd.to_datetime(mpv_dates, format="%m/%d/%y")
mpv_months = mpv_parsed_dates.month.tolist()
mpv_years = mpv_parsed_dates.year.tolist()
mpv["month"] = mpv_months
mpv["year"] = mpv_years
# Keep 2015 onward (inclusive); rows with a missing date compare False and are dropped.
past2015_mpv = mpv[mpv["year"] >= 2015]

In [33]:
# Derive the month name and the integer year from the "Month D, YYYY" strings in
# kbp["date"], then keep the rows dated 2015 or later.
dates_kbp = kbp["date"].values
kbp_date_tokens = [date_text.split() for date_text in dates_kbp]
months_kbp = [tokens[0] for tokens in kbp_date_tokens]
years_kbp = [int(tokens[2]) for tokens in kbp_date_tokens]
kbp["month"], kbp["year"] = months_kbp, years_kbp
past2015_kbp = kbp.loc[kbp["year"] >= 2015]

In [35]:
# Pull the name column out of each dataset. The Washington Post frame is used
# unfiltered; the other two are already restricted to 2015 onward.
wapo_names, mpv_names, kbp_names = (
    frame[column].values
    for frame, column in (
        (wapo, "name"),
        (past2015_mpv, "Victim's name"),
        (past2015_kbp, "name"),
    )
)

In [37]:
# cleaning data
def _drop_unusable_names(names, placeholders=()):
    """Return the usable entries of `names` as a list.

    Entries that are not strings (e.g. NaN from an empty CSV cell) are dropped,
    since calling ``.lower()`` on them raises and they cannot be compared as
    names. Entries equal to one of `placeholders`, ignoring case and
    surrounding whitespace, are dropped as well.

    Parameters
    ----------
    names : iterable
        Raw values taken from a dataset's name column.
    placeholders : iterable of str, optional
        Strings a dataset uses in place of an unknown or withheld name.

    Returns
    -------
    list of str
        The remaining names, in their original order.
    """
    blocked = {placeholder.strip().lower() for placeholder in placeholders}
    return [
        name
        for name in names
        if isinstance(name, str) and name.strip().lower() not in blocked
    ]


mpv_names = _drop_unusable_names(mpv_names, placeholders=["Name withheld by police"])
wapo_names = _drop_unusable_names(wapo_names, placeholders=["TK TK"])
# No placeholder string is filtered for Killed By Police; only non-string
# entries are removed so the fuzzy comparisons below receive strings.
kbp_names = _drop_unusable_names(kbp_names)

In [40]:
# Report how many usable names each dataset contributes to the comparison.
for source_label, source_names in (
    ("Washington Post", wapo_names),
    ("Mapping Police Violence", mpv_names),
    ("Killed By Police", kbp_names),
):
    print(f"Number of names in the {source_label} dataset:", "\t", len(source_names))

Number of names in the Washington Post dataset: 	 3491
Number of names in the Mapping Police Violence dataset: 	 4001
Number of names in the Killed By Police dataset: 	 3524


## Common Names Across All 3 Datasets

In [51]:
# Step 1 of the three-way match: for every Washington Post name, keep the first
# Mapping Police Violence name whose Levenshtein ratio with it exceeds 0.8.
list12 = []
for wapo_name in wapo_names:
    first_hit = next(
        (mpv_name for mpv_name in mpv_names if lev.ratio(wapo_name, mpv_name) > 0.8),
        None,
    )
    if first_hit is not None:
        list12.append(first_hit)

In [52]:
# Step 2 of the three-way match: for every name in list12, keep the first
# Killed By Police name whose Levenshtein ratio with it exceeds 0.8.
common_names = []
for shared_name in list12:
    kbp_hit = next(
        (kbp_name for kbp_name in kbp_names if lev.ratio(shared_name, kbp_name) > 0.8),
        None,
    )
    if kbp_hit is not None:
        common_names.append(kbp_hit)
len(common_names)

2480

### Common Names Across Killed By Police and Washington Post

In [53]:
# Split the Washington Post names into those with a fuzzy match (ratio > 0.8)
# among the Killed By Police names and those without one.
kbp_x_wapo = []
list12missing = []
for wapo_name in wapo_names:
    has_match = any(lev.ratio(wapo_name, kbp_name) > 0.8 for kbp_name in kbp_names)
    if has_match:
        kbp_x_wapo.append(wapo_name)
    else:
        list12missing.append(wapo_name)
len(kbp_x_wapo)

2667

### Common Names Across Killed By Police and Mapping Police Violence

In [54]:
# Split the Mapping Police Violence names into those with a fuzzy match
# (ratio > 0.8) among the Killed By Police names and those without one.
kbp_x_mpv = []
list12missing = []
for mpv_name in mpv_names:
    has_match = any(lev.ratio(mpv_name, kbp_name) > 0.8 for kbp_name in kbp_names)
    if has_match:
        kbp_x_mpv.append(mpv_name)
    else:
        list12missing.append(mpv_name)
len(kbp_x_mpv)

3030

### Common Names Across Mapping Police Violence and Washington Post

In [55]:
# Mapping Police Violence names that have a fuzzy match (ratio > 0.8) among the
# Washington Post names. Stored under its own name so that `kbp_x_mpv`, the
# Killed By Police x Mapping Police Violence result, is not overwritten.
mpv_x_wapo = []
# Mapping Police Violence names with no Washington Post match; the next cell
# displays the first few of them.
list12missing = []
for name1 in mpv_names:
    if any(lev.ratio(name1, name2) > 0.8 for name2 in wapo_names):
        mpv_x_wapo.append(name1)
    else:
        list12missing.append(name1)
len(mpv_x_wapo)

3122

In [56]:
# First ten unmatched names from the most recently run matching cell
# (in notebook order: Mapping Police Violence names with no Washington Post match).
list12missing[0:10]

['Jesse Darian Carrillo',
 'Tim Gohann Braun',
 'Joey Loop',
 'William "T.J." T. Jefferson McCollum',
 'Harold Eugene Kraai',
 'Ryan Turner Force',
 'Frank W. "Franko" Dripps IV',
 'Christian T. Webb',
 'Millard Barry Clark Jr.',
 'Daniel Isaiah Norris']

## Common Names Across Mapping Police Violence and Washington Post (Looser 0.7 Threshold)

In [45]:
list13 = []
for name1 in wsp:
    for name2 in mpv:
        if lev.ratio(name1, name2) > 0.7:
            list13.append(name2)
            break
len(list13)

3120

## Common Names Across Killed By Police and Mapping Police Violence (Looser 0.7 Threshold)

In [46]:
# Killed By Police names that have a fuzzy match among the Mapping Police
# Violence names, using a looser 0.7 threshold; the matched MPV name is kept.
# Iterates the name lists: looping over the `kbp` and `mpv` DataFrames would
# compare their column labels rather than the recorded names.
list23 = []
for name1 in kbp_names:
    for name2 in mpv_names:
        if lev.ratio(name1, name2) > 0.7:
            list23.append(name2)
            break
len(list23)

2983