In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fatal-police-shootings-in-the-us/PercentagePeopleBelowPovertyLevel.csv
/kaggle/input/fatal-police-shootings-in-the-us/ShareRaceByCity.csv
/kaggle/input/fatal-police-shootings-in-the-us/MedianHouseholdIncome2015.csv
/kaggle/input/fatal-police-shootings-in-the-us/PercentOver25CompletedHighSchool.csv
/kaggle/input/fatal-police-shootings-in-the-us/PoliceKillingsUS.csv
/kaggle/input/usacities-geojson-file/us_cities.geojson


# About
This dataset gives us insight into fatal police shootings in the United States, which have been a major issue in the country. Recently, the death of George Floyd sparked Black Lives Matter protests all over the country. This notebook explores the data and tries to interpret it to find any kind of racial bias in these shootings. This particular notebook explores all CSV files individually. In upcoming notebooks I will delve into groups of CSV files to provide more information.

# Read The Datasets

In [2]:
def _load_csv(path):
    """Read one dataset file; every file is read with the ISO-8859-1 encoding."""
    return pd.read_csv(path, encoding="ISO-8859-1")


pb_poverty = _load_csv("/kaggle/input/fatal-police-shootings-in-the-us/PercentagePeopleBelowPovertyLevel.csv")
p_killing_US = _load_csv("/kaggle/input/fatal-police-shootings-in-the-us/PoliceKillingsUS.csv")
s_race_city = _load_csv("/kaggle/input/fatal-police-shootings-in-the-us/ShareRaceByCity.csv")
P_HighSchool = _load_csv("/kaggle/input/fatal-police-shootings-in-the-us/PercentOver25CompletedHighSchool.csv")
M_2015 = _load_csv("/kaggle/input/fatal-police-shootings-in-the-us/MedianHouseholdIncome2015.csv")

# Analyse individual datasets independently
Analyse each dataset independently. Visualize each and every data point

# Poverty Rates dataset

In [3]:
# Preview the first rows of the poverty-rate table.
pb_poverty.head()

Unnamed: 0,Geographic Area,City,poverty_rate
0,AL,Abanda CDP,78.8
1,AL,Abbeville city,29.1
2,AL,Adamsville city,25.5
3,AL,Addison town,30.7
4,AL,Akron town,42.0


In [4]:
# Column dtypes and non-null counts (used to spot columns that are still stored as text).
pb_poverty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29329 entries, 0 to 29328
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Geographic Area  29329 non-null  object
 1   City             29329 non-null  object
 2   poverty_rate     29329 non-null  object
dtypes: object(3)
memory usage: 687.5+ KB


In [5]:
import plotly.graph_objects as go
def plot_doughnut_chart_by_race(df,geo_feature_name,area_name,col_name,num_cities=10):
    '''
    Plots a doughnut chart of the top cities of one area, ranked by a numeric column

    Args: df: The dataframe from which data is to be plotted; needs a "City" column
          geo_feature_name: Geographical feature column name
          area_name: The area (value of geo_feature_name) whose chart is to be plotted
          col_name: The feature that sizes the slices and ranks the cities
          num_cities: number of highest-ranked cities to be plotted

    Returns: None

    Output: Doughnut chart of col_name for the num_cities cities of area_name
            that have the largest col_name values.
    '''
    area = df[df[geo_feature_name]==area_name]
    # The column can still hold text (including placeholders such as "-") when
    # the function is called before cleaning; coerce to numbers and drop what
    # cannot be parsed so that the ranking below is numeric, not lexicographic.
    area = area.assign(**{col_name: pd.to_numeric(area[col_name], errors="coerce")})
    area = area.dropna(subset=[col_name])
    # Rank by the plotted value. Sorting by city name would make "Top N" an
    # alphabetical slice unrelated to col_name.
    top = area.sort_values(by=col_name,ascending=False).head(num_cities)
    fig = go.Figure(data=[go.Pie(labels=top["City"], values=top[col_name], hole=.3)])
    fig.update_layout(title="Top "+str(num_cities)+" cities in terms of "+col_name+" in "+area_name,template="plotly_dark")
    fig.show()
# Doughnut chart of poverty_rate for cities whose "Geographic Area" is "AK".
plot_doughnut_chart_by_race(pb_poverty,"Geographic Area","AK","poverty_rate")

In [6]:
# Drop the rows whose poverty_rate is the "-" placeholder, then convert the
# column to float. assign() builds a new frame, so nothing is written into a
# filtered slice (the pattern behind pandas' SettingWithCopyWarning).
pb_poverty = (
    pb_poverty[pb_poverty["poverty_rate"]!="-"]
    .assign(poverty_rate=lambda frame: frame["poverty_rate"].astype("float"))
)

In [7]:
# Mean poverty rate per state. Only the numeric column is aggregated: "City"
# is text, and recent pandas versions raise a TypeError when mean() is asked
# to average a text column.
poverty_state = pb_poverty.groupby("Geographic Area")[["poverty_rate"]].mean()

In [8]:
# Preview the per-state mean poverty rates.
poverty_state.head()

Unnamed: 0_level_0,poverty_rate
Geographic Area,Unnamed: 1_level_1
AK,19.852994
AL,20.647089
AR,22.963216
AZ,25.666441
CA,17.12465


# Visualize poverty rates per state

In [9]:
import plotly.express as px
# Bar chart of the mean poverty rate of every state.
fig = (
    px.bar(x=poverty_state.index, y=poverty_state["poverty_rate"])
    .update_layout(
        title="Poverty Rate Visualization",
        xaxis_title="State",
        yaxis_title="Poverty rate",
        template="plotly_dark",
    )
)
# Last expression of the cell: the returned figure is what the notebook renders.
fig.update_traces(marker_color="mediumseagreen")

Let's visualise the distribution of poverty rates across the country using choropleth maps

In [10]:
# US map with each state coloured by its mean poverty rate.
state_codes = poverty_state.index
mean_poverty = poverty_state["poverty_rate"]
fig = px.choropleth(
    locations=state_codes,
    locationmode="USA-states",
    color=mean_poverty,
    scope="usa",
    template="plotly_dark",
)
fig.show()

# Study of share of Race per state

In [11]:
# Preview the per-city race-share table.
s_race_city.head()

Unnamed: 0,Geographic area,City,share_white,share_black,share_native_american,share_asian,share_hispanic
0,AL,Abanda CDP,67.2,30.2,0.0,0.0,1.6
1,AL,Abbeville city,54.4,41.4,0.1,1.0,3.1
2,AL,Adamsville city,52.3,44.9,0.5,0.3,2.3
3,AL,Addison town,99.1,0.1,0.0,0.1,0.4
4,AL,Akron town,13.2,86.5,0.0,0.0,0.3


In [12]:
# Blank out every "(X)" placeholder cell, then drop the rows that contained one.
is_real_value = s_race_city != "(X)"
s_race_city = s_race_city.where(is_real_value).dropna()
s_race_city.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29248 entries, 0 to 29267
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Geographic area        29248 non-null  object
 1   City                   29248 non-null  object
 2   share_white            29248 non-null  object
 3   share_black            29248 non-null  object
 4   share_native_american  29248 non-null  object
 5   share_asian            29248 non-null  object
 6   share_hispanic         29248 non-null  object
dtypes: object(7)
memory usage: 1.8+ MB


In [13]:
# Every column except the two identifier columns holds a race share stored as text.
share_cols = [col for col in s_race_city.columns if col not in ("Geographic area","City")]
for col in share_cols:
    print(col)

# Convert all share columns in one step; assign() returns a new frame instead
# of writing column by column into a frame that was derived by filtering.
s_race_city = s_race_city.assign(**{col: pd.to_numeric(s_race_city[col]) for col in share_cols})

# Mean share per state. Restricting the aggregation to the share columns keeps
# the text "City" column out of mean(), which recent pandas versions reject.
s_race_state = s_race_city.groupby("Geographic area")[share_cols].mean()

share_white
share_black
share_native_american
share_asian
share_hispanic


In [14]:
# Preview the per-state mean race shares.
s_race_state.head()

Unnamed: 0_level_0,share_white,share_black,share_native_american,share_asian,share_hispanic
Geographic area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AK,45.65,0.56733,45.864773,1.388352,2.149148
AL,72.507266,23.322318,0.659343,0.479758,2.980104
AR,78.449538,16.296858,0.759889,0.477079,4.273013
AZ,59.929047,0.954545,28.5898,0.726608,20.144568
CA,71.866293,2.692018,1.723087,5.568206,29.649868


# Using choropleth maps visualize mean share of race per state

In [15]:
from plotly.subplots import make_subplots
# NOTE(review): main_plot is created here but none of the statements in this cell use it.
main_plot = make_subplots(rows=1, cols=3)

# One US map per race share; all three are built first and then shown in order.
race_maps = []
for share_column, map_title in (
    ("share_white", "USA White"),
    ("share_black", "USA Black"),
    ("share_hispanic", "USA Hispanic"),
):
    race_map = px.choropleth(
        locations=s_race_state.index,
        locationmode="USA-states",
        color=s_race_state[share_column],
        scope="usa",
        template="plotly_dark",
    )
    race_map.update_layout(title=map_title)
    race_maps.append(race_map)

fig1, fig2, fig3 = race_maps
for race_map in race_maps:
    race_map.show()

In [16]:
import plotly.graph_objects as go
def plot_doughnut_chart_by_area(df,city_name):
    '''
    Plots a doughnut chart of the race shares of one city

    Args: df: The dataframe from which data is to be plotted. It needs a "City"
              column, and every column from the third one onwards is plotted
              as one slice.
          city_name: name of city (value of the "City" column)

    Returns: None

    Raises: ValueError: if no row of df has this city name

    Output: Doughnut chart representing the share of each race in that city.
            When several rows carry the same city name, the first one is plotted.
    '''
    area = df[df["City"]==city_name]
    # Fail with a readable message instead of an IndexError from `.values[0]`.
    if area.empty:
        raise ValueError("No row found for city "+repr(city_name))

    fig = go.Figure(data=[go.Pie(labels=area.columns[2:], values=area.iloc[:,2:].values[0], hole=.3)])
    fig.update_layout(title="Race distribution in "+city_name,template="plotly_dark")
    fig.show()
# Race-share doughnut for the row(s) whose City is "Woodson CDP".
plot_doughnut_chart_by_area(s_race_city,"Woodson CDP")

# Analyse police killings

In [17]:
# Use the same state column name as the other tables ("Geographic Area").
p_killing_US = p_killing_US.rename(columns={"state":"Geographic Area"})
p_killing_US.head()

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,Geographic Area,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


In [18]:
# Number of non-null ids per state, drawn as a US map.
groups = p_killing_US.groupby("Geographic Area")
counts = groups["id"].count()
fig3 = px.choropleth(
    locations=counts.index,
    locationmode="USA-states",
    color=counts.values,
    scope="usa",
    template="plotly_dark",
)
# Last expression of the cell: the returned figure is what the notebook renders.
fig3.update_layout(title="Number of Killings in the USA by state")


As clearly seen in terms of the number of killings California pretty much takes a huge majority

# Analyzing Killings per state by race

**Race: Black**

In [19]:
def plot_racial_shootings(df,race_name):
    '''Draws a choropleth map of the number of people of one race killed per state
       args: df: dataframe with "Geographic Area", "race" and "id" columns
             race_name: race code as stored in the "race" column (e.g. "B")
       output: US map colour coded by the number of ids counted for that race
       returns None
    '''
    # Count ids per (state, race) once and keep only the requested race.
    per_state_race = df.groupby(["Geographic Area","race"]).count().id
    selected = per_state_race.xs(race_name, level=1, drop_level=False)
    state_codes = selected.index.get_level_values(0)
    killed = selected.values

    choropleth = px.choropleth(locations=state_codes, locationmode="USA-states", color=killed, scope="usa",template="plotly_dark")

    # Readable race names for the chart title.
    race_labels = {"B":"Black","W":"White","A":"Asian","O":"Others","H":"Hispanic","N":"Native American"}
    choropleth.update_layout(title="Number of "+race_labels[race_name]+" Killings in the USA by state")
    choropleth.show()
# Per-state map for race code "B" (Black).
plot_racial_shootings(p_killing_US,"B")

It is observed that while, in total number of killings, states like Florida, Ohio, Illinois were quite behind, the numbers jump up when specifically observing black shootings

In [20]:
# Per-state map for race code "W" (White).
plot_racial_shootings(p_killing_US,"W")


A lot of white killings seem to have occurred in Texas; the rest of the distribution pretty much corresponds to the total number of killings, since most of the people in the dataset are white

In [21]:
# Per-state map for race code "H" (Hispanic).
plot_racial_shootings(p_killing_US,"H")


Not a lot of data is available for Hispanic people, so we don't get a lot of information

In [22]:
# Per-state map for race code "N" (Native American).
plot_racial_shootings(p_killing_US,"N")


While not a lot of states have data here, most of the Native Americans shot dead were in the states of Arizona and Alaska. Even Wichita has quite a few

In [23]:
# Per-state map for race code "A" (Asian).
plot_racial_shootings(p_killing_US,"A")


Again, not a lot of information. It sure is sad to see, though, that California has the highest number of killings for every race

In [24]:
from plotly.subplots import make_subplots

def show_count_plot(df,rows,cols,start_index=0):
    '''
    This function plots the value counts of the desired features in the data

    args: df: Data to plot
          rows: Number of rows in subplot
          cols: Number of columns in subplot
          start_index: Position of the first column of df to plot; every
                       column from there on gets one subplot

    returns: None

    output: Countplots of required features, filled row by row. Plotting stops
            when either the grid cells or the columns of df run out.
    '''

    s_titles = list(df.columns[start_index:])
    fig = make_subplots(rows=rows,cols=cols,subplot_titles=s_titles)

    # Grid cells in row-major order. zip() ends at the shorter of the two
    # sequences, so no column position past the end of df is ever requested
    # and no hard-coded column limit is needed.
    grid_cells = ((i, j) for i in range(1,rows+1) for j in range(1,cols+1))
    column_positions = range(start_index, len(df.columns))
    for (i, j), k in zip(grid_cells, column_positions):
        # Read from the frame that was passed in, not from a notebook global.
        plot_data = df.iloc[:,k].value_counts()
        col_name = df.columns[k]
        fig.add_trace(
            go.Bar(x=plot_data.index,y=plot_data.values,name=col_name),
            row=i,col=j,
        )
    fig.update_layout(width=1000,height=1000,template="plotly_dark")
    fig.show()
    
# Value counts of every column from position 3 onwards, on a 4 x 3 grid.
show_count_plot(p_killing_US,4,3,start_index=3)

# Analysis #1
How many of them with body_cam were white?

In [25]:
# Rows where body_camera is True.
# NOTE(review): the next cell reassigns `body_cam` to the body_camera == False
# rows, so this selection is only previewed here.
body_cam = p_killing_US[p_killing_US["body_camera"]==True]
body_cam.head()

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,Geographic Area,signs_of_mental_illness,threat_level,flee,body_camera
8,16,Autumn Steele,06/01/15,shot,unarmed,34.0,F,W,Burlington,IA,False,other,Not fleeing,True
14,27,Omarr Julian Maximillian Jackson,07/01/15,shot,gun,37.0,M,B,New Orleans,LA,False,attack,Foot,True
16,32,James Dudley Barker,08/01/15,shot,shovel,42.0,M,W,Salt Lake City,UT,False,attack,Not fleeing,True
18,37,Thomas Hamby,08/01/15,shot,gun,49.0,M,W,Syracuse,UT,False,attack,Not fleeing,True
24,46,John Edward O'Keefe,13/01/15,shot,gun,34.0,M,W,Albuquerque,NM,False,attack,Foot,True


In [26]:
# Overwrites `body_cam`: from here on it holds the rows where body_camera is
# False, despite what the name suggests.
body_cam = p_killing_US[p_killing_US["body_camera"]==False]
body_cam.head()

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,Geographic Area,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


# Conspiracy Mode
The dataset has a column called body_camera which tells us whether a body camera was present on site or not. According to the entire dataset, the number of white people killed is the maximum. Let's analyse only the cases where no body camera was recorded (body_camera is False)

In [27]:
# For each race: rows in `body_cam` divided by rows in the full table.
ser = body_cam["race"].value_counts()
ser_total = p_killing_US["race"].value_counts()
ser_div = ser / ser_total

share_pie = go.Pie(labels=ser_div.index, values=ser_div.values, hole=.3)
fig = go.Figure(data=[share_pie])
fig.update_layout(title="Percentage racial distribution of shootings where the body camera was turned off",template="plotly_dark")
fig.show()

Well what do you know. In terms of percentage out of the total number of people of a particular race, The percentage of other races rises significantly as compared to the white race

In [28]:
def plot_doughnut_composition(df,feature_name,category_name,og_df,title=None):
    '''
    Plots doughnut chart of racial composition fitting a certain criteria

    args: df: input dataframe (left unmodified)
          feature_name: feature being filtered on
          category_name: list of categories to be avoided eg. don't visualise people who were not fleeing
          og_df: original dataframe from which df is selected; its race counts
                 are the denominators of the plotted ratios
          title: chart title. When omitted, the title describes fleeing victims
                 without body camera, so pass a title when filtering on
                 another feature.
    output: Doughnut plot
    returns: None
    '''
    if title is None:
        title = "Percentage racial distribution of shootings where the body camera was turned off and were termed to be fleeing"

    # Keep the rows whose feature value is none of the excluded categories.
    # Missing values are kept by this filter and removed by dropna() below.
    kept = df[~df[feature_name].isin(category_name)]

    # dropna() returns a new frame. Dropping in place would write into a slice
    # of the caller's data (SettingWithCopyWarning) and, with an empty
    # category list, would remove rows from the caller's frame itself.
    kept = kept.dropna()

    # Per race: rows that survived the filter / all rows of that race in og_df.
    race_ratio = kept.race.value_counts()/og_df.race.value_counts()

    fig = go.Figure(data=[go.Pie(labels=race_ratio.index, values=race_ratio.values, hole=.3)])
    fig.update_layout(title=title,template="plotly_dark")
    fig.show()

# Rows of `body_cam` whose flee value is anything but "Not fleeing", as a share of each race's total.
plot_doughnut_composition(body_cam,"flee",["Not fleeing"],p_killing_US)
    



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



It seems like when the body camera was off, percentage of black people shot who were reported to be fleeing was higher. Conspiracy? Let's explore some more

Another view of the same

In [29]:
# Rows of `body_cam` not marked "Not fleeing", with incomplete rows removed.
# dropna() is chained so that it returns a new frame; dropping in place on a
# filtered slice is what triggers pandas' SettingWithCopyWarning.
fleeing = body_cam[body_cam["flee"]!="Not fleeing"].dropna()
# Per race: remaining rows / all rows of that race.
fleeing_race = fleeing.race.value_counts()/p_killing_US.race.value_counts()
print(fleeing_race)
fig2 = px.bar(x=fleeing_race.index,y=fleeing_race.values,template="plotly_dark")
fig2.update_traces(marker_color="mediumseagreen")
fig2.update_layout(title="Fleeing americans without bodycam",xaxis_title="race",yaxis_title="count")
fig2.show()

A    0.179487
B    0.312298
H    0.295508
N    0.193548
O    0.250000
W    0.252290
Name: race, dtype: float64




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



This is interesting since there is no way of finding out if they were actually fleeing. Why does the number of white people drop so suddenly? Does this point to racism in the country? The data sure raises some questions. Let's analyse how many of the people shot when there was no bodycam present were reported to be armed.

In [30]:

# Same chart, now excluding the "undetermined" and "unarmed" values of `armed`.
# NOTE(review): the chart title this call produces still talks about "fleeing".
plot_doughnut_composition(body_cam,"armed",["undetermined","unarmed"],p_killing_US)


Pretty much a similar distribution. Let's checkout how many of them were armed with guns

In [31]:
# Rows of `body_cam` where the victim was armed with a gun, per race, divided
# by all rows of that race.
armed_with_gun = body_cam["armed"]=="gun"
guns = body_cam[armed_with_gun]
guns_race = guns["race"].value_counts()/p_killing_US["race"].value_counts()

guns_pie = go.Pie(labels=guns_race.index, values=guns_race.values, hole=.3)
fig = go.Figure(data=[guns_pie])
fig.update_layout(title="Distribution of gun ownership of the victims",template="plotly_dark")
fig.show()

In [32]:
# The same per-race gun ratios as a bar chart.
fig = (
    px.bar(x=guns_race.index, y=guns_race.values)
    .update_layout(
        title="Distribution of guns across races",
        xaxis_title="Race",
        yaxis_title="Counts",
        template="plotly_dark",
    )
)
# Last expression of the cell: the returned figure is what the notebook renders.
fig.update_traces(marker_color="mediumseagreen")

In this case the number of white americans who were apparently reported to be carrying guns was highest, but not significantly. The number of black americans is just there. Let's checkout the racial difference in the cases with body cam and without body cam at a glance

In [33]:
# Reminder of the columns available in the shootings table.
p_killing_US.head()

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,Geographic Area,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


In [34]:
# Victims not marked "Not fleeing", counted per (body_camera, race).
fleeing = p_killing_US[p_killing_US["flee"]!="Not fleeing"]
groups = fleeing.groupby(["body_camera","race"])

# Race codes used on the x axis (the variable name is historical; it does not
# hold camera flags).
body_camera = ["A","B","H","N","O","W"]

# Reindex onto the full (camera flag x race) grid so that combinations with no
# rows become explicit zeros. Slicing the raw counts by position and patching
# one zero in by hand only works while exactly one known combination is absent.
full_grid = pd.MultiIndex.from_product([[False, True], body_camera], names=["body_camera","race"])
grouped_counts = groups["id"].count().reindex(full_grid, fill_value=0)
arr1 = grouped_counts.values[:len(body_camera)]   # body_camera == False
arr2 = grouped_counts.values[len(body_camera):]   # body_camera == True

fig = go.Figure()

fig.add_trace(go.Bar(
        x = body_camera,
        y = arr1,
        name="No body camera"
))

fig.add_trace(go.Bar(
       x = body_camera,
       y = arr2,
       name= "With body camera"
))

fig.update_layout(barmode="group",title="Racial distribution of people shot without body cam reported to be fleeing",template="plotly_dark")
fig.show()


In [35]:
# Victims whose `armed` value is neither "unarmed" nor "undetermined".
armed = p_killing_US[p_killing_US["armed"]!="unarmed"]
armed = armed[armed["armed"]!="undetermined"]

# Group the armed rows. Grouping `fleeing` here would simply redraw the
# previous chart under an "armed" title.
groups = armed.groupby(["body_camera","race"])

# Race codes used on the x axis (the variable name is historical).
body_camera = ["A","B","H","N","O","W"]

# Reindex onto the full (camera flag x race) grid so that absent combinations
# become explicit zeros instead of shifting the positions of later counts.
full_grid = pd.MultiIndex.from_product([[False, True], body_camera], names=["body_camera","race"])
grouped_counts = groups["id"].count().reindex(full_grid, fill_value=0)
arr1 = grouped_counts.values[:len(body_camera)]   # body_camera == False
arr2 = grouped_counts.values[len(body_camera):]   # body_camera == True

fig = go.Figure()

fig.add_trace(go.Bar(
        x = body_camera,
        y = arr1,
        name="No body camera"
))

fig.add_trace(go.Bar(
       x = body_camera,
       y = arr2,
       name= "With body camera"
))

fig.update_layout(barmode="group",title="Racial distribution of people without body cams reported to be armed",template="plotly_dark")
fig.show()


Let's analyse the threat level

In [36]:
# How many rows fall into each threat_level category.
p_killing_US.threat_level.value_counts()

attack          1611
other            766
undetermined     158
Name: threat_level, dtype: int64

So there are three types of threat levels. Let's check out the distribution of races in the attack level threat

In [37]:
# Rows classified as "attack", per race, divided by all rows of that race.
is_attack = p_killing_US["threat_level"] == "attack"
threat = p_killing_US[is_attack]
threat_percentage = threat["race"].value_counts()/p_killing_US["race"].value_counts()

attack_pie = go.Pie(labels=threat_percentage.index, values=threat_percentage.values, hole=.3)
fig = go.Figure(data=[attack_pie])
fig.update_layout(template="plotly_dark")
fig.show()

White and black victims have the highest share of cases classified as "attack". Next in line, the native americans and the hispanics follow. Here is a bar chart view of the same

In [38]:
# Bar-chart view of the per-race "attack" ratios.
attack_bars = go.Bar(x=threat_percentage.index,y=threat_percentage.values)
fig = go.Figure()
fig.add_trace(attack_bars)
fig.update_layout(
    title="Attack level distribution",
    xaxis_title="Race",
    yaxis_title="Count",
    template="plotly_dark",
)
fig.show()


In [39]:
# Number of ids per (threat_level, race).
groups = p_killing_US.groupby(["threat_level","race"])

race = ["A","B","H","N","O","W"]
threat_levels = ["attack","other","undetermined"]

# Reindex onto the full (threat level x race) grid so that combinations with
# no rows become explicit zeros. Positional slicing plus a hand-inserted zero
# only works while exactly one known combination is absent.
full_grid = pd.MultiIndex.from_product([threat_levels, race], names=["threat_level","race"])
grouped_counts = groups["id"].count().reindex(full_grid, fill_value=0)

# One row of counts per threat level, in the order of `threat_levels`.
arr1, arr2, arr3 = grouped_counts.values.reshape(len(threat_levels), len(race))

fig = go.Figure()
for level_name, level_counts in zip(threat_levels, (arr1, arr2, arr3)):
    fig.add_trace(go.Bar(
           x = race,
           y = level_counts,
           name = level_name
    ))

# The title names what is drawn: threat levels per race over all rows.
fig.update_layout(barmode="group",title="Racial distribution of threat levels",xaxis_title="race",yaxis_title="count",template="plotly_dark")
fig.show()


In [40]:
# Same grouping and counts as the previous cell, recomputed; this cell does
# not display or plot the result.
groups = p_killing_US.groupby(["threat_level","race"])
grouped_counts = groups.count().dropna()["id"]


Let's check which state the maximum attack level victims belonged to

In [41]:
# Count only the rows classified as "attack": both the text introducing this
# chart and its title are about attack-level cases, not all rows.
is_attack = p_killing_US["threat_level"] == "attack"
attack_by_state = p_killing_US.loc[is_attack, "Geographic Area"].value_counts()

fig = px.bar(x=attack_by_state.index,y=attack_by_state.values)

# Last expression of the cell: the returned figure is what the notebook renders.
fig.update_layout(title="Distribution of attackers according to state",xaxis_title="State",
                  yaxis_title="Number of killings",template="plotly_dark")

Turns out most of the victims were shot in CA -> California. Let's analyse statewise racial distribution for the top 3 states: California, Texas and Florida

In [42]:
def plot_statewise_bar(df,state_name):
    '''Draws a bar plot of the number of rows per race for one state

    args: df: Dataframe from which data is to be plotted; needs
              "Geographic Area" and "race" columns
          state_name: value of "Geographic Area" to keep (e.g. "CA")

    output: barplot of racial distribution

    returns None
    '''
    in_state = df["Geographic Area"]==state_name
    race_counts = df[in_state].race.value_counts()
    fig = px.bar(x=race_counts.index,y=race_counts.values,template="plotly_dark")
    fig.update_layout(title=state_name+" killings racial distribution",xaxis_title="Race",yaxis_title="Count")
    fig.show()
# Race counts for rows whose state code is "CA".
plot_statewise_bar(p_killing_US,"CA")

In [43]:
# Race counts for rows whose state code is "TX".
plot_statewise_bar(p_killing_US,"TX")

In [44]:
# Race counts for rows whose state code is "FL".
plot_statewise_bar(p_killing_US,"FL")

As it turns out number of white people killed seems to be higher in two out of the top three states

# High School Dataset

In [45]:
# Column dtypes and non-null counts of the high-school completion table.
P_HighSchool.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29329 entries, 0 to 29328
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Geographic Area       29329 non-null  object
 1   City                  29329 non-null  object
 2   percent_completed_hs  29329 non-null  object
dtypes: object(3)
memory usage: 687.5+ KB


In [46]:
# Drop the rows whose percentage is the "-" placeholder. .copy() makes the
# result an independent frame, so the column assignment that follows does not
# write into a slice of the original (SettingWithCopyWarning).
P_HighSchool = P_HighSchool[P_HighSchool.percent_completed_hs!="-"].copy()

In [47]:
# Turn the text percentages into numbers.
P_HighSchool["percent_completed_hs"] = pd.to_numeric(P_HighSchool["percent_completed_hs"])

In [48]:
### Again let's go back to choropleth maps
grouped_state = P_HighSchool.groupby("Geographic Area")
# Average only the numeric column: "City" is text, and recent pandas versions
# raise a TypeError when mean() is asked to average a text column.
hs_state = grouped_state[["percent_completed_hs"]].mean()

In [49]:
# US map with each state coloured by its mean high-school completion percentage.
completion_by_state = hs_state["percent_completed_hs"]
fig1 = px.choropleth(
    locations=hs_state.index,
    locationmode="USA-states",
    color=completion_by_state,
    scope="usa",
    template="plotly_dark",
)
fig1.show()

As it turns out in the northern states like North Dakota the percentage of people who have completed high school  seems to be maximum as compared to the southern states like Texas

In [50]:
##### Composition #####

# Analyze median household income dataset

In [51]:
# Preview the median household income table.
M_2015.head()

Unnamed: 0,Geographic Area,City,Median Income
0,AL,Abanda CDP,11207
1,AL,Abbeville city,25615
2,AL,Adamsville city,42575
3,AL,Addison town,37083
4,AL,Akron town,21667


In [52]:
# Drop the "(X)" placeholder rows and any row with a missing value. dropna()
# is chained so that it returns a new frame; dropping in place on a filtered
# slice is the pattern behind pandas' SettingWithCopyWarning.
M_2015 = M_2015[M_2015["Median Income"]!="(X)"].dropna()



In [53]:
# Strip the trailing "-" / "+" markers and the thousands separators from the
# income strings (e.g. "250,000+" becomes "250000"). The vectorised .str
# methods replace a row-by-row Python loop, and assign() returns a new frame
# instead of writing a column into a frame that was derived by filtering.
income_text = M_2015["Median Income"]
income_clean = (
    income_text
    .str.rstrip("-")
    .str.rstrip("+")
    .str.replace(",", "", regex=False)
)
M_2015 = M_2015.assign(**{"Median Income": income_clean})

In [54]:
# Convert the cleaned income strings to numbers (assign() returns a new frame
# rather than writing into a possibly-sliced one).
M_2015 = M_2015.assign(**{"Median Income": pd.to_numeric(M_2015["Median Income"])})
grouped_count = M_2015.groupby("Geographic Area")
# Average only the numeric column: "City" is text, and recent pandas versions
# raise a TypeError when mean() is asked to average a text column.
M_2015_state = grouped_count[["Median Income"]].mean()

In [55]:
# Preview the per-state mean of the cities' median incomes.
M_2015_state.head()

Unnamed: 0_level_0,Median Income
Geographic Area,Unnamed: 1_level_1
AK,52099.594406
AL,40577.309524
AR,35593.408915
AZ,42388.439678
CA,62608.440177


In [56]:
# US map with each state coloured by the mean of its cities' median incomes.
income_by_state = M_2015_state["Median Income"]
fig1 = px.choropleth(
    locations=M_2015_state.index,
    locationmode="USA-states",
    color=income_by_state,
    scope="usa",
    template="plotly_dark",
)
fig1.show()

States like Maryland and New Jersey seem to be on the top as compared to states like Arizona which lie at the bottom in case of mean Median Income

# Analyze datasets together

In [57]:
# Join the four per-city tables (poverty, race share, high school, income) on
# city and state. The race table spells its state column "Geographic area",
# so it is renamed first to match the others.
s_race_city = s_race_city.rename(columns={"Geographic area":"Geographic Area"})

merge_keys = ["City","Geographic Area"]
temp1 = pb_poverty.merge(s_race_city,on=merge_keys)
temp2 = temp1.merge(P_HighSchool,on=merge_keys)
# The city column is renamed to "city", the name used by the shootings table.
temp3 = temp2.merge(M_2015,on=merge_keys).rename(columns={"City":"city"})

p_killing_US = p_killing_US.rename(columns={"state":"Geographic Area"})
temp3.head()

Unnamed: 0,Geographic Area,city,poverty_rate,share_white,share_black,share_native_american,share_asian,share_hispanic,percent_completed_hs,Median Income
0,AL,Abanda CDP,78.8,67.2,30.2,0.0,0.0,1.6,21.2,11207.0
1,AL,Abbeville city,29.1,54.4,41.4,0.1,1.0,3.1,69.1,25615.0
2,AL,Adamsville city,25.5,52.3,44.9,0.5,0.3,2.3,78.9,42575.0
3,AL,Addison town,30.7,99.1,0.1,0.0,0.1,0.4,81.4,37083.0
4,AL,Akron town,42.0,13.2,86.5,0.0,0.0,0.3,68.6,21667.0


Let's analyse the relations between population by race and poverty rates

In [58]:
### poverty_rate against share_white: histogram first, scatter second ###
plot_args = dict(x="poverty_rate",y="share_white",height=600,width=900,template="plotly_dark")

fig1 = px.histogram(temp3,**plot_args)
fig1.update_layout(title="Share of white people vs the poverty rate")
fig1.update_traces(marker_color="mediumseagreen")

fig2 = px.scatter(temp3,**plot_args)

for figure in (fig1, fig2):
    figure.show()

In [59]:
### poverty_rate against share_black: histogram first, scatter second ###
plot_args = dict(x="poverty_rate",y="share_black",height=600,width=900,template="plotly_dark")

fig1 = px.histogram(temp3,**plot_args)
fig1.update_layout(title="Share of black people vs the poverty rate")
fig1.update_traces(marker_color="mediumseagreen")

fig2 = px.scatter(temp3,**plot_args)

for figure in (fig1, fig2):
    figure.show()

In [60]:
### poverty_rate against share_hispanic: histogram first, scatter second ###
plot_args = dict(x="poverty_rate",y="share_hispanic",height=600,width=900,template="plotly_dark")

fig1 = px.histogram(temp3,**plot_args)
fig1.update_layout(title="Share of hispanic people vs the poverty rate")
fig1.update_traces(marker_color="mediumseagreen")

fig2 = px.scatter(temp3,**plot_args)

for figure in (fig1, fig2):
    figure.show()

In [61]:
### poverty_rate against share_native_american: histogram first, scatter second ###
plot_args = dict(x="poverty_rate",y="share_native_american",height=600,width=900,template="plotly_dark")
chart_title = "Share of Native American people vs the poverty rate"

fig1 = px.histogram(temp3,**plot_args)
fig1.update_layout(title=chart_title)
fig1.update_traces(marker_color="mediumseagreen")

fig2 = px.scatter(temp3,**plot_args)
fig2.update_layout(title=chart_title)

for figure in (fig1, fig2):
    figure.show()

In [62]:
### poverty_rate against share_asian: histogram first, scatter second ###
plot_args = dict(x="poverty_rate",y="share_asian",height=600,width=900,template="plotly_dark")

fig1 = px.histogram(temp3,**plot_args)
fig1.update_layout(title="Share of Asian people vs the poverty rate")
fig1.update_traces(marker_color="mediumseagreen")

fig2 = px.scatter(temp3,**plot_args)

for figure in (fig1, fig2):
    figure.show()

As seen clearly from the above histogram plots, Interesting thing to observe is except for the Asian population, the maximum share in the other races exists between a poverty rate of 0.3-0.7

Now let's see how poverty rates affect education. The relation between poverty rate and percentage of people who have completed high school in that city

In [63]:
# One point per city: poverty rate against high-school completion percentage.
fig = px.scatter(
    temp3,
    x="poverty_rate",
    y="percent_completed_hs",
    template="plotly_dark",
).update_traces(marker_color="mediumseagreen")
fig.show()

Unexpected as it may seem,more number of people in the mid range neighbourhoods seem to finish high school than the number of people in the richer areas

In [64]:
# One point per city: median income against high-school completion percentage.
fig = px.scatter(
    temp3,
    x="Median Income",
    y="percent_completed_hs",
    template="plotly_dark",
).update_traces(marker_color="mediumseagreen")
fig.show()

At the same time more people living in areas with higher median income seem to finish high school

In [65]:
# One point per city: poverty rate against median income.
fig = px.scatter(
    temp3,
    x="poverty_rate",
    y="Median Income",
    template="plotly_dark",
).update_traces(marker_color="mediumseagreen")
fig.show()

Rightly so Median Income and poverty rate seem to be inversely proportional

In [66]:
# Reminder of the shootings table before combining it with the per-state means.
p_killing_US.head()

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,Geographic Area,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


In [67]:
def plot_feature_vs_num_killed(df,feature,features_df=None):
    '''Plots a feature mean per state vs number of people killed per state
        args: df: shootings dataframe with "Geographic Area" and "id" columns
             feature: Name of the numeric feature column to average per state
             features_df: per-city dataframe holding `feature` and a
                          "Geographic Area" column. Defaults to the notebook's
                          merged table `temp3`.
        output: scatterplot of feature vs num killed, one point per state
        returns: None
    '''
    if features_df is None:
        # Fall back to the merged per-city table built earlier in the notebook.
        features_df = temp3

    # Number of non-null ids per state.
    killed_per_state = df.groupby("Geographic Area")["id"].count()
    # Average only the requested column; averaging the whole frame would also
    # hit the text "city" column, which recent pandas versions reject.
    feature_mean = features_df.groupby("Geographic Area")[feature].mean()

    df_mean = pd.concat([feature_mean,killed_per_state],axis=1)
    df_mean = df_mean.rename(columns={"id":"Number of people killed"})

    fig = px.scatter(x=df_mean[feature],y=df_mean["Number of people killed"],color=df_mean.index,template="plotly_dark")
    # The title names the feature that is actually plotted.
    fig.update_layout(title="Mean "+feature+" per state by number of people killed per state",xaxis_title=feature,yaxis_title="Num Killed")
    fig.show()
# Per-state mean poverty_rate against the number of people killed in that state.
plot_feature_vs_num_killed(p_killing_US,"poverty_rate")

I don't see much of a relation between the poverty rate of a state and the number of people killed in that state

In [68]:
# Per-state mean percent_completed_hs against the number of people killed in that state.
plot_feature_vs_num_killed(p_killing_US,"percent_completed_hs")

Still not much of a relation, except for the fact that most of the states seem to have a high percentage of people finishing high school

In [69]:
# Per-state mean "Median Income" against the number of people killed in that state.
plot_feature_vs_num_killed(p_killing_US,"Median Income")

Most of the people shot are still in the mid range. As seen, there does not seem to be much of a relation between the number of people killed and poverty_rate, median income and percentage of people completing high school