## Environment Set Up 

In [2]:
### IMPORT REQUIRED LIBRARIES ###

import re 
from datetime import datetime 

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
# import seaborn as sns 

# Set option for pd dataframes to show all columns when called 
pd.set_option("display.max_columns", None)

## Load and pre-process data

In [22]:
# PROTESTS DATA 

# Read data 
protests_df = pd.read_csv("ccc_compiled.csv",encoding="unicode_escape") 

# Mutate column types to requrement 
protests_df["date"] = pd.to_datetime(protests_df["date"])
protests_df["month_year"] = pd.to_datetime(protests_df['date']).dt.to_period('M').dt.to_timestamp()
protests_df["month_year"] = pd.to_datetime(protests_df["month_year"])

## Filter dataset to get only required events
# After May 25th, size_cat>1, issues contains ("policing" | "racisim"), online != 1

# Keeping data collected between death date and August 2020
protests_df_filtered = protests_df[(protests_df["date"] > datetime(2020, 5, 24)) & (protests_df["date"] < datetime(2020, 9, 1))]

# Filter for non-online issues only 
protests_df_filtered = protests_df_filtered[protests_df_filtered["online"] != 1]

# Filter for issues where the issues included "policing" or "racism"
protests_df_filtered = protests_df_filtered[protests_df_filtered["issues"].str.contains("policing|racism", case=True, na=False)]

# Filter where size_cat >= 1; size_cat key: 0 = unknown; 1 = 1-99; 2 = 100-999; 3 = 1,000-9,999; 4 = 10,000+ 
protests_df_filtered = protests_df_filtered[protests_df_filtered["size_cat"] >= 1]
print(len(protests_df_filtered))

# Drop unrequired data. Check protests_df object for full data 
protests_df_filtered = protests_df_filtered.drop(protests_df_filtered.iloc[:, 6:13], axis = 1)
protests_df_filtered = protests_df_filtered.drop(['size_text'], axis = 1)
protests_df_filtered = protests_df_filtered.drop(protests_df_filtered.iloc[:, 13:53], axis = 1)

  exec(code_obj, self.user_global_ns, self.user_ns)


6393


In [23]:
# COUNTY POPULATION DATA 

# Read data 
county_pop_df = pd.read_csv(
    "population_by_county/DECENNIALPL2020.P1_data_with_overlays_2021-11-21T135424.csv", 
    header = 1, 
) 

# Remove unrequried fields - remove population by race. Keep total only. 
county_pop_df_filtered = county_pop_df.drop(county_pop_df.iloc[:,3:], axis = 1)

# Remame columns 
county_pop_df_filtered = county_pop_df_filtered.set_axis(['fips_code', 'county', 'population'], axis = 1)

# Mutate fips_code column for join with protests dataset 
county_pop_df_filtered["fips_code"] = county_pop_df_filtered["fips_code"].apply(lambda x: float(x[9:]))

In [24]:
# JOIN PROTESTS AND COUNTY POPULATION DATA 

df = pd.merge(protests_df_filtered, county_pop_df_filtered, on='fips_code', how='left')

# Check if merge worked as expected 
print(len(df) == len(protests_df_filtered))

True


In [31]:
#Create a field for protest size relative to county population. 

df["perc_cnty_pop"] = df["size_mean"] / df["population"] * 100

In [33]:
#Export CSV for GIS work.

df.to_csv("protests.csv")

In [137]:
print(len(df))
df.head()

6393


Unnamed: 0,date,locality,state,location,location_detail,county_x,claims,valence,issues,size_low,size_high,size_mean,size_cat,lat,lon,resolved_locality,resolved_county,resolved_state,fips_code,month_year,county_y,population,perc_cnty_pop
0,2020-05-26,Chicago,IL,"Chicago, IL",Chicago Public Safety Headquarters,Cook,antiracism; against police brutality; for just...,1.0,policing; racism,50.0,70.0,60.0,1,41.878114,-87.629798,Chicago,Cook County,IL,17031.0,2020-05-01,"Cook County, Illinois",5275541.0,0.001137
1,2020-05-26,Hammond,IN,"Hammond, IN",Hammond Police Department,Lake,antiracism; against police brutality,1.0,policing; racism,60.0,80.0,70.0,1,41.583369,-87.500041,Hammond,Lake County,IN,18089.0,2020-05-01,"Lake County, Indiana",498700.0,0.014036
2,2020-05-26,Ann Arbor,MI,"Ann Arbor, MI",intersection near Washtenaw County Sheriff's O...,Washtenaw,"for police accountability, for racial justice",0.0,policing; racism,100.0,100.0,100.0,2,42.280826,-83.743038,Ann Arbor,Washtenaw County,MI,26161.0,2020-05-01,"Washtenaw County, Michigan",372258.0,0.026863
3,2020-05-26,Minneapolis,MN,"Minneapolis, MN",E 38th St and Chicago Ave; Minneapolis Police ...,Hennepin,antiracism; against police brutality,1.0,policing; racism,3000.0,3000.0,3000.0,3,44.977753,-93.265011,Minneapolis,Hennepin County,MN,27053.0,2020-05-01,"Hennepin County, Minnesota",1281565.0,0.234089
4,2020-05-26,Minneapolis,MN,"Minneapolis, MN",Minneapolis Police Department Third Precinct S...,Hennepin,antiracism; against police brutality,1.0,policing; racism,300.0,300.0,300.0,2,44.977753,-93.265011,Minneapolis,Hennepin County,MN,27053.0,2020-05-01,"Hennepin County, Minnesota",1281565.0,0.023409


In [164]:
#Need to create a country average protesters per day relative to population. 

by_county = df[["fips_code", "size_mean","population"]]

by_county = by_county.groupby(["fips_code","population"],as_index=False).sum()

by_county.rename(columns={"size_mean":"sum_daily_bodies"},inplace=True)

by_county["avg_daily_protesters_ratio"] = by_county["sum_daily_bodies"] / 61 / by_county["population"] 

#need to fix fips_code so that I can join with my county shapefiles.

by_county["fips_code"]= by_county["fips_code"].apply((lambda x: str(int(x))))

by_county.head()

Unnamed: 0,fips_code,population,sum_daily_bodies,avg_daily_protesters_ratio
0,1015,116441.0,24.0,3e-06
1,1023,12665.0,60.0,7.8e-05
2,1039,37570.0,3.0,1e-06
3,1043,87866.0,30.0,6e-06
4,1049,71608.0,40.0,9e-06


In [165]:
#Export CSV for GIS work.

by_county.to_csv("by_county.csv")