In [73]:
#Import libraries
import numpy as np
import pandas as pd
import seaborn as sn
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
import missingno as msno
import plotly.express as px

In [74]:
#Load data
df = pd.read_csv('data/df_incidents_poverty_house_data_preparation_final.csv')

In [75]:
#View columns
df.columns

Index(['state', 'city_or_county', 'latitude', 'longitude',
       'avg_age_participants', 'datetime', 'males_ratio',
       'state_congressional_district', 'povertyPercentage', 'killed_ratio',
       'injured_ratio', 'unharmed_ratio', 'candidate_winning_ratio',
       'males_in_month_ratio', 'killed_in_month_ratio',
       'injured_in_month_ratio', 'unharmed_in_month_ratio'],
      dtype='object')

In [76]:
#Describe data
df.describe()

Unnamed: 0,latitude,longitude,avg_age_participants,males_ratio,povertyPercentage,killed_ratio,injured_ratio,unharmed_ratio,candidate_winning_ratio,males_in_month_ratio,killed_in_month_ratio,injured_in_month_ratio,unharmed_in_month_ratio
count,16651.0,16651.0,16651.0,16651.0,16651.0,16651.0,16651.0,16651.0,16651.0,16651.0,16651.0,16651.0,16651.0
mean,37.578831,-89.011474,30.535704,0.776875,12.625656,0.185624,0.363682,0.056959,0.709851,0.064896,0.058707,0.062587,0.030713
std,4.65209,12.010334,10.740324,0.391515,2.870646,0.348589,0.444429,0.160943,0.119773,0.064326,0.125588,0.111554,0.097688
min,25.7648,-124.224,5.0,0.0,7.1,0.0,0.0,0.0,0.41284,0.0,0.0,0.0,0.0
25%,33.7411,-90.3648,22.0,0.666667,10.8,0.0,0.0,0.0,0.61927,0.015625,0.0,0.0,0.0
50%,38.8002,-87.6148,28.0,1.0,12.2,0.0,0.0,0.0,0.69754,0.047619,0.0,0.0,0.0
75%,41.6807,-81.9936,37.0,1.0,14.2,0.25,1.0,0.0,0.786679,0.090909,0.04,0.083333,0.0
max,48.4929,-68.7986,67.0,1.0,20.8,1.0,1.0,0.666667,1.0,0.333333,0.5,0.5,0.5


In [77]:
# Convert 'datetime' column to datetime format
df['datetime'] = pd.to_datetime(df['datetime'])

In [78]:
#filter incidents from 2014 to 2017
filtered_df = df[df['datetime'].dt.year.isin([2014, 2015, 2016, 2017])].copy()

In [79]:
#Calculate incident severity score
killed_weight = 1.5
injured_weight = 1.0
unharmed_weight = 0.5
age_weight = 0.1
poverty_weight = 0.2
demographic_factor = 0.05

# Calculating the score for each incident in the dataframe
filtered_df.loc[:, 'score'] = ((filtered_df['killed_ratio'] * killed_weight) + 
                               (filtered_df['injured_ratio'] * injured_weight) + 
                               (filtered_df['unharmed_ratio'] * unharmed_weight)) * \
                               (1 + demographic_factor * (filtered_df['avg_age_participants'] * age_weight + 
                                                          filtered_df['povertyPercentage'] * poverty_weight))


In [80]:
# Group by city and week of the year, then count the incidents
# The week is extracted using isocalendar().week
weekly_scores = filtered_df.groupby(['city_or_county', filtered_df['datetime'].dt.isocalendar().week])['score'].sum()

In [81]:
# Convert the series to a DataFrame and reset the index
weekly_scores_df = weekly_scores.to_frame(name='weekly_score').reset_index()

In [82]:
# Calculate the total number of weeks in 4 years
total_weeks = 52 * 4

In [83]:
# Filter out cities with a low number of incident weeks
# Keep only cities with number of weeks with incidents > 15% of total weeks
valid_cities = weekly_scores_df.groupby('city_or_county').filter(lambda x: len(x) > total_weeks * 0.15)

In [84]:
# Display the first few rows of the resulting DataFrame
print(valid_cities.head())

    city_or_county  week  weekly_score
249        Atlanta     1       3.02700
250        Atlanta     2       3.40750
251        Atlanta     4       1.24800
252        Atlanta     5       3.49975
253        Atlanta    10       3.91500
