In [97]:
#Import libraries
import numpy as np
import pandas as pd
import seaborn as sn
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
import missingno as msno
import plotly.express as px

In [98]:
#Load data
df = pd.read_csv('data/df_incidents_poverty_house_data_preparation_final.csv')

In [99]:
#View columns
df.columns

Index(['state', 'city_or_county', 'latitude', 'longitude',
       'avg_age_participants', 'datetime', 'males_ratio',
       'state_congressional_district', 'povertyPercentage', 'killed_ratio',
       'injured_ratio', 'unharmed_ratio', 'candidate_winning_ratio',
       'males_in_month_ratio', 'killed_in_month_ratio',
       'injured_in_month_ratio', 'unharmed_in_month_ratio'],
      dtype='object')

In [100]:
#Describe data
df.describe()

Unnamed: 0,latitude,longitude,avg_age_participants,males_ratio,povertyPercentage,killed_ratio,injured_ratio,unharmed_ratio,candidate_winning_ratio,males_in_month_ratio,killed_in_month_ratio,injured_in_month_ratio,unharmed_in_month_ratio
count,16651.0,16651.0,16651.0,16651.0,16651.0,16651.0,16651.0,16651.0,16651.0,16651.0,16651.0,16651.0,16651.0
mean,37.578831,-89.011474,30.535704,0.776875,12.625656,0.185624,0.363682,0.056959,0.709851,0.064896,0.058707,0.062587,0.030713
std,4.65209,12.010334,10.740324,0.391515,2.870646,0.348589,0.444429,0.160943,0.119773,0.064326,0.125588,0.111554,0.097688
min,25.7648,-124.224,5.0,0.0,7.1,0.0,0.0,0.0,0.41284,0.0,0.0,0.0,0.0
25%,33.7411,-90.3648,22.0,0.666667,10.8,0.0,0.0,0.0,0.61927,0.015625,0.0,0.0,0.0
50%,38.8002,-87.6148,28.0,1.0,12.2,0.0,0.0,0.0,0.69754,0.047619,0.0,0.0,0.0
75%,41.6807,-81.9936,37.0,1.0,14.2,0.25,1.0,0.0,0.786679,0.090909,0.04,0.083333,0.0
max,48.4929,-68.7986,67.0,1.0,20.8,1.0,1.0,0.666667,1.0,0.333333,0.5,0.5,0.5


In [101]:
# Convert 'datetime' column to datetime format
df['datetime'] = pd.to_datetime(df['datetime'])

In [102]:
#filter incidents from 2014 to 2017
filtered_df = df[df['datetime'].dt.year.isin([2014, 2015, 2016, 2017])].copy()

In [103]:

# Calculating the score for each incident in the dataframe
filtered_df.loc[:, 'score'] = (filtered_df['killed_ratio'] * 1.5) + (filtered_df['injured_ratio'] * 1.0) + (filtered_df['unharmed_ratio'] * 0.5)
                            


In [104]:
# Group by city and week of the year, then count the incidents
# The week is extracted using isocalendar().week
weekly_scores = filtered_df.groupby(['city_or_county', filtered_df['datetime'].dt.isocalendar().week])['score'].sum()

In [105]:
# Convert the series to a DataFrame and reset the index
weekly_scores_df = weekly_scores.to_frame(name='weekly_score').reset_index()

In [106]:
weekly_scores_df.head()

Unnamed: 0,city_or_county,week,weekly_score
0,Abbeville,2,1.0
1,Aberdeen,19,0.5
2,Abilene,8,0.0
3,Abilene,9,0.0
4,Abilene,10,0.0


In [107]:
print(len(weekly_scores_df))

7629


In [108]:
# Calculate the total number of weeks in 4 years
total_weeks = 52 * 4

In [109]:
# Filter out cities with a low number of incident weeks
# Keep only cities with number of weeks with incidents > 15% of total weeks
valid_cities = weekly_scores_df.groupby('city_or_county').filter(lambda x: len(x) > total_weeks * 0.15)

In [110]:
print(len(valid_cities))

2160


In [111]:
# Display the first few rows of the resulting DataFrame
print(valid_cities.head())

    city_or_county  week  weekly_score
249        Atlanta     1          2.25
250        Atlanta     2          2.75
251        Atlanta     4          1.00
252        Atlanta     5          2.75
253        Atlanta    10          3.00


In [112]:
valid_cities.describe()

Unnamed: 0,week,weekly_score
count,2160.0,2160.0
mean,26.652315,3.211786
std,15.204822,5.014121
min,1.0,0.0
25%,14.0,1.0
50%,27.0,2.0
75%,40.0,3.5
max,53.0,48.75


In [113]:
#Print the columns in the dataframe
valid_cities.columns

Index(['city_or_county', 'week', 'weekly_score'], dtype='object')

In [114]:
#Plot the time series of weekly scores for each city
fig = px.line(valid_cities, x='week', y='weekly_score', color='city_or_county', title='Weekly Scores for Each City')
fig.show()

In [115]:
#Transform  the data and plot the time series of weekly scores for each city
fig = px.line(valid_cities, x='week', y='weekly_score', color='city_or_county', title='Weekly Scores for Each City', log_y=True)
fig.show()

In [116]:
#Plot for two cities
fig = px.line(valid_cities[valid_cities['city_or_county'].isin(['Dayton', 'Atlanta'])], x='week', y='weekly_score', color='city_or_county', title='Weekly Scores for Each City', log_y=True)
fig.show()

KeyError: 'city'