### Imports

In [3]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

import collections
from scipy.stats import pearsonr

### Load Dataset

In [5]:
def convert_dtype(x):
    if not x:
        return ''
    try:
        return float(x)   
    except:        
        return ''

In [7]:
incidents_df = pd.read_csv('data/incidents.csv', converters={'n_participants_child': convert_dtype,
                                                             'n_participants_teen': convert_dtype,
                                                             'n_participants_adult': convert_dtype})
incidents_df.head()

Unnamed: 0,date,state,city_or_county,address,latitude,longitude,congressional_district,state_house_district,state_senate_district,participant_age1,...,n_males,n_females,n_killed,n_injured,n_arrested,n_unharmed,n_participants,notes,incident_characteristics1,incident_characteristics2
0,2015-05-02,Indiana,Indianapolis,Lafayette Road and Pike Plaza,39.8322,-86.2492,7.0,94.0,33.0,19.0,...,1.0,0.0,0,1,0.0,0.0,1.0,Teen wounded while walking - Security guard at...,Shot - Wounded/Injured,
1,2017-04-03,Pennsylvania,Kane,5647 US 6,41.6645,-78.7856,5.0,,,62.0,...,1.0,0.0,1,0,0.0,0.0,1.0,shot self after accident,"Shot - Dead (murder, accidental, suicide)",Suicide^
2,2016-11-05,Michigan,Detroit,6200 Block of East McNichols Road,42.419,-83.0393,14.0,4.0,2.0,,...,,,0,1,0.0,1.0,2.0,1 inj.,Shot - Wounded/Injured,
3,2016-10-15,District of Columbia,Washington,"1000 block of Bladensburg Road, NE",38.903,-76.982,1.0,,,,...,1.0,0.0,0,1,0.0,0.0,2.0,,Shot - Wounded/Injured,
4,2030-06-14,Pennsylvania,Pittsburgh,California and Marshall Avenues,40.4621,-80.0308,14.0,,,,...,1.0,0.0,0,1,0.0,1.0,2.0,,Shot - Wounded/Injured,"Drive-by (car to street, car to car)"


In [8]:
poverty_df = pd.read_csv('data/povertyByStateYear.csv')
poverty_df.head()

Unnamed: 0,state,year,povertyPercentage
0,United States,2020,11.5
1,Alabama,2020,14.8
2,Alaska,2020,11.5
3,Arizona,2020,12.1
4,Arkansas,2020,15.8


In [9]:
district_house_df = pd.read_csv('data/year_state_district_house.csv')
district_house_df.head()

Unnamed: 0,year,state,congressional_district,party,candidatevotes,totalvotes
0,1976,ALABAMA,1,REPUBLICAN,98257,157170
1,1976,ALABAMA,2,REPUBLICAN,90069,156362
2,1976,ALABAMA,3,DEMOCRAT,106935,108048
3,1976,ALABAMA,4,DEMOCRAT,141490,176022
4,1976,ALABAMA,5,DEMOCRAT,113553,113560


### Data Integration

In [10]:
joined_df = incidents_df.copy()

In [11]:
joined_df.insert(1, "year", [int(joined_df["date"][i][0:4]) for i in range(joined_df.shape[0])])

In [12]:
joined_df.insert(29, "povertyPercentage", [poverty_df.loc[(poverty_df["year"] == joined_df.loc[i, "year"]) &
                                                          (poverty_df["state"] == joined_df.loc[i, "state"]),
                                                          "povertyPercentage"].values
                                                          for i in range(joined_df.shape[0])])


In [13]:
joined_df.insert(30, "party", [district_house_df.loc[(district_house_df["year"] == joined_df.loc[i, "year"]) &
                                                     (district_house_df["state"] == joined_df.loc[i, "state"]) &
                                                     (district_house_df["congressional_district"] == joined_df.loc[i, "congressional_district"]),
                                                     "party"].values
                                                     for i in range(joined_df.shape[0])])

In [15]:
joined_df.insert(31, "candidatevotes", [district_house_df.loc[(district_house_df["year"] == joined_df.loc[i, "year"]) &
                                                              (district_house_df["state"] == joined_df.loc[i, "state"]) &
                                                              (district_house_df["congressional_district"] == joined_df.loc[i, "congressional_district"]),
                                                              "candidatevotes"].values
                                                              for i in range(joined_df.shape[0])])

In [16]:
joined_df.insert(32, "totalvotes", [district_house_df.loc[(district_house_df["year"] == joined_df.loc[i, "year"]) &
                                                          (district_house_df["state"] == joined_df.loc[i, "state"]) &
                                                          (district_house_df["congressional_district"] == joined_df.loc[i, "congressional_district"]),
                                                          "totalvotes"].values
                                                          for i in range(joined_df.shape[0])])

In [17]:
joined_df.to_csv("data/joined_dataset.csv")

## 1. Data Understanding

### Dataset description

1. **date**: date of incident occurrence
2. **year**: year of incudent occurrence
3. **state**: state where incident took place
4. **city_or_county**: city or county where incident took place 
5. **address**: address where incident took place
6. **latitude**: latitude of the incident
7. **longitude**: longitude of the incident
8. **congressional_district**: congressional district where the incident took place 
9. **state_house_district**: state house district
10. **state_senate_district**: state senate district where the incident took place
11. **participant_age1**: exact age of one (randomly chosen) participant in the incident
12. **participant_age_group1**: exact age group of one (randomly chosen) participant in the incident
13. **participant_gender1**: exact gender of one (randomly chosen) participant in the incident
14. **min_age_participants**: minimum age of the participants in the incident
15. **avg_age_participants**: average age of the participants in the incident
16. **max_age_participants**: maximum age of the participants in the incident
17. **n_participants_child**: number of child participants 0-11
18. **n_participants_teen**: number of teen participants 12-17
19. **n_participants_adult**: number of adult participants (18 +)
20. **n_males**: number of males participants
21. **n_females**: number of females participants
22. **n_killed**: number of people killed
23. **n_injured**: number of people injured
24. **n_arrested**: number of arrested participants
25. **n_unharmed**: number of unharmed participants
26. **n_participants**: number of participants in the incident
27. **notes**:
28. **incident_characteristics1**:
29. **incident_characteristics1**:
30. **povertyPercentage**: poverty percentage for the corresponding state and year
31. **party**: winning party fort the corresponding congressional_district in the state, in the corresponding year
32. **candidateVotes**: number of votes obtained by the winning party in the corresponding election
33. **totalVotes**: number total votes for the corresponding election

### Data quality assessment

In [18]:
joined_df = joined_df.replace('', np.NaN)

In [19]:
joined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239677 entries, 0 to 239676
Data columns (total 33 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   date                       239677 non-null  object 
 1   year                       239677 non-null  int64  
 2   state                      239677 non-null  object 
 3   city_or_county             239677 non-null  object 
 4   address                    223180 non-null  object 
 5   latitude                   231754 non-null  float64
 6   longitude                  231754 non-null  float64
 7   congressional_district     227733 non-null  float64
 8   state_house_district       200905 non-null  float64
 9   state_senate_district      207342 non-null  float64
 10  participant_age1           147379 non-null  float64
 11  participant_age_group1     197558 non-null  object 
 12  participant_gender1        203315 non-null  object 
 13  min_age_participants       16

In [20]:
joined_df.isnull().any()

date                         False
year                         False
state                        False
city_or_county               False
address                       True
latitude                      True
longitude                     True
congressional_district        True
state_house_district          True
state_senate_district         True
participant_age1              True
participant_age_group1        True
participant_gender1           True
min_age_participants          True
avg_age_participants          True
max_age_participants          True
n_participants_child          True
n_participants_teen           True
n_participants_adult          True
n_males                       True
n_females                     True
n_killed                     False
n_injured                    False
n_arrested                    True
n_unharmed                    True
n_participants               False
notes                         True
incident_characteristics1     True
incident_characteris

### Variables distribution

In [21]:
joined_df.describe()

Unnamed: 0,year,latitude,longitude,congressional_district,state_house_district,state_senate_district,participant_age1,n_participants_child,n_participants_teen,n_participants_adult,n_males,n_females,n_killed,n_injured,n_arrested,n_unharmed,n_participants
count,239677.0,231754.0,231754.0,227733.0,200905.0,207342.0,147379.0,197568.0,197571.0,197572.0,203315.0,203315.0,239677.0,239677.0,212051.0,212051.0,239677.0
mean,2017.151879,37.546598,-89.338348,8.001265,55.447132,20.47711,30.295707,16.122004,8.46191,18.516014,1.520252,0.21234,0.25229,0.494007,0.468439,0.494169,1.636895
std,4.145756,5.130763,14.359546,8.480835,42.048117,14.20456,13.363592,3294.717771,2223.925791,3232.716199,0.996767,0.490888,0.521779,0.729952,0.851035,0.925566,1.252514
min,2013.0,19.1114,-171.429,0.0,1.0,1.0,0.0,-977.0,-947.0,-991.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2015.0,33.9034,-94.158725,2.0,21.0,9.0,21.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,2016.0,38.5706,-86.2496,5.0,47.0,19.0,27.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,2017.0,41.437375,-80.048625,10.0,84.0,30.0,37.0,0.0,0.0,2.0,2.0,0.0,0.0,1.0,1.0,1.0,2.0
max,2030.0,71.3368,97.4331,53.0,901.0,94.0,311.0,886365.0,762487.0,827900.0,61.0,23.0,50.0,53.0,63.0,20.0,103.0


Correct outliers data

### Pairwise correlation

## 2. Data Preparation