### Imports

In [None]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

import collections

## 1. Load Dataset

In [None]:
# First, plain load of the incidents CSV; the file is loaded again further down
# with per-column converters.
incidents_df = pd.read_csv("data/incidents.csv")

In [None]:
# Percentage of missing values in each column (mean of the boolean NaN mask).
incidents_df.isna().mean() * 100

In [None]:
def convert_dtype(x):
    """Parse a raw CSV cell into a float, tagging values that cannot be parsed.

    Used as a ``converters`` callback for ``pd.read_csv``.

    Parameters
    ----------
    x : object
        Raw cell content.

    Returns
    -------
    float or str
        ``np.nan`` when ``x`` is falsy (e.g. an empty string), ``float(x)``
        when the conversion succeeds, and the marker string
        ``"syntactically wrong"`` otherwise. The offending value is printed
        so that malformed cells can be inspected.
    """
    if not x:
        # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias no longer
        # exists in NumPy 2.x.
        return np.nan
    try:
        return float(x)
    # Catch only conversion failures: a bare ``except`` would also swallow
    # KeyboardInterrupt and SystemExit.
    except (TypeError, ValueError):
        print(f"Wrong type (removed): {x}")
        return "syntactically wrong"

In [None]:
# Reload the incidents, passing the participant-count and age columns through
# convert_dtype so that malformed cells are tagged rather than left as raw text.
converted_columns = [
    "n_participants_child",
    "n_participants_teen",
    "n_participants_adult",
    "min_age_participants",
    "avg_age_participants",
    "max_age_participants",
]
incidents_df = pd.read_csv(
    "data/incidents.csv",
    converters=dict.fromkeys(converted_columns, convert_dtype),
)

In [None]:
# Display the reloaded incidents table.
incidents_df

In [None]:
# Poverty table; later cells read its "year", "state" and "povertyPercentage" columns.
poverty_df = pd.read_csv("data/povertyByStateYear.csv")

poverty_df

In [None]:
# House election table; later cells read its "year", "state",
# "congressional_district", "party", "candidatevotes" and "totalvotes" columns.
district_house_df = pd.read_csv("data/year_state_district_house.csv")
district_house_df

## 2. Data Quality Assessment

### Syntactic accuracy

In [None]:
# Columns parsed through convert_dtype, which may hold the
# "syntactically wrong" marker.
wrong_type_vars = [
    "n_participants_child",
    "n_participants_teen",
    "n_participants_adult",
    "min_age_participants",
    "avg_age_participants",
    "max_age_participants",
]

# For each column, print its name and the percentage of rows carrying the marker.
n_rows = incidents_df.shape[0]
for var in wrong_type_vars:
    n_wrong = int((incidents_df[var] == "syntactically wrong").sum())
    print(var)
    print(n_wrong / n_rows * 100)

Now we can replace these errors with NaN values.

In [None]:
# Replace the "syntactically wrong" markers with NaN and make the columns
# numeric. ``errors="coerce"`` turns every non-numeric entry (i.e. the marker)
# into NaN and yields a float column instead of a mixed object column.
for var in wrong_type_vars:
    incidents_df[var] = pd.to_numeric(incidents_df[var], errors="coerce")

In [None]:
# Distinct values observed in the participant_age_group1 column.
incidents_df["participant_age_group1"].unique()

### Semantic Accuracy

In [None]:
# Derive the year from the first four characters of the "date" string and
# insert it as the second column. The slice is vectorised, so it does not rely
# on per-row label lookups or on the index being 0..n-1.
incidents_df.insert(1, "year", incidents_df["date"].str[0:4].astype("int64"))

There are a lot of incidents with a wrong date. For the moment, we consider these years as missing values, but we keep the information about the date, which will be useful to try to correct the year of the incident.

In [None]:
# Percentage of incidents dated after 2018, then percentage dated before 2013.
total_rows = incidents_df.shape[0]
print(int((incidents_df["year"] > 2018).sum()) / total_rows * 100)
print(int((incidents_df["year"] < 2013).sum()) / total_rows * 100)

In [None]:
# Years outside 2013-2018 are treated as missing for now; the "date" column is
# left untouched. ``np.nan`` is used because the ``np.NaN`` alias no longer
# exists in NumPy 2.x.
incidents_df.loc[incidents_df["year"] > 2018, "year"] = np.nan
incidents_df.loc[incidents_df["year"] < 2013, "year"] = np.nan

We also verify that in 2018 recordings stop on March 31st.

In [None]:
# Incidents whose year is 2018; the tail of the date-sorted frame shows the
# latest recorded dates.
tdf = incidents_df.loc[incidents_df["year"] == 2018]
tdf.sort_values("date").tail()

We check if there are negative values for variables that we expect to be non-negative. 
We also want to remove improbable (too large) age values. We symbolically set the maximum acceptable age to 116, the age of the oldest man on Earth.

In [None]:
# Variables that must not be negative.
non_negative_vars = [
    "congressional_district",
    "state_house_district",
    "state_senate_district",
    "participant_age1",
    "min_age_participants",
    "avg_age_participants",
    "max_age_participants",
    "n_participants_child",
    "n_participants_teen",
    "n_participants_adult",
    "n_males",
    "n_females",
    "n_killed",
    "n_injured",
    "n_arrested",
    "n_unharmed",
    "n_participants",
]

# Age variables, which additionally get an upper bound.
age_vars = [
    "participant_age1",
    "min_age_participants",
    "avg_age_participants",
    "max_age_participants",
]

# Largest age accepted as plausible.
MAX_PLAUSIBLE_AGE = 116

# For every variable print its name and the percentage of out-of-range rows,
# then replace those values with NaN. Comparisons with NaN are False, so
# missing values are never counted.
n_rows = incidents_df.shape[0]
for var in non_negative_vars:
    print(var)
    negative = incidents_df[var] < 0
    if var in age_vars:
        too_old = incidents_df[var] > MAX_PLAUSIBLE_AGE
        print(
            int(negative.sum()) / n_rows * 100 + int(too_old.sum()) / n_rows * 100
        )
        incidents_df.loc[negative | too_old, var] = np.nan
    else:
        print(int(negative.sum()) / n_rows * 100)
        incidents_df.loc[negative, var] = np.nan

Other variables that should be constrained to a specific range are the coordinates. To be precise, we should check that all coordinates match the respective county or city, but here we only investigate cases where the coordinates are not in the USA.

In [None]:
# Print the minimum and maximum of latitude, then of longitude.
for coord in ("latitude", "longitude"):
    print(np.min(incidents_df[coord]))
    print(np.max(incidents_df[coord]))

In [None]:
# Rows whose longitude is greater than -60.
incidents_df.loc[incidents_df["longitude"] > -60]

In [None]:
# Count the suspicious longitudes. They are deliberately NOT overwritten with
# NaN here: the correction cell below flips the sign of exactly these values,
# and nulling them first would turn that correction into a no-op.
(incidents_df["longitude"] > -60).sum()

Fortunately, there are only 5 cases where the longitude attribute seems wrong. So we can try to check if changing the sign of the longitude is consistent with the rest of the information on the location.

Using Google Maps, we verified that all 5 locations match if the longitude's sign is reversed. Moreover, these 5 incidents are all TSA Actions in different airports.

We can now correct the coordinates.

In [None]:
# Reverse the sign of every longitude greater than -60.
wrong_sign = incidents_df["longitude"] > -60
incidents_df.loc[wrong_sign, "longitude"] = -incidents_df.loc[wrong_sign, "longitude"]

In [None]:
# Among incidents where min, avg and max age are all known, percentage of rows
# with a consistent ordering: min <= avg <= max.
# (The lower bound used to be compared with participant_age1, a copy-paste
# slip from the next check; that column is not even filtered for NaN here.)
tmp_notna = incidents_df.dropna(
    subset=["min_age_participants", "max_age_participants", "avg_age_participants"]
)

tmp = tmp_notna.loc[
    (tmp_notna["max_age_participants"] >= tmp_notna["avg_age_participants"])
    & (tmp_notna["min_age_participants"] <= tmp_notna["avg_age_participants"])
]

tmp.shape[0] / tmp_notna.shape[0] * 100

In [None]:
# Among incidents where min age, max age and participant_age1 are all known,
# percentage of rows where participant_age1 lies within [min, max].
tmp_notna = incidents_df.dropna(
    subset=["min_age_participants", "max_age_participants", "participant_age1"]
)

upper_ok = tmp_notna["max_age_participants"] >= tmp_notna["participant_age1"]
lower_ok = tmp_notna["min_age_participants"] <= tmp_notna["participant_age1"]
tmp = tmp_notna.loc[upper_ok & lower_ok]

len(tmp) / len(tmp_notna) * 100

Now we check that the number of participants in each special category is not larger than the total number of participants.

In [None]:
# For every sub-count, print its name and the percentage of known values that
# exceed n_participants, then replace those values with NaN.
for col in [
    "n_killed",
    "n_injured",
    "n_unharmed",
    "n_arrested",
    "n_participants_child",
    "n_participants_teen",
    "n_participants_adult",
    "n_males",
    "n_females",
]:
    print(col)
    tmp_notna = incidents_df.loc[incidents_df[col].notna()]
    tmp = tmp_notna.loc[tmp_notna[col] > tmp_notna["n_participants"]]

    print(tmp.shape[0] / tmp_notna.shape[0] * 100)

    # A single .loc call writes into incidents_df itself. The chained form
    # ``incidents_df[col][idx] = ...`` may write to a temporary copy and is
    # ignored under pandas copy-on-write.
    incidents_df.loc[tmp.index, col] = np.nan

Another thing that we may want to check is the information about the number of participants.
We expect the following equalities to hold:
* n_participants = n_males + n_females 
* n_participants = n_participants_child + n_participants_teen + n_participants_adult
* n_participants = n_killed + n_unharmed + n_injured 
or
* n_participants = n_killed + n_unharmed + n_injured + n_arrested

In [None]:
# Percentage of fully-known rows where
# killed + injured + unharmed + arrested equals n_participants.
tmp_notna = incidents_df.dropna(
    subset=["n_killed", "n_injured", "n_unharmed", "n_arrested", "n_participants"]
)

outcome_total = (
    tmp_notna["n_killed"]
    + tmp_notna["n_injured"]
    + tmp_notna["n_unharmed"]
    + tmp_notna["n_arrested"]
)
tmp = tmp_notna.loc[outcome_total == tmp_notna["n_participants"]]

len(tmp) / len(tmp_notna) * 100

In [None]:
# Percentage of fully-known rows where child + teen + adult equals n_participants.
tmp_notna = incidents_df.dropna(
    subset=[
        "n_participants_child",
        "n_participants_teen",
        "n_participants_adult",
        "n_participants",
    ]
)

age_class_total = (
    tmp_notna["n_participants_child"]
    + tmp_notna["n_participants_teen"]
    + tmp_notna["n_participants_adult"]
)
tmp = tmp_notna.loc[age_class_total == tmp_notna["n_participants"]]

len(tmp) / len(tmp_notna) * 100

In [None]:
# Percentage of fully-known rows where males + females equals n_participants.
tmp_notna = incidents_df.dropna(subset=["n_males", "n_females", "n_participants"])

gender_total = tmp_notna["n_males"] + tmp_notna["n_females"]
tmp = tmp_notna.loc[gender_total == tmp_notna["n_participants"]]

len(tmp) / len(tmp_notna) * 100

Even if the count is not always correct, for now we keep the information about the group composition as it could still be useful. Also because it would be difficult to identify the incorrect value among the different attributes in the sum.

In [None]:
# Percentage of incidents reporting zero participants.
int((incidents_df["n_participants"] == 0).sum()) / incidents_df.shape[0] * 100

In [None]:
# For each sub-count, print how many incidents report zero participants while
# the sub-count itself is positive.
no_participants = incidents_df["n_participants"] == 0
subcount_columns = [
    "n_males",
    "n_females",
    "n_killed",
    "n_injured",
    "n_unharmed",
    "n_arrested",
    "n_participants_adult",
    "n_participants_teen",
    "n_participants_child",
]
for var in subcount_columns:
    print(int((no_participants & (incidents_df[var] > 0)).sum()))

In [None]:
# A participant count of zero is treated as missing.
# ``np.nan`` is used because the ``np.NaN`` alias no longer exists in NumPy 2.x.
incidents_df.loc[incidents_df["n_participants"] == 0, "n_participants"] = np.nan

In [None]:
# Percentage of missing values per column after the accuracy checks.
incidents_df.isnull().mean() * 100

### Data Integration

In [None]:
# Work on a copy so the integration steps do not modify incidents_df.
joined_df = incidents_df.copy()

In [None]:
# Create the columns to be filled by the integration steps, initialised with a
# single-space placeholder.
for new_column in ("povertyPercentage", "party", "candidatevotes", "totalvotes"):
    joined_df[new_column] = " "

valid dates: 1/1/2013 to 31/3/2018. \
there are a lot of incidents with wrong dates (2028-2030). \
I googled some of the news items using the dataset notes, the cities, etc., and it seems that 2028->2013; 2029->2014; 2030->2015. (They cannot be checked one by one because there are thousands of them, so should we sample a few at random and trust the pattern?)

In [None]:
# Kept because later cells still use chained assignment and rely on the
# warning being silenced.
pd.options.mode.chained_assignment = None

# Dates after 2018 are shifted back by 15 years (2028 -> 2013, ...).
# Done with one boolean mask and a single .loc write instead of a per-row
# chained assignment, which may write to a temporary copy.
date_year = joined_df["date"].str[0:4].astype("int64")
shifted = date_year > 2018
joined_df.loc[shifted, "year"] = date_year[shifted] - 15

In [None]:
# Attach the poverty percentage of the incident's (year, state).
# A left merge replaces the per-row scan of poverty_df (one full filter per
# incident) and the chained assignment, which may write to a temporary copy.
poverty_keys = ["year", "state"]

# As before, a value is used only when exactly one poverty row matches:
# keys appearing more than once are dropped entirely (keep=False), and rows
# with a missing key are removed so that NaN never matches NaN in the merge.
poverty_lookup = poverty_df.dropna(subset=poverty_keys).drop_duplicates(
    subset=poverty_keys, keep=False
)[poverty_keys + ["povertyPercentage"]]

# The right-hand keys are unique, so the left merge keeps one row per
# incident, in the original order; values are written back positionally.
poverty_matches = joined_df[poverty_keys].merge(
    poverty_lookup, on=poverty_keys, how="left"
)
joined_df["povertyPercentage"] = poverty_matches["povertyPercentage"].to_numpy()

Congressional elections occur every 2 years!

In [None]:
# Attach the party of the incident's congressional district.
# Elections are matched on ``year // 2``, so an incident year pairs with the
# election year sharing the same integer half. A left merge replaces the
# per-row scan of district_house_df and the chained assignment, which may
# write to a temporary copy.
house_keys = ["election_period", "state", "congressional_district"]

# As before, a value is used only when exactly one election row matches:
# keys appearing more than once are dropped entirely (keep=False).
house_lookup = (
    district_house_df.assign(election_period=district_house_df["year"] // 2)
    .dropna(subset=house_keys)
    .drop_duplicates(subset=house_keys, keep=False)[house_keys + ["party"]]
)

incident_keys = pd.DataFrame(
    {
        "election_period": joined_df["year"] // 2,
        # district_house_df is matched against the upper-cased state name
        "state": joined_df["state"].str.upper(),
        "congressional_district": joined_df["congressional_district"],
    }
)

# Unique right-hand keys: the left merge keeps one row per incident, in order.
party_matches = incident_keys.merge(house_lookup, on=house_keys, how="left")

# Incidents without a unique match keep the " " placeholder.
joined_df["party"] = party_matches["party"].fillna(" ").to_numpy()

In [None]:
# Attach the winning candidate's votes for the incident's congressional
# district, matching elections on ``year // 2``. A left merge replaces the
# per-row scan of district_house_df and the chained assignment, which may
# write to a temporary copy.
house_keys = ["election_period", "state", "congressional_district"]

# As before, a value is used only when exactly one election row matches:
# keys appearing more than once are dropped entirely (keep=False).
house_lookup = (
    district_house_df.assign(election_period=district_house_df["year"] // 2)
    .dropna(subset=house_keys)
    .drop_duplicates(subset=house_keys, keep=False)[house_keys + ["candidatevotes"]]
)

incident_keys = pd.DataFrame(
    {
        "election_period": joined_df["year"] // 2,
        # district_house_df is matched against the upper-cased state name
        "state": joined_df["state"].str.upper(),
        "congressional_district": joined_df["congressional_district"],
    }
)

# Unique right-hand keys: the left merge keeps one row per incident, in order.
# Incidents without a unique match get NaN.
candidatevotes_matches = incident_keys.merge(house_lookup, on=house_keys, how="left")
joined_df["candidatevotes"] = candidatevotes_matches["candidatevotes"].to_numpy()

In [None]:
# Attach the total votes cast in the incident's congressional district,
# matching elections on ``year // 2``. A left merge replaces the per-row scan
# of district_house_df and the chained assignment, which may write to a
# temporary copy.
house_keys = ["election_period", "state", "congressional_district"]

# As before, a value is used only when exactly one election row matches:
# keys appearing more than once are dropped entirely (keep=False).
house_lookup = (
    district_house_df.assign(election_period=district_house_df["year"] // 2)
    .dropna(subset=house_keys)
    .drop_duplicates(subset=house_keys, keep=False)[house_keys + ["totalvotes"]]
)

incident_keys = pd.DataFrame(
    {
        "election_period": joined_df["year"] // 2,
        # district_house_df is matched against the upper-cased state name
        "state": joined_df["state"].str.upper(),
        "congressional_district": joined_df["congressional_district"],
    }
)

# Unique right-hand keys: the left merge keeps one row per incident, in order.
# Incidents without a unique match get NaN.
totalvotes_matches = incident_keys.merge(house_lookup, on=house_keys, how="left")
joined_df["totalvotes"] = totalvotes_matches["totalvotes"].to_numpy()

In [None]:
# Persist the integrated dataset. No ``index=False`` is passed, so the
# DataFrame index is written as the first CSV column.
joined_df.to_csv("data/joined_dataset.csv")

## 3. Data preparation

### Fill missing values

In [None]:
# Reload the integrated dataset from disk.
# NOTE(review): no index_col is given, so the index column written by the
# export above presumably comes back as an extra "Unnamed: 0" column — confirm.
joined_df = pd.read_csv("data/joined_dataset.csv")

In [None]:
def custom_mean(series):
    """Return the mean of ``series`` computed over its non-missing values."""
    observed = series.dropna()
    return observed.mean()


def custom_mode(series):
    """Return the mode(s) of the non-missing values of ``series`` as a Series.

    The result holds one entry per most-frequent value, so it can contain
    several values (ties) or none (no non-missing value).
    """
    observed = series.dropna()
    return observed.mode()

#### 1. Coordinates

In [None]:
# Fill missing coordinates with the mean coordinate of the same
# (state, city_or_county) group.
# The filled column is assigned back: ``joined_df[col].fillna(..., inplace=True)``
# acts on a temporary Series and does not update joined_df under pandas
# copy-on-write.
for coord in ["latitude", "longitude"]:
    city_mean = joined_df.groupby(["state", "city_or_county"])[coord].transform(
        custom_mean
    )
    joined_df[coord] = joined_df[coord].fillna(city_mean)

If we have no info about city, we use states:

In [None]:
# Fill the coordinates that are still missing with the mean coordinate of the
# state.
# The filled column is assigned back: ``joined_df[col].fillna(..., inplace=True)``
# acts on a temporary Series and does not update joined_df under pandas
# copy-on-write.
for coord in ["latitude", "longitude"]:
    state_mean = joined_df.groupby(["state"])[coord].transform(custom_mean)
    joined_df[coord] = joined_df[coord].fillna(state_mean)

#### 2. Age

In [None]:
# Fill missing ages with the overall mean of the respective column.
# The filled column is assigned back: ``joined_df[col].fillna(..., inplace=True)``
# acts on a temporary Series and does not update joined_df under pandas
# copy-on-write.
for age_col in [
    "avg_age_participants",
    "min_age_participants",
    "max_age_participants",
]:
    joined_df[age_col] = joined_df[age_col].fillna(custom_mean(joined_df[age_col]))

#### 3. Participants

In [None]:
# Fill a missing participant count with the overall mean.
# The filled column is assigned back: ``joined_df[col].fillna(..., inplace=True)``
# acts on a temporary Series and does not update joined_df under pandas
# copy-on-write.
joined_df["n_participants"] = joined_df["n_participants"].fillna(
    custom_mean(joined_df["n_participants"])
)

children/teen/adults

In [None]:
# Mean share of each age class among an incident's participants (each mean is
# computed once and reused).
adult_share = custom_mean(joined_df["n_participants_adult"] / joined_df["n_participants"])
teen_share = custom_mean(joined_df["n_participants_teen"] / joined_df["n_participants"])
child_share = custom_mean(joined_df["n_participants_child"] / joined_df["n_participants"])

tot = adult_share + teen_share + child_share

# Amount by which the three mean shares fall short of (or exceed) 1; the child
# and teen shares are rescaled by (1 + residual).
residual = 1 - tot

avg_children_ratio = child_share * (1 + residual)
avg_teen_ratio = teen_share * (1 + residual)

# Missing child/teen counts: rescaled share times n_participants, truncated to
# an integer. The result is assigned back because
# ``joined_df[col].fillna(..., inplace=True)`` acts on a temporary Series.
joined_df["n_participants_child"] = joined_df["n_participants_child"].fillna(
    (avg_children_ratio * joined_df["n_participants"]).astype(int)
)
joined_df["n_participants_teen"] = joined_df["n_participants_teen"].fillna(
    (avg_teen_ratio * joined_df["n_participants"]).astype(int)
)

# Adults are recomputed for EVERY row as the remainder, so the three classes
# always add up to n_participants. One vectorised assignment replaces the
# per-row chained assignment, which may write to a temporary copy.
joined_df["n_participants_adult"] = joined_df["n_participants"] - (
    joined_df["n_participants_teen"] + joined_df["n_participants_child"]
)

males/females

In [None]:
# Mean share of males and females among an incident's participants (each mean
# is computed once and reused).
males_share = custom_mean(joined_df["n_males"] / joined_df["n_participants"])
females_share = custom_mean(joined_df["n_females"] / joined_df["n_participants"])

tot = males_share + females_share

# Amount by which the two mean shares fall short of (or exceed) 1; the female
# share is rescaled by (1 + residual).
residual = 1 - tot

avg_females_ratio = females_share * (1 + residual)

# Missing female counts: rescaled share times n_participants, truncated to an
# integer. The result is assigned back because
# ``joined_df[col].fillna(..., inplace=True)`` acts on a temporary Series.
joined_df["n_females"] = joined_df["n_females"].fillna(
    (avg_females_ratio * joined_df["n_participants"]).astype(int)
)

# Males are recomputed for EVERY row as the remainder. One vectorised
# assignment replaces the per-row chained assignment, which may write to a
# temporary copy.
joined_df["n_males"] = joined_df["n_participants"] - joined_df["n_females"]

killed/injured/arrested/unharmed

In [None]:
# Mean share of each outcome among an incident's participants (each mean is
# computed once and reused).
killed_share = custom_mean(joined_df["n_killed"] / joined_df["n_participants"])
injured_share = custom_mean(joined_df["n_injured"] / joined_df["n_participants"])
arrested_share = custom_mean(joined_df["n_arrested"] / joined_df["n_participants"])
unharmed_share = custom_mean(joined_df["n_unharmed"] / joined_df["n_participants"])

tot = killed_share + injured_share + arrested_share + unharmed_share

# Amount by which the four mean shares fall short of (or exceed) 1; the
# killed/injured/arrested shares are rescaled by (1 + residual).
residual = 1 - tot

avg_killed_ratio = killed_share * (1 + residual)
avg_injured_ratio = injured_share * (1 + residual)
avg_arrested_ratio = arrested_share * (1 + residual)

# Missing counts: rescaled share times n_participants, truncated to an
# integer. The result is assigned back because
# ``joined_df[col].fillna(..., inplace=True)`` acts on a temporary Series.
joined_df["n_killed"] = joined_df["n_killed"].fillna(
    (avg_killed_ratio * joined_df["n_participants"]).astype(int)
)
joined_df["n_injured"] = joined_df["n_injured"].fillna(
    (avg_injured_ratio * joined_df["n_participants"]).astype(int)
)
joined_df["n_arrested"] = joined_df["n_arrested"].fillna(
    (avg_arrested_ratio * joined_df["n_participants"]).astype(int)
)

# Unharmed is recomputed for EVERY row as the remainder. One vectorised
# assignment replaces the per-row chained assignment, which may write to a
# temporary copy.
joined_df["n_unharmed"] = joined_df["n_participants"] - (
    joined_df["n_killed"] + joined_df["n_injured"] + joined_df["n_arrested"]
)

#### 4. party and votes

In [None]:
# Fraction (not percentage) of rows whose party is the " " placeholder.
joined_df.loc[joined_df["party"] == " "].shape[0] / joined_df.shape[0]

In [None]:
# Turn the " " party placeholder and a total vote count of zero into missing
# values. ``np.nan`` is used because the ``np.NaN`` alias no longer exists in
# NumPy 2.x.
joined_df.loc[joined_df["party"] == " ", "party"] = np.nan
joined_df.loc[joined_df["totalvotes"] == 0, "totalvotes"] = np.nan

In [None]:
def _first_mode(values):
    """Return the most frequent non-missing value of ``values`` (NaN if none)."""
    modes = values.dropna().mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Fill a missing party with the most frequent party of the same (year, state).
# The previous version took ``.values[0]`` of the transformed column, i.e. one
# single value, and wrote it into every missing row.
group_party = joined_df.groupby(["year", "state"])["party"].transform(_first_mode)
joined_df["party"] = joined_df["party"].fillna(group_party)

# Rows whose group has no known party (or whose year/state is missing) fall
# back to the overall most frequent party, so no NaN is left for later cells.
overall_party = joined_df["party"].mode()
if not overall_party.empty:
    joined_df["party"] = joined_df["party"].fillna(overall_party.iloc[0])

In [None]:
# Fill missing total votes with the mean of the same (year, party) group.
# The filled column is assigned back: ``joined_df[col].fillna(..., inplace=True)``
# acts on a temporary Series and does not update joined_df under pandas
# copy-on-write.
joined_df["totalvotes"] = joined_df["totalvotes"].fillna(
    joined_df.groupby(["year", "party"])["totalvotes"].transform(custom_mean)
)

# Mean share of the total votes obtained by the candidate.
avg_votes_ratio = custom_mean(joined_df["candidatevotes"] / joined_df["totalvotes"])

# Missing candidate votes: mean share times total votes, truncated to a whole
# number. ``np.trunc`` truncates like ``astype(int)`` but does not raise when
# totalvotes is still NaN for some rows.
joined_df["candidatevotes"] = joined_df["candidatevotes"].fillna(
    np.trunc(joined_df["totalvotes"] * avg_votes_ratio)
)

In [None]:
# Fraction of rows still equal to the " " placeholder in each integrated column.
for placeholder_column in ("party", "totalvotes", "candidatevotes"):
    n_placeholders = int((joined_df[placeholder_column] == " ").sum())
    print(n_placeholders / joined_df.shape[0])

### Drop duplicates

In [None]:
# Remove rows that are identical across every column.
# NOTE(review): the dataset was saved with its index and reloaded without
# index_col, so the frame presumably carries a unique "Unnamed: 0" column that
# prevents any two rows from matching — confirm before relying on this step.
joined_df = joined_df.drop_duplicates()

### Remove / modify attributes

In [None]:
# Columns dropped from the prepared dataset.
columns_to_drop = [
    "address",
    "city_or_county",
    "congressional_district",
    "state_house_district",
    "state_senate_district",
    "participant_age1",
    "participant_age_group1",
    "participant_gender1",
    "notes",
    "incident_characteristics1",
    "incident_characteristics2",
]
joined_df = joined_df.drop(columns=columns_to_drop)

In [None]:
# Derive the month from characters 5-6 of joined_df's own "date" string and
# insert it as the second column. It is no longer read from incidents_df,
# which made this cell depend on a different frame having the same length and
# row order as joined_df.
joined_df.insert(1, "month", joined_df["date"].str[5:7].astype("int64"))

# The full date is not needed once year and month are available.
joined_df = joined_df.drop(columns=["date"])

In [None]:
# Share of the total votes obtained by the candidate, inserted at position 23.
# The division is vectorised, so it does not depend on incidents_df's length
# or on the index being 0..n-1.
joined_df.insert(
    23,
    "votes_ratio",
    joined_df["candidatevotes"] / joined_df["totalvotes"],
)

# The raw candidate vote count is replaced by the ratio.
joined_df = joined_df.drop(columns=["candidatevotes"])

In [None]:
# Rows whose participant count is zero.
joined_df.loc[joined_df["n_participants"] == 0]

In [None]:
# Percentage of missing values per column in the prepared dataset.
joined_df.isna().mean() * 100

In [None]:
# Columns forced to float dtype.
float_columns = [
    "min_age_participants",
    "max_age_participants",
    "n_participants_child",
    "n_participants_teen",
    "n_participants_adult",
    "povertyPercentage",
]
joined_df[float_columns] = joined_df[float_columns].astype(float)

In [None]:
# Count column -> name of the ratio column that replaces it.
ratio_names = {
    "n_arrested": "arrested_ratio",
    "n_unharmed": "unharmed_ratio",
    "n_killed": "killed_ratio",
    "n_injured": "injured_ratio",
    "n_participants_adult": "adults_ratio",
    "n_participants_teen": "teen_ratio",
    "n_participants_child": "children_ratio",
    "n_males": "males_ratio",
    "n_females": "females_ratio",
}
columns_to_normalize = list(ratio_names)

# Divide every count by the incident's n_participants (row-wise), then rename
# the columns to reflect that they now hold ratios.
joined_df[columns_to_normalize] = joined_df[columns_to_normalize].divide(
    joined_df["n_participants"], axis="index"
)
joined_df = joined_df.rename(columns=ratio_names)

### Additional information

In [None]:
# Load the population table, keep the three columns of interest and give them
# the names used by the rest of the notebook.
population_source_columns = {
    "placeName": "state",
    "Date:Count_Person": "year",
    "Value:Count_Person": "population",
}
population_df = pd.read_csv("data/population.csv")[
    list(population_source_columns)
].rename(columns=population_source_columns)

In [None]:
# Attach the population of the incident's (year, state).
# A left merge replaces the per-row scan of population_df and the chained
# assignment, which may write to a temporary copy.
population_keys = ["year", "state"]

# As before, a value is used only when exactly one population row matches:
# keys appearing more than once are dropped entirely (keep=False), and rows
# with a missing key are removed so that NaN never matches NaN in the merge.
population_lookup = population_df.dropna(subset=population_keys).drop_duplicates(
    subset=population_keys, keep=False
)[population_keys + ["population"]]

# Unique right-hand keys: the left merge keeps one row per incident, in order;
# values are written back positionally. Incidents without a match get NaN.
population_matches = joined_df[population_keys].merge(
    population_lookup, on=population_keys, how="left"
)
joined_df["population"] = population_matches["population"].to_numpy()

In [None]:
# Overwrite the joined dataset on disk with the prepared version. No
# ``index=False`` is passed, so the DataFrame index is written as a column.
joined_df.to_csv("data/joined_dataset.csv")

## 4. Variables distribution

### Party

In [None]:
# Colour used for each party on the maps; " " stands for "no party available".
colors = {
    "DEMOCRAT": "blue",
    "DEMOCRATIC-FARMER-LABOR": "purple",
    "REPUBLICAN": "red",
    " ": "gray",
}
usa = plt.imread("images/NorthAmerica.png")

# One map per election period: incidents are grouped on ``year // 2``.
fig, axs = plt.subplots(2, 2, figsize=(18, 9))
for ax, y in zip(axs.flat, [2012, 2014, 2016, 2018]):
    ax.imshow(usa, extent=[-180, -60, 20, 80])
    year_df = joined_df.loc[joined_df["year"] // 2 == y // 2]
    if y == 2012:
        ax.set_title("2013", fontsize=20)
    elif y == 2018:
        ax.set_title("2018", fontsize=20)
    else:
        ax.set_title(f"{y}-{y+1}", fontsize=20)
    ax.scatter(
        year_df["longitude"],
        year_df["latitude"],
        s=3,
        # Parties missing from ``colors`` (including NaN) are drawn in gray
        # instead of raising a KeyError.
        c=year_df["party"].map(colors).fillna("gray").tolist(),
    )
    ax.set_xlim([-180, -60])
    ax.set_ylim([18, 75])
    # Proxy artists that only feed the legend.
    ax.plot([0], "r", label="REPUBLICAN")
    ax.plot([0], "b", label="DEMOCRAT")
    ax.plot([0], "purple", label="DEMOCRATIC-FARMER-LABOR")
    # Gray, matching the colour the points are actually drawn with (the legend
    # used to show green).
    ax.plot([0], color="gray", label="unavailable")
    ax.legend()

### Population

In [None]:
import random
import matplotlib.colors as mcolors

# States in descending alphabetical order.
state_list = list(joined_df.sort_values("state", ascending=False)["state"].unique())

# One random RGBA colour per state (alpha between 0.5 and 1). A dedicated,
# seeded generator keeps the colours identical across runs, and the list is
# sized from the data rather than a hard-coded 51.
color_rng = random.Random(42)
random_colors = [
    (
        color_rng.random(),
        color_rng.random(),
        color_rng.random(),
        color_rng.uniform(0.5, 1.0),
    )
    for _ in state_list
]

# State name -> colour; ``random_colors[i]`` is the colour of ``state_list[i]``.
colors = dict(zip(state_list, random_colors))

In [None]:
import seaborn as sns

usa = plt.imread("images/NorthAmerica.png")

fig, axs = plt.subplots(
    1,
    2,
    figsize=(30, 10),
    gridspec_kw={"height_ratios": [1], "width_ratios": [1, 1]},
)
map_ax, bar_ax = axs[0], axs[1]

# Left panel: every incident on the map, coloured by its state.
map_ax.imshow(usa, extent=[-180, -60, 20, 80])
point_colors = [colors[state] for state in joined_df["state"]]
map_ax.scatter(
    joined_df["longitude"].tolist(),
    joined_df["latitude"].tolist(),
    s=1,
    c=point_colors,
)
map_ax.set_xlim([-170, -65])
map_ax.set_ylim([18, 80])

# Right panel: number of incidents per state, states in descending
# alphabetical order.
states_desc = joined_df.sort_values("state", ascending=False)["state"].unique()
for i, value in enumerate(states_desc):
    count = int((joined_df["state"] == value).sum())
    bar_ax.barh(value, count, color=random_colors[i])

bar_ax.set_xlabel("Number of incidents from Jan 2013 to Mar 2018")

### Poverty percentage

In [None]:
import matplotlib.cm as cm

usa = plt.imread("images/NorthAmerica.png")

fig, axs = plt.subplots(
    1,
    2,
    figsize=(30, 10),
    gridspec_kw={"height_ratios": [1], "width_ratios": [1, 1]},
)

# Left panel: every incident on the map, coloured by poverty percentage.
axs[0].imshow(usa, extent=[-180, -60, 20, 80])
scatter = axs[0].scatter(
    joined_df["longitude"],
    joined_df["latitude"],
    s=1,
    c=joined_df["povertyPercentage"],
)

axs[0].set_xlim([-170, -60])
axs[0].set_ylim([18, 80])

# Right panel: incidents per million inhabitants, one bar per state, ordered
# and coloured by the state's mean poverty percentage.
states = joined_df.sort_values("state", ascending=False)["state"].unique()
incidents_rate = []

poverty = [
    np.mean(joined_df.loc[joined_df["state"] == value]["povertyPercentage"])
    for value in states
]

sorted_indices = np.argsort(poverty)

# Use sorted indices to reorder both arrays
poverty = [poverty[i] for i in sorted_indices]
states = [states[i] for i in sorted_indices]

cmap = cm.magma
norm = plt.Normalize(min(poverty), max(poverty))
# Named ``poverty_colors`` so the ``colors`` state->colour dict used by the
# per-state plot is not overwritten, which would break re-running that cell.
poverty_colors = cmap(norm(poverty))
sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=axs[1], label="Poverty percentage")

for i, value in enumerate(states):
    count = joined_df.loc[joined_df["state"] == value].shape[0]
    population = population_df.loc[
        (population_df["state"] == value) & (population_df["year"] == 2018),
        "population",
    ]
    # Reduce the selection to a scalar (NaN when the state has no 2018 figure)
    # so that the rate is a number rather than a Series.
    state_population = population.iloc[0] if len(population) else np.nan
    rate = count / state_population * 1e6
    incidents_rate.append(rate)
    axs[1].barh(value, rate, color=poverty_colors[i])

axs[1].set_xlabel("Number of incidents every million people from Jan 2013 to Mar 2018")

## 5. Pairwise correlation

In [None]:
# Columns included in the pairwise correlation matrix.
numerical_columns = [
    "year",
    "latitude",
    "longitude",
    "min_age_participants",
    "avg_age_participants",
    "max_age_participants",
    "children_ratio",
    "teen_ratio",
    "adults_ratio",
    "males_ratio",
    "females_ratio",
    "killed_ratio",
    "injured_ratio",
    "arrested_ratio",
    "unharmed_ratio",
    "n_participants",
    "povertyPercentage",
    "votes_ratio",
]

In [None]:
# Apply the smaller font scale BEFORE drawing: seaborn theme settings only
# affect figures created afterwards, so setting it after the heatmap only took
# effect when the cell was run a second time.
sns.set(font_scale=0.5)

# Compute the correlation matrix
corr = joined_df[numerical_columns].corr()

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio, explicitly on the
# axes created above
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=0.3,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    ax=ax,
)

## 6. Summary

In [None]:
# Snapshot of the prepared dataset, written to disk as the final dataset. No
# ``index=False`` is passed, so the DataFrame index is written as a column.
final_df = joined_df.copy()
final_df.to_csv("data/final_dataset.csv")

In [None]:
# Percentage of missing values per column in the final dataset (final_df is a
# copy of joined_df, so both have the same number of rows).
final_df.isna().mean() * 100