In [None]:
import pandas as pd
from src.constants import EXTERNAL_PATH

In [None]:
# Labels matched against the EVENT_TYPE column. get_acled_data() picks entries
# from this list by position, so keep the order stable when editing.
event_types = [
    "Strategic developments",
    "Battles",
    "Explosions/Remote violence",
    "Violence against civilians",
    "Protests",
    "Riots",
]
# Labels matched against the SUB_EVENT_TYPE column, also picked by position.
# Every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated by Python into a single (wrong) label.
sub_event_types = [
    "Remote explosive/landmine/IED",
    "Armed clash",
    "Shelling/artillery/missile attack",
    "Air/drone strike",
    "Attack",
    "Non-state actor overtakes territory",
    "Grenade",
    "Abduction/forced disappearance",
    "Government regains territory",
    "Sexual violence",
    "Suicide bomb",
]

def get_acled_data():
    """Load the ACLED export and keep shelling/missile and air/drone strikes in Ukraine.

    Reads the Excel file under ``EXTERNAL_PATH`` and keeps the rows that satisfy
    all of the following:

    * ``COUNTRY`` equals ``"Ukraine"``;
    * ``EVENT_DATE`` is on or after 2022-02-24;
    * ``EVENT_TYPE`` is one of ``event_types[1:4]``;
    * ``SUB_EVENT_TYPE`` is one of ``sub_event_types[2:4]``.

    Returns:
        pandas.DataFrame: the filtered rows (original index preserved) with
        ``EVENT_DATE`` converted to datetime. The frame is an independent copy,
        so callers can add columns to it without chained-assignment warnings.
    """
    raw = pd.read_excel(EXTERNAL_PATH / "ACLED__Ukraine_Black_Sea_2020_2023_Feb09.xlsx")

    # .assign builds a new frame, so the date conversion is never written into
    # a boolean-filtered slice of `raw`.
    acled = raw[raw["COUNTRY"] == "Ukraine"].assign(  # only Ukraine
        EVENT_DATE=lambda frame: pd.to_datetime(frame["EVENT_DATE"])
    )

    date_of_war = pd.to_datetime("2022-02-24")
    # only battle, remote violence, violence against civilians
    wanted_event_types = event_types[1:4]
    # only "Shelling/artillery/missile attack", "Air/drone strike"
    wanted_sub_event_types = sub_event_types[2:4]

    keep = (
        (acled["EVENT_DATE"] >= date_of_war)
        & acled["EVENT_TYPE"].isin(wanted_event_types)
        & acled["SUB_EVENT_TYPE"].isin(wanted_sub_event_types)
    )
    return acled[keep].copy()

In [None]:
# Load the filtered ACLED events, report (rows, columns), then preview the first rows.
df = get_acled_data()
print((len(df.index), len(df.columns)))
df.head()

In [None]:
# Lookup from the ADM1_EN spelling (keys) to the spelling found in ACLED's
# ADMIN1 column (values). Note the two Kyiv entries: "Kyiv" is the city,
# "Kyivska" is the surrounding oblast.
d_admin1_to_acled = {
    "Autonomous Republic of Crimea": "Crimea",
    "Cherkaska": "Cherkasy",
    "Chernihivska": "Chernihiv",
    "Chernivetska": "Chernivtsi",
    "Dnipropetrovska": "Dnipropetrovsk",
    "Donetska": "Donetsk",
    "Ivano-Frankivska": "Ivano-Frankivsk",
    "Kharkivska": "Kharkiv",
    "Khersonska": "Kherson",
    "Khmelnytska": "Khmelnytskyi",
    "Kirovohradska": "Kirovograd",
    "Kyiv": "Kyiv City",
    "Kyivska": "Kyiv",
    "Luhanska": "Luhansk",
    "Lvivska": "Lviv",
    "Mykolaivska": "Mykolaiv",
    "Odeska": "Odesa",
    "Poltavska": "Poltava",
    "Rivnenska": "Rivne",
    "Sevastopol": "Sevastopol City",
    "Sumska": "Sumy",
    "Ternopilska": "Ternopil",
    "Vinnytska": "Vinnytsia",
    "Volynska": "Volyn",
    "Zakarpatska": "Zakarpattia",
    "Zaporizka": "Zaporizhia",
    "Zhytomyrska": "Zhytomyr",
}
# Reverse lookup: ACLED spelling -> ADM1_EN spelling (same entry order).
d_acled_to_admin1 = dict(zip(d_admin1_to_acled.values(), d_admin1_to_acled.keys()))

In [None]:
# Events without an ADMIN1 value cannot be assigned to an oblast, so drop them.
df = df.dropna(subset=["ADMIN1"])

# Fail loudly, listing every offending value, when ACLED uses an ADMIN1 spelling
# the lookup table does not know (a plain .map would leave NaN silently).
unmapped_admin1 = df.loc[~df["ADMIN1"].isin(list(d_acled_to_admin1)), "ADMIN1"].unique().tolist()
if unmapped_admin1:
    raise KeyError(f"ADMIN1 values missing from d_acled_to_admin1: {unmapped_admin1}")

# .assign returns a new frame, so no column is written into a filtered slice.
df = df.assign(ADM1_EN=df["ADMIN1"].map(d_acled_to_admin1))

In [None]:
from src.utils.geometry import load_ukraine_admin_polygons

# Admin polygons at level 1; the cells below join on its ADM1_EN column.
# NOTE(review): level 1 is presumably the oblast level — confirm against
# load_ukraine_admin_polygons.
adm1 = load_ukraine_admin_polygons(adm_level=1)

In [None]:
# Count ACLED events per ADM1_EN. No join with `adm1` is needed for counting:
# it would only copy the polygon columns onto every event row, and would
# inflate the counts if `adm1` ever held more than one row per ADM1_EN.
df_acled_count = df.groupby("ADM1_EN").size().to_frame("acled_count").reset_index()

# Attach the counts to the polygons; polygons with no events keep NaN in acled_count.
adm1_ = adm1.merge(df_acled_count, on="ADM1_EN", how="left")
adm1_

In [None]:
# Interactive map of the polygons coloured by acled_count (colour scale fixed to 0-15000).
adm1_.explore(
    'acled_count',
    cmap='YlOrRd',
    vmin=0,
    vmax=15000,
    tiles='Esri.WorldGrayCanvas',
)

In [None]:
from src.constants import PREDS_PATH
import geopandas as gpd

# Name of the run whose sub-folder under PREDS_PATH holds the predictions to load.
run_name = "240301"

def load_all_oblasts_aggregated(run_name, folder_preds_agg="oblasts_with_preds_agg"):
    """Concatenate the per-oblast aggregated prediction files of one run.

    For every ``ADM1_EN`` value of the level-1 admin polygons, looks for
    ``preds_agg_<ADM1_EN>.geojson`` in ``PREDS_PATH / run_name / folder_preds_agg``.
    Names without a file on disk are skipped.

    Args:
        run_name: Name of the run sub-folder under ``PREDS_PATH``.
        folder_preds_agg: Sub-folder of the run that holds the per-oblast files.

    Returns:
        The concatenation of all files found, indexed by ``(admin_id, post_date)``.

    Raises:
        ValueError: If none of the expected files exists in the folder.
    """
    folder = PREDS_PATH / run_name / folder_preds_agg
    adm1 = load_ukraine_admin_polygons(adm_level=1)

    # Build each candidate path once and keep only those present on disk.
    candidate_paths = (folder / f"preds_agg_{o}.geojson" for o in adm1.ADM1_EN.unique())
    existing_paths = [path for path in candidate_paths if path.exists()]
    if not existing_paths:
        # pd.concat([]) would raise the same exception type, but with the
        # opaque message "No objects to concatenate".
        raise ValueError(f"No preds_agg_*.geojson files found in {folder}")

    gdf = pd.concat([gpd.read_file(path) for path in existing_paths])
    return gdf.set_index(["admin_id", "post_date"])


# Load every per-oblast aggregated prediction file available for this run.
gdf = load_all_oblasts_aggregated(
    run_name=run_name,
    folder_preds_agg="oblasts_with_preds_agg",
)

In [None]:
idx = pd.IndexSlice

# Columns to keep: everything that is not a "count*" column, plus the single
# count column used in the comparison below.
kept_columns = [col for col in gdf.columns if not col.startswith('count')]
kept_columns.append('count_mean_0.65')

# Rows: every admin_id at post_date '2023-02-24' (second level of the index).
gdf_preds = gdf.loc[idx[:, '2023-02-24'], :][kept_columns]
gdf_preds

In [None]:
# Sum the count_mean_0.65 column per ADM1_EN and expose it as `preds_count`.
df_preds_count = (
    gdf_preds
    .groupby('ADM1_EN')['count_mean_0.65']
    .sum()
    .rename('preds_count')
    .reset_index()
)

# Put the predicted counts next to the ACLED counts on the polygon frame;
# polygons without predictions keep NaN in preds_count.
adm1_all = adm1_.merge(df_preds_count, how="left", on="ADM1_EN")
adm1_all

In [None]:
# plot columns acled_count vs preds_count to visualize correlation, logx and logy

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.stats import linregress

fig, ax = plt.subplots()
sns.scatterplot(data=adm1_all, x='acled_count', y='preds_count', ax=ax)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('ACLED count')
ax.set_ylabel('Predicted count')
ax.set_title('ACLED count vs Predicted count for each oblast')

# draw best line fit: a straight line in log-log space,
# i.e. preds_count ~ exp(intercept) * acled_count ** slope
x = adm1_all['acled_count']
y = adm1_all['preds_count']
# Keep only rows where both counts are present AND strictly positive:
# NaN comes from the left merges, and log(0) = -inf would turn the whole
# regression (slope, intercept, r_value) into NaN.
mask = x.notna() & y.notna() & (x > 0) & (y > 0)
x = x[mask]
y = y[mask]
slope, intercept, r_value, p_value, std_err = linregress(np.log(x), np.log(y))
print(r_value)

# Sample the fitted power law for plotting; separate names keep the data
# in `x`/`y` intact for inspection after the cell has run.
x_line = np.linspace(1, x.max(), 100)
y_line = np.exp(intercept) * x_line ** slope
ax.plot(x_line, y_line, color='red', label=f'Best fit line, slope={slope:.2f}, r_value={r_value:.2f}')
ax.legend()

plt.show()