In [None]:
import requests, datetime, re
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Notebook-wide plotting defaults: 8x6 figures with the grid switched on.
plt.rcParams.update({
    'figure.figsize': (8, 6),
    'axes.grid': True,
})

In [None]:
# Select the interactive `notebook` matplotlib backend for the figures below.
# NOTE(review): later cells draw onto axes created in earlier cells (e.g. `ax=ax`),
# which presumably relies on this backend re-rendering existing figures — confirm.
%matplotlib notebook

In [None]:
class UNHR:
    """Scraper for the OHCHR "Ukraine: civilian casualty update" news pages.

    `extract` downloads the page for one date and parses the headline totals
    (killed / injured, with the per-category details given in parentheses)
    plus three regional breakdowns.  `extract_all` walks a date range and
    assembles the per-day results into one DataFrame.
    """

    # Seconds to wait for the server before giving up on a request.
    TIMEOUT = 30

    # English month names used in the page URLs; spelled out so the URL does
    # not depend on the process locale (strftime('%B') does).
    MONTHS = ('january', 'february', 'march', 'april', 'may', 'june', 'july',
              'august', 'september', 'october', 'november', 'december')

    # A count as printed in the reports: plain digits or "d,ddd".
    NEX = r'\d+|\d+,\d+'

    # Headline sentence: "<total> killed|injured (<n> <category>, ...)".
    EX = re.compile(
        rf'(a total of)?\s+(?P<total>{NEX})\s+'
        r'(?P<kind>killed|injured)\s+'
        r'\((?P<details>[a-z0-9,\s]+)\)'
    )

    # One "<n> <category>" item inside the parenthesised headline details.
    DEX = re.compile(rf'(?P<n>{NEX})\s+(?P<kind>[a-z]+)')

    # Shared tail of the regional lines:
    # "<n> casualties (<n> killed and <n> injured)".
    REX = (rf'(\s|&nbsp;)+({NEX})\scasualties\s\((?P<killed>{NEX})\s+killed'
           rf'\s+and\s+(?P<injured>{NEX})\s+injured\)')

    # Regional lines, stored under the keys 'DL', 'LDNR' and 'U' respectively.
    DLEX = re.compile(rf'Government-controlled\sterritory:{REX}')
    LDNREX = re.compile(rf'territory\scontrolled.+:{REX}')
    UEX = re.compile(rf'other\sregions\sof\sUkraine.+:{REX}')

    @staticmethod
    def url_at(d):
        """Return the URL of the casualty update for date `d`.

        E.g. for d = 2022-04-03:
        https://www.ohchr.org/en/news/2022/04/ukraine-civilian-casualty-update-3-april-2022

        The day is written without zero padding.  It is formatted by hand
        because the strftime flags for that ('%#d' / '%-d') are platform
        specific.
        """
        return (
            f'https://www.ohchr.org/en/news/{d.year:04d}/{d.month:02d}/'
            f'ukraine-civilian-casualty-update-'
            f'{d.day}-{UNHR.MONTHS[d.month - 1]}-{d.year:04d}'
        )

    @staticmethod
    def s2n(s):
        """Convert a count such as '3,028' to an int."""
        return int(s.replace(',', ''))

    @classmethod
    def extract(cls, d, silent=True):
        """Download and parse the casualty update for date `d`.

        Parameters
        ----------
        d : date-like with `year`, `month` and `day` attributes
        silent : bool
            When False, print the URL being requested.

        Returns
        -------
        dict
            `{kind: {'total': n, <category>: n, ..., 'DL': n, 'LDNR': n, 'U': n}}`
            with `kind` in 'killed' / 'injured'.  Empty when the request was
            redirected or no headline sentence was found; may hold a single
            kind (without regional keys) when only one headline was found.

        Raises
        ------
        RuntimeError
            If both headline sentences were found but a regional line is missing.
        KeyError
            If a regional line was found for a kind that has no headline total.
        """
        url = cls.url_at(d)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                            + 'AppleWebKit/537.36 (KHTML, like Gecko) '
                            + 'Chrome/39.0.2171.95 Safari/537.36'
        }
        if not silent:
            print('getting content from', url)
        data = {}
        r = requests.get(url, headers=headers, timeout=cls.TIMEOUT)
        if r.url != url:  # page redirected => no report at this URL => ignore
            return data

        s = r.content.decode()

        # Two headline sentences are expected (one per kind), in page order.
        end = 0
        for _ in range(2):
            m = cls.EX.search(s, pos=end)
            if m is None:
                return data
            end = m.end()
            kind = m.group('kind')
            data[kind] = {'total': cls.s2n(m.group('total'))}
            for dm in cls.DEX.finditer(m.group('details')):
                data[kind][dm.group('kind')] = cls.s2n(dm.group('n'))

        # Regional lines are searched in this order, each after the previous match.
        for region, rex in (('DL', cls.DLEX),
                            ('LDNR', cls.LDNREX),
                            ('U', cls.UEX)):
            m = rex.search(s, pos=end)
            if m is None:
                raise RuntimeError(
                    f'{region} breakdown not found for {d} ({url}); '
                    f'text after the previous match: {s[end:end + 300]!r}'
                )
            end = m.end()
            for kind in ('killed', 'injured'):
                if kind not in data:
                    raise KeyError(f'no headline total for {kind!r} on {d} ({url})')
                data[kind][region] = cls.s2n(m.group(kind))

        return data

    @classmethod
    def extract_all(cls, dstart=datetime.date(2022, 3, 7),
                    silent=False):
        """Extract every day from `dstart` through today into one DataFrame.

        Parameters
        ----------
        dstart : date-like
            First day to request (inclusive).
        silent : bool
            When False, print a progress line per processed day.  It is not
            forwarded to `extract`, which is always called with its default.

        Returns
        -------
        pandas.DataFrame
            Indexed by day, with two-level columns `(kind, field)`.  Days for
            which `extract` returned nothing are kept as all-NaN rows.
        """
        today = pd.to_datetime(datetime.date.today())
        records = {}   # kind -> {day -> {field -> value}}
        missing = []   # days for which nothing could be extracted
        for d in pd.date_range(pd.to_datetime(dstart), today, freq='D'):
            dd = cls.extract(d)
            if dd:
                for kind, values in dd.items():
                    records.setdefault(kind, {})[d] = values
            else:
                missing.append(d)
            if not silent:
                print(d, 'processed')

        if not records:
            if missing:
                return pd.DataFrame(index=pd.DatetimeIndex(missing))
            return pd.DataFrame()

        # Build each kind's frame once instead of enlarging it cell by cell.
        df = pd.concat(
            {kind: pd.DataFrame.from_dict(rows, orient='index')
             for kind, rows in records.items()},
            axis=1,
        ).sort_index()
        if missing:
            df = df.reindex(df.index.union(pd.DatetimeIndex(missing)))
        return df
#extract(t - pd.Timedelta(days=2))

In [None]:
# Spot-check the scraper on a single report date.
UNHR.extract(pd.Timestamp('2022-07-11'))

In [None]:
# Build the dataset on the first run; on later runs keep what is already loaded
# and only request the days after the last row that holds any value.
# Change `False` to `True` to force a full reload.
initial = False or 'data' not in globals()

# Use the same cut-off for both the slice and the restart date.  (Slicing at
# last_valid_index() but restarting from dropna().index[-1] can re-download and
# duplicate rows, and fails when no row is complete in every column.)
last_valid = None if initial else data.last_valid_index()
if last_valid is None:
    data = UNHR.extract_all()
else:
    data = pd.concat(
        [data.loc[:last_valid],
         UNHR.extract_all(dstart=last_valid + pd.Timedelta(days=1))],
        axis=0,
    )
data

In [None]:
#
# Cumulative figures (left axis) against the implied daily change (right axis).
kind = 'killed'
df = data[kind]
one_day = pd.Timedelta(days=1)

# Interpolate between reports up to the last known total, then difference.
totals = df.total
daily = totals.loc[:totals.last_valid_index()].interpolate().diff()
avg = daily.mean()

ax = df.dropna().plot(grid=True, style='.-')
ax.legend(loc='upper left')

ax2 = ax.twinx()
daily.plot(style='.-', color='red', ax=ax2, label='daily')
first_day, last_day = df.index[0], df.index[-1]
ax2.plot([first_day, last_day], [avg, avg], 'k--', label='avg')
daily.rolling('7D').mean().plot(ax=ax2, color='orange', label='7d MA')
ax2.legend(loc='upper right')

plt.title(f'Civilian {kind} daily avg = {avg}')
plt.tight_layout()
plt.xlim(first_day - one_day, last_day + one_day)

In [None]:
# Change between the first reported totals of consecutive months, shifted so
# each difference is listed under the earlier month.
kind = 'killed'
df = data[kind].assign(month=lambda frame: frame.index.strftime('%Y-%m'))
first_total_by_month = df.groupby('month').total.first()
first_total_by_month.diff().shift(-1).dropna()

In [None]:
# Implied daily changes (interpolated totals, differenced and shifted back one
# row), summed per calendar month.
kind = 'killed'
df = data[kind].copy()
totals = df.total
daily = (
    totals.loc[:totals.last_valid_index()]
    .interpolate()
    .diff()
    .shift(-1)
    .to_frame()
)
daily['month'] = daily.index.strftime('%Y-%m')
daily.groupby('month').sum()

In [None]:
# Monthly figures entered by hand as printed (with thousands separators),
# then converted to integers.
official_rows = [
    ['2022-02', '336', '461'],
    ['2022-03', '3,028', '2,384'],
    ['2022-04', '660', '1,253'],
    ['2022-05', '453', '1,012'],
    ['2022-06', '361', '1,029'],
    ['2022-07', '51', '124'],
]
official = pd.DataFrame(official_rows, columns=['month', 'killed', 'injured'])
official = official.set_index('month')
official = official.apply(lambda col: col.str.replace(',', '').astype(int))
official

In [None]:
# Official monthly deaths next to the monthly sums implied by the scraped totals.
official_monthly = official[['killed']].rename(columns={'killed': 'official'})
implied_monthly = daily.groupby('month').sum().rename(columns={'total': 'implied'})
df = pd.concat([official_monthly, implied_monthly], axis=1)

# Seed the first month with the first scraped cumulative total.
df.at['2022-02', 'implied'] = data['killed'].iloc[0].total

# Running gap between the two series: cumulative official minus cumulative implied.
running = df.cumsum()
df['imbalance'] = running['official'] - running['implied']
df.plot.bar()

In [None]:
## killed by region: 7-day mean of the implied daily change, shifted back 4 rows
kind = 'killed'
df = data[kind]
known = df.loc[:df.total.last_valid_index()]
daily = known.interpolate().diff().rolling('7D').mean().shift(-4)
regions = ['DL', 'LDNR', 'U']
ax = daily[regions].plot()
plt.title(kind)

In [None]:
# Title the current axes as a line-style key: solid = killed, dashed = injured
# (a later cell draws the injured series with style='--').
plt.title('- killed / -- injured')

In [None]:
## injured by region, drawn dashed onto the axes `ax` created by the
## "killed by region" cell (that cell must have been run first)
kind = 'injured'
df = data[kind]
daily = df.loc[:df.total.last_valid_index()].interpolate().diff().rolling('7D').mean().shift(-4)
daily[['DL', 'LDNR', 'U']].plot(ax=ax, style='--')
# The axes now hold both series, so keep the combined title instead of
# overwriting it with just 'injured'.
ax.set_title('- killed / -- injured')

In [None]:
#
# Cumulative figures (left axis) against the implied daily change (right axis).
kind = 'injured'
df = data[kind]
one_day = pd.Timedelta(days=1)

# Interpolate between reports up to the last known total, then difference.
totals = df.total
daily = totals.loc[:totals.last_valid_index()].interpolate().diff()
avg = daily.mean()

ax = df.dropna().plot(grid=True, style='.-')
ax.legend(loc='upper left')

ax2 = ax.twinx()
daily.plot(style='.-', color='red', ax=ax2, label='daily')
first_day, last_day = df.index[0], df.index[-1]
ax2.plot([first_day, last_day], [avg, avg], 'k--', label='avg')
daily.rolling('7D').mean().plot(ax=ax2, color='orange', label='7d MA')
ax2.legend(loc='upper right')

plt.title(f'Civilian {kind}, daily avg = {avg}')
plt.tight_layout()
plt.xlim(first_day - one_day, last_day + one_day)

In [None]:
# Injured and killed totals on one chart, using only days where both are known.
total_columns = pd.MultiIndex.from_tuples([('injured', 'total'), ('killed', 'total')])
data[total_columns].dropna().plot()

In [None]:
# 7-day mean of the implied daily deaths, computed two ways on a fresh figure:
# from totals cut at the last known value, and from the whole totals column.
df = data['killed']
daily = df.total.loc[:df.total.last_valid_index()].interpolate().diff()
trimmed_weekly_mean = daily.rolling('7D').mean()
trimmed_weekly_mean.plot(figure=plt.figure(), color='orange', label='7d MA')
full_weekly_mean = data['killed', 'total'].interpolate().diff().rolling('7D').mean()
full_weekly_mean.plot(style='.-')

In [None]:
# 7-day mean of the change between consecutive complete reports, for both kinds.
total_columns = pd.MultiIndex.from_tuples([('injured', 'total'), ('killed', 'total')])
reported_totals = data[total_columns].dropna()
reported_totals.diff().rolling('7D').mean().plot(style='.-')
plt.tight_layout()

In [None]:
# Snapshot the dataset to a CSV file named after today's date.
snapshot_path = f'c:/data/ukr/un-{datetime.date.today()}.csv'
data.to_csv(snapshot_path)

In [None]:
# Implied daily deaths per demographic category.
demographics = ['boys', 'girls', 'men', 'women']
df = data['killed'][demographics]
known = df.loc[:df.last_valid_index()]
daily = known.interpolate().diff()
daily.plot()

In [None]:
# Overall ratio of men to women in the implied daily deaths.
daily['men'].sum() / daily['women'].sum()

In [None]:
# 5-row rolling men/women ratio on a new figure, with the overall ratio as a
# dashed reference line.  The reference values are built as a plain list
# because numpy is not imported in this notebook (the original `np.ones`
# raised NameError).
overall_ratio = daily.men.sum() / daily.women.sum()
(daily.men.rolling(5).sum() / daily.women.rolling(5).sum()).plot(figure=plt.figure())
plt.plot(daily.index, [overall_ratio] * len(daily), '--')

In [None]:
# Men's daily deaths summed over the rows where the 5-row rolling men/women
# ratio exceeds 1.5.
rolling_ratio = daily.men.rolling(5).sum() / daily.women.rolling(5).sum()
ratio_is_high = rolling_ratio > 1.5
daily.men[ratio_is_high].sum()

In [None]:
# 5-row rolling boys/girls ratio on a new figure, with a dashed line at 1.
boys_per_girl = daily.boys.rolling(5).sum() / daily.girls.rolling(5).sum()
boys_per_girl.plot(figure=plt.figure(), title='boys / girls')
span = [daily.index[0], daily.index[-1]]
plt.plot(span, [1, 1], '--')

In [None]:
# 5-row rolling men/women ratio on a new figure, with a dashed line at 1.
men_per_woman = daily.men.rolling(5).sum() / daily.women.rolling(5).sum()
men_per_woman.plot(figure=plt.figure(), title='men / women')
span = [daily.index[0], daily.index[-1]]
plt.plot(span, [1, 1], '--')

In [None]:
# NOTE(review): unlabeled scratch calculation; where 2500 and 3200 come from is
# not shown in this notebook — add a label or remove.
2500 / 3200

# Interactive Visualisation

In [None]:
import ipywidgets as ui
import seaborn as sns

In [None]:
# Reshape the two-level-column frame into long format, one row per
# (date, category, kind) value.  The original passed an undefined name
# `columns` to `rename`, which raised NameError; the axes are named up front
# instead so the reset columns get readable labels.
# NOTE(review): the labels 'date' / 'kind' / 'category' / 'value' were chosen
# during review — adjust if the downstream widgets expect other names.
tmp = (
    data
    .rename_axis(index='date', columns=['kind', 'category'])
    .stack()
    .stack()
    .rename('value')
    .reset_index()
)
tmp

In [None]:
from ukr.un import UN

In [None]:
# Call `test()` on the `UN` object imported from `ukr.un`.
# NOTE(review): what `test` checks is not visible in this notebook — confirm.
UN.test()

In [None]:
import pandas as pd

In [None]:
# Load a saved snapshot: two header rows for the columns, first column parsed
# as the date index.
snapshot_path = 'c:/data/ukr/un-2022-06-20.csv'
pd.read_csv(snapshot_path, header=[0, 1], index_col=0, parse_dates=[0])