In [3]:
import numpy as np
import pandas as pd

# Components

In [4]:
exclude_regions = ['Автономна pеспубліка Крим', 'м.Київ', 'м.Севастополь', 'АР Крим', 'Автономна Республіка Крим', 'Автономна \nРеспубліка Крим',
                   'Автономна\r\n Республіка Крим', 'Севастополь', 'Київ']

### Incidence by region and age groups

In [5]:
incidence = pd.read_csv("./../FORM_7/incidence.csv")

incidence = incidence[~incidence.region.isin(exclude_regions)]

incidence.head()

Unnamed: 0,year,category,region,age_group,incidence
20,2010,Злоякiснi новоутворення-всього C00-C97...,Вінницька,до 1 р.,1
21,2010,Злоякiснi новоутворення-всього C00-C97...,Вінницька,1-4 р.,5
22,2010,Злоякiснi новоутворення-всього C00-C97...,Вінницька,5-9 р.,2
23,2010,Злоякiснi новоутворення-всього C00-C97...,Вінницька,10-14р,5
24,2010,Злоякiснi новоутворення-всього C00-C97...,Вінницька,15-19р,4


### Incidence by region and stage

In [6]:
stage_incidence = pd.read_csv("stage_date.csv")

stage_incidence = stage_incidence[~stage_incidence.region.isin(exclude_regions)]

stage_incidence.head()

Unnamed: 0,year,region,mtumors,syncmtumors,insitu,ncervix
10,2010,Вінницька,356,112,113,103
11,2010,Волинська,177,50,81,77
12,2010,Дніпропетровська,767,212,174,150
13,2010,Донецька,906,234,57,24
14,2010,Житомирська,285,94,33,30


#### F470100

In [7]:
f47_0100_df = pd.read_csv("F470100.csv")
f47_0100_df['category'] = f47_0100_df.category.str.replace(' ', '')
aggregated_f47_0100_df = f47_0100_df[f47_0100_df['category'] == "ВСЬОГО"]
aggregated_f47_0100_df = aggregated_f47_0100_df.drop(columns=['category'])

aggregated_f47_0100_df = aggregated_f47_0100_df[~aggregated_f47_0100_df.region.isin(exclude_regions)]
aggregated_f47_0100_df.ybeds = aggregated_f47_0100_df.ybeds.str.replace(',', '.').astype(float)
aggregated_f47_0100_df.ndoctors = aggregated_f47_0100_df.ndoctors.str.replace(',', '.').astype(float)
aggregated_f47_0100_df.nnursing = aggregated_f47_0100_df.nnursing.str.replace(',', '.').astype(float)

aggregated_f47_0100_df[aggregated_f47_0100_df.duplicated(subset=['region', 'year'], keep=False)]

Unnamed: 0,year,region,nhospotal,nbeds,ybeds,nill,nvillage_ill,bed_days,dvisits,hvisits,ndoctors,nnursing


#### F471700 - equipment

In [8]:
f_47_1700_df = pd.read_csv('F471700.csv')

categories_variables = {
    "nx_ray": "Чис.закл.,які мають:рентг.від.(каб)",
    "nflurography": "флюорографiчнi вiддiлення (кабiнети)",
    "nradiology": "радiологiчнi (променевої терапiї)",
    "nradlab": "лабораторії радіоізотопної діагност.",
    "nсt": "комп'ютерної томографії",
    "ncardiogram": "каб.електрокардіог.та функц.діагност.",
    "ndiaglab": "клініко-діагностичні лабораторії",
    "nbacter": "бактеріологічні лабораторії",
    "nbiochem": "біохімічні лабораторії",
    "ncyto": "цитологічні лабораторії",
    "nimun": "імунологічні лабораторії",
    "nphysic": "фізіотерапевтичні відділення (каб.)",
    "nendoscop": "ендоскопічні відділення (кабінети)",
    "nultrasound": "ультразвукової діагностики",
    "ndialysis": "відділення нефрології та діалізу"
}

equipment_dfs = []
for column_name, category in categories_variables.items():
    df = f_47_1700_df[f_47_1700_df['category'] == category][['year', 'region', 'total']]
    df = df.rename(columns={"total": column_name})
    if len(df) == 0:
        raise Exception("inconsistent")
    equipment_dfs.append(df)

equipment_df = equipment_dfs[0]
for df in equipment_dfs[1:]:
    equipment_df = pd.merge(equipment_df, df, on=["year", "region"], how="outer")


equipment_df = equipment_df[~equipment_df.region.isin(exclude_regions)]

equipment_df.head()

Unnamed: 0,year,region,nx_ray,nflurography,nradiology,nradlab,nсt,ncardiogram,ndiaglab,nbacter,nbiochem,ncyto,nimun,nphysic,nendoscop,nultrasound,ndialysis
0,2008,Івано-Франківська,72,48,2,0,3,137,208,6,0,3,1,208,35,52,5
2,2008,Волинська,47,31,1,1,3,122,137,3,1,2,0,139,26,36,2
3,2008,Вінницька,90,61,4,1,4,241,267,6,1,1,0,275,39,53,1
4,2008,Дніпропетровська,163,117,7,3,6,265,262,7,2,3,5,340,90,109,2
5,2008,Донецька,244,147,10,3,4,257,326,10,5,6,3,396,81,127,5


### Air pollution

In [9]:
air_pollution_df = pd.read_csv("air_polution.csv")
air_pollution_df = air_pollution_df.rename(columns={'Region': 'region'})

air_pollution_df = air_pollution_df[air_pollution_df['year'] >= 2008]
air_pollution_df = air_pollution_df.drop(columns=["Unnamed: 0"])

air_pollution_df = air_pollution_df[~air_pollution_df.region.isin(exclude_regions)]

air_pollution_df.head()

Unnamed: 0,region,year,air_pollution
451,Вінницька,2008,130.3
452,Волинська,2008,10.0
453,Дніпропетровська,2008,952.3
454,Донецька,2008,1533.4
455,Житомирська,2008,19.1


In [10]:
air_pollution_df.year.min(), air_pollution_df.year.max()

(np.int64(2008), np.int64(2023))

### Water pollution

In [11]:
water_pollution = pd.read_csv("merged_dumps.csv")
water_pollution = water_pollution.rename(columns={'Region': 'region'})

water_pollution = water_pollution[water_pollution['year'] >= 2008]

water_pollution = water_pollution[~water_pollution.region.isin(exclude_regions)]

water_pollution[water_pollution.not_cleaned_dumps == "–"] = np.nan

columns_to_convert = ['polluted_dumps', 'not_cleaned_dumps', 'dumps_not_cleaned_enough',	'num_clearing_plants']
for col in columns_to_convert:
    water_pollution[col] = pd.to_numeric(water_pollution[col], errors='coerce')



Imputation

In [12]:
missing_regions_years = pd.DataFrame(
    [("Вінницька",	2010),
    ("Миколаївська",	2010),
    ("Миколаївська",	2010),
    ("Полтавська",	2010),
    ("Миколаївська",	2010),
    ("Чернігівська",	2010),
    ("Полтавська",	2011),
    ("Чернігівська",	2011),
    ("Полтавська",	2008),
    ("Чернігівська",	2008),
    ("Полтавська",	2009),
    ("Чернігівська",	2009),
    ("Миколаївська",	2012),
    ("Полтавська",	2012),
    ("Чернігівська",	2012)],
    columns=['region', 'year']
)

water_pollution = pd.concat([water_pollution, missing_regions_years])

In [13]:
water_pollution_dfs = []

for region in water_pollution.region.unique():
    region_wp = water_pollution[water_pollution.region == region].sort_values(by='year')

    columns_to_impute = ['polluted_dumps', 'not_cleaned_dumps', 'dumps_not_cleaned_enough',	'num_clearing_plants']

    for column in columns_to_impute:
        region_wp[column] = region_wp[column].ffill().bfill()

    water_pollution_dfs.append(region_wp)

water_pollution = pd.concat(water_pollution_dfs)

water_pollution.head()

Unnamed: 0,region,year,polluted_dumps,not_cleaned_dumps,dumps_not_cleaned_enough,num_clearing_plants
451,Вінницька,2008.0,2.0,0.0,2.0,95.0
476,Вінницька,2009.0,2.0,0.0,2.0,98.0
0,Вінницька,2010.0,2.0,0.0,2.0,98.0
526,Вінницька,2011.0,2.0,0.0,2.0,102.0
551,Вінницька,2012.0,1.0,0.0,1.0,103.0


In [14]:
water_pollution.year.min(), water_pollution.year.max()

(np.float64(2008.0), np.float64(2022.0))

### GDP

In [15]:
gdp = pd.read_csv("gdp.csv")
gdp = gdp.rename(columns={"Region": "region"})

gdp = gdp[gdp['year'] >= 2008]

gdp = gdp[~gdp.region.isin(exclude_regions)]

gdp.head()

Unnamed: 0,region,year,gdp
101,Вінницька,2008,12061
102,Волинська,2008,12340
103,Дніпропетровська,2008,30918
104,Донецька,2008,26028
105,Житомирська,2008,11545


In [16]:
gdp.year.min(), gdp.year.max()

(np.int64(2008), np.int64(2021))

### CPI

In [17]:
cpi = pd.read_csv("inflation.csv", sep=r"\s+")
cpi.cpi = cpi.cpi.map(lambda x: float(x.replace(',', '.')))

cpi = cpi[cpi['year'] >= 2008]

cpi.head()

Unnamed: 0,year,cpi
8,2008,122.3
9,2009,112.3
10,2010,109.1
11,2011,104.6
12,2012,99.8


In [18]:
cpi.year.min(), cpi.year.max()

(np.int64(2008), np.int64(2024))

### Population

In [19]:
population = pd.read_csv("population.csv")
population = population.rename(columns={"Територіальний розріз": "region", "Період": "year", "Значення cпостереження": "population"})

population = population[population["year"] >= 2008]

population = population[~population.region.isin(exclude_regions)]

population.head()

Unnamed: 0,region,year,population
53,Вінницька,2008,1672217.0
54,Вінницька,2009,1659975.0
55,Вінницька,2010,1650608.0
56,Вінницька,2011,1641201.0
57,Вінницька,2012,1634187.0


In [20]:
population.year.min(), population.year.max()

(np.int64(2008), np.int64(2022))

# Datasets

### Age Group - dataset

In [21]:
age_group_incidence = pd.merge(incidence, aggregated_f47_0100_df, on=['year', 'region'], how='left')
age_group_incidence = pd.merge(age_group_incidence, equipment_df, on=["year", "region"], how="left")
age_group_incidence = pd.merge(age_group_incidence, gdp, on=["year", "region"], how="left")
age_group_incidence = pd.merge(age_group_incidence, air_pollution_df, on=["year", "region"], how="left")
age_group_incidence = pd.merge(age_group_incidence, water_pollution, on=["year", "region"], how="left")
age_group_incidence = pd.merge(age_group_incidence, cpi, on="year", how="left")
age_group_incidence = pd.merge(age_group_incidence, population, on=['year', 'region'], how='left')

age_group_incidence = age_group_incidence.drop_duplicates(subset=['year', 'region', 'category', 'age_group'])

age_group_incidence.to_csv("age_group_incidence.csv", index=False)

In [22]:
total_male_category = "Злоякiснi новоутворення-всього         C00-C97    ч"
total_female_category = "C00-C97    ж"
total_incidences_in_regions = age_group_incidence[age_group_incidence.category.isin([total_male_category, total_female_category])].groupby(['region', 'year']).agg({'incidence': 'sum'}).reset_index()
total_incidences_in_regions = total_incidences_in_regions.rename(columns={"incidence": "tincidence"})


### Stage dataset

In [26]:
stage_dataset = pd.merge(stage_incidence, aggregated_f47_0100_df, on=['year', 'region'], how='left')
stage_dataset = pd.merge(stage_dataset, equipment_df, on=["year", "region"], how="left")
# stage_dataset = pd.merge(stage_dataset, gdp, on=["year", "region"], how="left")
stage_dataset = pd.merge(stage_dataset, air_pollution_df, on=["year", "region"], how="left")
stage_dataset = pd.merge(stage_dataset, water_pollution, on=["year", "region"], how="left")
stage_dataset = pd.merge(stage_dataset, cpi, on="year", how="left")
stage_dataset = pd.merge(stage_dataset, population, on=['year', 'region'], how='left')
stage_dataset = pd.merge(stage_dataset, total_incidences_in_regions, on=['year', 'region'], how='left')

stage_dataset = stage_dataset.drop_duplicates(subset=['year', 'region'])

stage_dataset.to_csv("stage_incidence.csv", index=False)