In [1]:
from pyaxis import pyaxis
import pandas as pd
import numpy as np

In [2]:
px = pyaxis.parse(uri="data_umrli.PX", encoding='ISO-8859-2')

# store data as pandas dataframe
data_df = px['DATA']

data_df

Unnamed: 0,MESEC,DAN V MESECU,DATA
0,2000M01,Dan v mesecu - SKUPAJ,2000
1,2000M01,1,65
2,2000M01,2,69
3,2000M01,3,65
4,2000M01,4,63
...,...,...,...
9467,2024M08 (začasni podatki),27,60
9468,2024M08 (začasni podatki),28,62
9469,2024M08 (začasni podatki),29,68
9470,2024M08 (začasni podatki),30,69


In [3]:
data_df.iloc[370:390]

Unnamed: 0,MESEC,DAN V MESECU,DATA
370,2000M12,18,41
371,2000M12,19,49
372,2000M12,20,52
373,2000M12,21,52
374,2000M12,22,68
375,2000M12,23,52
376,2000M12,24,56
377,2000M12,25,59
378,2000M12,26,63
379,2000M12,27,45


In [4]:
data_df.iloc[-265:-250]

Unnamed: 0,MESEC,DAN V MESECU,DATA
9207,2023M12,23,75
9208,2023M12,24,66
9209,2023M12,25,71
9210,2023M12,26,78
9211,2023M12,27,68
9212,2023M12,28,66
9213,2023M12,29,69
9214,2023M12,30,73
9215,2023M12,31,79
9216,2024M01 (začasni podatki),Dan v mesecu - SKUPAJ,2097


Looks like we have data for each day after 2000 and monthly sums of deaths. Data from 2024 are not final yet.

Let's clean the data

In [5]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9472 entries, 0 to 9471
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   MESEC         9472 non-null   object
 1   DAN V MESECU  9472 non-null   object
 2   DATA          9306 non-null   object
dtypes: object(3)
memory usage: 222.1+ KB


In [6]:
def trnasform_day_to_numeric(val):
    return int(val) if val.isdigit() else np.nan

In [7]:
# remove monthly sums
data_df["DAN V MESECU"] = data_df["DAN V MESECU"].apply(trnasform_day_to_numeric)
data_df = data_df.dropna()

In [8]:
data_df.loc[:, "DATA"] = pd.to_numeric(data_df["DATA"])
data_df.loc[:, "DAN V MESECU"] = pd.to_numeric(data_df["DAN V MESECU"])

In [9]:
data_df["year"] = data_df.loc[:, "MESEC"].str.split(" ").apply(lambda val: int(val[0][:4]))
data_df["month"] = data_df.loc[:, "MESEC"].str.split(" ").apply(lambda val: int(val[0][5:]))
data_df["year_month"] = data_df.loc[:, "MESEC"].str.split(" ").apply(lambda val: val[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df["year"] = data_df.loc[:, "MESEC"].str.split(" ").apply(lambda val: int(val[0][:4]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df["month"] = data_df.loc[:, "MESEC"].str.split(" ").apply(lambda val: int(val[0][5:]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df["year_month"

In [10]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9010 entries, 1 to 9471
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MESEC         9010 non-null   object 
 1   DAN V MESECU  9010 non-null   float64
 2   DATA          9010 non-null   object 
 3   year          9010 non-null   int64  
 4   month         9010 non-null   int64  
 5   year_month    9010 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 492.7+ KB


In [11]:
data_clean = data_df[["year_month", "year", "month", "DAN V MESECU", "DATA"]].rename(columns={"DATA": "deaths", "DAN V MESECU": "day"})

In [12]:
data_clean

Unnamed: 0,year_month,year,month,day,deaths
1,2000M01,2000,1,1.0,65
2,2000M01,2000,1,2.0,69
3,2000M01,2000,1,3.0,65
4,2000M01,2000,1,4.0,63
5,2000M01,2000,1,5.0,64
...,...,...,...,...,...
9467,2024M08,2024,8,27.0,60
9468,2024M08,2024,8,28.0,62
9469,2024M08,2024,8,29.0,68
9470,2024M08,2024,8,30.0,69


In [13]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9010 entries, 1 to 9471
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   year_month  9010 non-null   object 
 1   year        9010 non-null   int64  
 2   month       9010 non-null   int64  
 3   day         9010 non-null   float64
 4   deaths      9010 non-null   object 
dtypes: float64(1), int64(2), object(2)
memory usage: 422.3+ KB


In [14]:
data_clean.describe()

Unnamed: 0,year,month,day
count,9010.0,9010.0,9010.0
mean,2011.837514,6.46859,15.730411
std,7.1234,3.438269,8.800954
min,2000.0,1.0,1.0
25%,2006.0,3.0,8.0
50%,2012.0,6.0,16.0
75%,2018.0,9.0,23.0
max,2024.0,12.0,31.0


In [15]:
data_clean["year"].unique()

array([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021,
       2022, 2023, 2024])

In [16]:
data_clean["month"].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [17]:
data_clean["day"].unique()

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
       27., 28., 29., 30., 31.])

In [18]:
data_clean['date'] = pd.to_datetime(data_clean[['year', 'month', 'day']])

In [19]:
data_clean.to_csv("data_clean_deaths.csv")