# Data Curation
Transforming the raw data into a format that is suitable for analysis.


In [5]:
import os
import pandas as pd
import re

## Data from MagnusWeb

In [6]:

# Resolve project paths relative to the notebook's working directory
# (`__file__` is not defined inside a Jupyter kernel, so os.getcwd() is used).
script_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(script_dir, ".."))

input_file = os.path.join(project_root, "data", "source_raw", "magnusweb", "export-7.csv")
output_folder = os.path.join(project_root, "data", "source_cleaned")

# Create the output folder if it doesn't exist (no separate existence check needed)
os.makedirs(output_folder, exist_ok=True)

# Read the CSV file with semicolon delimiter and proper quoting.
# IČO is an identifier, not a quantity: read it as text so it is kept exactly as
# exported. Letting pandas parse it as a number would drop leading zeros (or add a
# ".0" suffix when the column has gaps) before it is cast to string later on.
df = pd.read_csv(input_file, delimiter=';', quotechar='"', dtype={"IČO": str})


In [7]:
# --------------------------------------
# 1. Separate the static columns from the columns to melt
# --------------------------------------
# Static (identifier) columns: kept as-is and repeated on every melted row.
id_cols = [
    "IČO",
    "Název subjektu",
    "Hlavní NACE",
    "Hlavní NACE - kód",
    "Vedlejší NACE CZ",
    "Vedlejší NACE CZ - kód",
    "Hlavní OKEČ",
    "Hlavní OKEČ - kód",
    "Vedlejší OKEČ",
    "Vedlejší OKEČ - kód",
    "Institucionální sektory (ESA 2010)",
    "Institucionální sektory (ESA 95)",
    "Lokalita",
    "Kraj",
    "Počet zaměstnanců",
    "Kategorie obratu",
    "Audit",
    "Konsolidace",
    "Měna",
    "Datum vzniku",
    "Datum zrušení",
    # TODO: still undecided whether these two belong with the static columns
    "Rok",
    "Čtvrtletí",
]

# Every remaining column is assumed to be a time-coded measure to unpivot.
static_names = set(id_cols)
time_cols = [name for name in df.columns if name not in static_names]

# --------------------------------------
# 2. Melt the DataFrame (wide -> long)
# --------------------------------------
melted = pd.melt(
    df,
    id_vars=id_cols,          # carried through unchanged
    value_vars=time_cols,     # unpivoted into (raw_variable, value) pairs
    var_name="raw_variable",
    value_name="value",
)

# --------------------------------------
# 3. Define a function to parse raw_variable into (year, quarter, metric)
# --------------------------------------
def parse_colname(col):
    """
    Split a melted column label into its time prefix and metric name.

    Recognised layouts (surrounding whitespace is stripped first):
      - '2023/4Q Aktiva celkem'  -> ('2023', '4Q', 'Aktiva celkem')
      - '4Q/2001 Tržby Výkony'   -> ('2001', '4Q', 'Tržby Výkony')
      - '2023 Kategorie obratu'  -> ('2023', None, 'Kategorie obratu')
      - 'Hospodářský výsledek před zdaněním' -> (None, None, <the label>)

    Returns a (year, quarter, metric) tuple. Year and quarter are returned
    as strings (or None when the label carries no such component).
    """
    label = col.strip()

    # Quarterly labels: either 'YYYY/nQ <metric>' or 'nQ/YYYY <metric>'.
    # The two alternatives use suffixed group names (…1 / …2).
    quarterly = re.match(r"""
        ^
        (?:
          (?P<year1>\d{4})           # group year
          /(?P<qtr1>\dQ)             # slash Q
          \s+(?P<metric1>.*)        # remainder
        )
        |
        (?:
          (?P<qtr2>\dQ)
          /(?P<year2>\d{4})
          \s+(?P<metric2>.*)
        )
        $ 
    """, label, re.VERBOSE)
    if quarterly is not None:
        # Pick the group suffix of whichever alternative actually matched.
        branch = "1" if quarterly.group("year1") is not None else "2"
        return (
            quarterly.group("year" + branch),
            quarterly.group("qtr" + branch),
            quarterly.group("metric" + branch).strip(),
        )

    # Yearly labels: four digits, whitespace, then the metric name.
    yearly = re.match(r"^(\d{4})\s+(.*)$", label)
    if yearly is not None:
        year, metric = yearly.groups()
        return year, None, metric.strip()

    # No recognisable time prefix: the whole label is the metric.
    return None, None, label

# Apply the parser.
# Many melted rows share the same original column label, so each distinct label is
# parsed once and the result is broadcast back with `map`. This avoids building a
# pd.Series per row, which is slow on a long melted frame.
raw_labels = melted["raw_variable"]
parsed_labels = {label: parse_colname(label) for label in raw_labels.unique()}
for position, field in enumerate(("year", "quarter", "metric")):
    # parse_colname returns (year, quarter, metric); take one component per column.
    melted[field] = raw_labels.map(
        {label: parts[position] for label, parts in parsed_labels.items()}
    )

# Optionally, you might want to convert year to numeric and 
# quarter to just an integer (if '4Q' => 4).
def quarter_to_int(q):
    """
    Convert a quarter label such as '1Q', '2Q', '3Q', '4Q' to the integer 1..4.

    Returns None for missing values (None, NaN, pd.NA) and for anything that is
    not a valid quarter, e.g. 'Q', 'xQ', '9Q' or a string without the 'Q' suffix.
    Values that are already whole numbers in 1..4 are returned as int, so applying
    the function to an already-converted column is harmless.
    """
    if isinstance(q, str):
        label = q.strip()
        digits = label[:-1] if label.endswith("Q") else ""
        # isdecimal() guards int() against empty or non-numeric prefixes.
        if digits.isdecimal() and 1 <= int(digits) <= 4:
            return int(digits)
        return None

    # bool is a subclass of int; True/False are not quarters.
    if isinstance(q, bool):
        return None
    try:
        number = float(q)
    except (TypeError, ValueError):
        # None, pd.NA and other non-numeric objects end up here.
        return None
    # NaN and infinities fail is_integer(), so they fall through to None.
    if number.is_integer() and 1 <= number <= 4:
        return int(number)
    return None

# Normalise the parsed time fields: year becomes numeric (unparseable -> NaN),
# quarter goes through quarter_to_int.
melted["year"] = pd.to_numeric(melted["year"], errors="coerce")
melted["quarter"] = melted["quarter"].apply(quarter_to_int)

# --------------------------------------
# 4. Clean up columns
# --------------------------------------
# The original column label has been split into year / quarter / metric,
# so it is no longer needed.
melted = melted.drop(columns=["raw_variable"])

# Final column order: firm-level columns first, then the parsed time/metric fields.
# "Rok" and "Čtvrtletí" were kept as id_vars during the melt but are not listed
# here, so they are dropped by this selection.
firm_level_cols = [
    "IČO",
    "Název subjektu",
    "Hlavní NACE",
    "Hlavní NACE - kód",
    "Vedlejší NACE CZ",
    "Vedlejší NACE CZ - kód",
    "Hlavní OKEČ",
    "Hlavní OKEČ - kód",
    "Vedlejší OKEČ",
    "Vedlejší OKEČ - kód",
    "Institucionální sektory (ESA 2010)",
    "Institucionální sektory (ESA 95)",
    "Lokalita",
    "Kraj",
    "Počet zaměstnanců",
    "Kategorie obratu",
    "Audit",
    "Konsolidace",
    "Měna",
    "Datum vzniku",
    "Datum zrušení",
]
parsed_cols = ["year", "quarter", "metric", "value"]
final_cols = firm_level_cols + parsed_cols
melted = melted.loc[:, final_cols]


In [8]:
# Czech source headers -> short English snake_case names.
rename_mapping = {
    # firm identity
    "IČO": "ico",
    "Název subjektu": "name",
    # industry classification
    "Hlavní NACE": "main_nace",
    "Hlavní NACE - kód": "main_nace_code",
    "Vedlejší NACE CZ": "sub_nace_cz",
    "Vedlejší NACE CZ - kód": "sub_nace_cz_code",
    "Hlavní OKEČ": "main_okec",
    "Hlavní OKEČ - kód": "main_okec_code",
    "Vedlejší OKEČ": "sub_okec",
    "Vedlejší OKEČ - kód": "sub_okec_code",
    # institutional sector
    "Institucionální sektory (ESA 2010)": "esa2010",
    "Institucionální sektory (ESA 95)": "esa95",
    # location
    "Lokalita": "locality",
    "Kraj": "region",
    # size / reporting attributes
    "Počet zaměstnanců": "num_employees",
    "Kategorie obratu": "turnover_cat",
    "Audit": "audit",
    "Konsolidace": "consolidation",
    "Měna": "currency",
    # lifecycle dates
    "Datum vzniku": "date_founded",
    "Datum zrušení": "date_dissolved",
    # parsed columns already carry their final names (identity entries)
    "year": "year",
    "quarter": "quarter",
    "metric": "metric",
    "value": "value",
}

# Rebind instead of renaming in place; the resulting frame is the same.
melted = melted.rename(columns=rename_mapping)

In [9]:
# Czech metric labels -> English identifiers. Some labels appear both with and
# without a comma, so both spellings map to the same name.
measure_mapping = {
    # income statement
    "Hospodářský výsledek před zdaněním": "profit_pre_tax",
    "Hospodářský výsledek za účetní období": "profit_net",
    "Provozní hospodářský výsledek": "oper_profit",
    "Náklady": "costs",
    "Obrat, Výnosy": "sales_revenue",
    "Obrat Výnosy": "sales_revenue",
    "Tržby, Výkony": "turnover",
    "Tržby Výkony": "turnover",
    # balance sheet: assets
    "Aktiva celkem": "total_assets",
    "Stálá aktiva": "fixed_assets",
    "Oběžná aktiva": "current_assets",
    "Ostatní aktiva": "other_assets",
    # balance sheet: liabilities and equity
    "Pasiva celkem": "total_liabilities",
    "Vlastní kapitál": "equity",
    "Cizí zdroje": "debt",
    "Ostatní pasiva": "other_liabilities",
}

# Labels without an entry in the mapping are left unchanged by replace().
translated_metric = melted["metric"].replace(measure_mapping)
melted["metric"] = translated_metric

In [10]:
## Data Cleaning

# Currency: translate the Czech names to "CZK"/"EUR" first; the column is turned
# into a category together with the other categorical columns below.
currency_map = {
    "Česká koruna": "CZK",
    "Euro": "EUR"
}
melted["currency"] = melted["currency"].replace(currency_map)

# ICO (firm ID) is an identifier, so it is stored as a string rather than a number.
melted["ico"] = melted["ico"].astype(str)

# Dates: parse to datetime; values that cannot be parsed become NaT
# (date_dissolved may be entirely missing).
# NOTE(review): no explicit format/dayfirst is passed, so pandas infers the date
# layout. Confirm this matches the layout used in the export.
for date_col in ("date_dissolved", "date_founded"):
    melted[date_col] = pd.to_datetime(melted[date_col], errors="coerce")

# Descriptive / classification columns are stored as pandas categories.
for cat_col in (
    "audit",
    "consolidation",
    "currency",
    "esa2010",
    "esa95",
    "locality",
    "region",
    "main_nace",
    "main_nace_code",
    "sub_nace_cz",
    "sub_nace_cz_code",
    "main_okec",
    "main_okec_code",
    "sub_okec",
    "sub_okec_code",
    "turnover_cat",
    "metric",  # assumes the measure name mapping has already been applied
):
    melted[cat_col] = melted[cat_col].astype("category")

# Integer-like columns use the nullable Int64 dtype so missing values survive.
for int_col in ("num_employees", "quarter", "year"):
    melted[int_col] = melted[int_col].astype("Int64")

# 'value' (the measure itself) is left untouched.


In [None]:
# Mapping dictionaries for audit and consolidation (Czech -> English)
audit_map = {"Ano": "Yes", "Ne": "No"}
consolidation_map = {"Ano": "Yes", "Ne": "No"}

# Both columns are categorical at this point. Series.replace on a categorical
# column is deprecated in recent pandas (FutureWarning), so the categories
# themselves are renamed instead. Categories without an entry in the mapping are
# passed through unchanged and missing values (NaN) stay missing.
# astype("category") is a no-op for an already-categorical column and keeps this
# cell usable if the columns are still plain strings.
melted["audit"] = melted["audit"].astype("category").cat.rename_categories(audit_map)
melted["consolidation"] = (
    melted["consolidation"].astype("category").cat.rename_categories(consolidation_map)
)

In [12]:
# Convert datetime columns to only the date (Python date objects).
# The columns are passed through pd.to_datetime first so that the cell can be
# re-run: after the first run they hold plain date objects (object dtype), on which
# the .dt accessor is not available.
for date_col in ("date_founded", "date_dissolved"):
    melted[date_col] = pd.to_datetime(melted[date_col], errors="coerce").dt.date

In [None]:
# Summary of the data: column name, data type, number of unique values, first 5 values
summary = melted.dtypes.to_frame(name='data_type')
summary["num_unique"] = melted.nunique()
# Build the per-column samples explicitly. DataFrame.apply with a list-returning
# function yields a DataFrame (not a Series of lists) whenever all returned lists
# have the same length, which would break this column assignment.
summary["first_5_values"] = pd.Series(
    {col: melted[col].unique()[:5].tolist() for col in melted.columns}
)
summary = summary.reset_index()
summary.columns = ["column_name", "data_type", "num_unique", "first_5_values"]
summary = summary.sort_values(by="column_name")

# Display the summary
summary


In [None]:
# --------------------------------------
# Persist the tidy frame as a snappy-compressed parquet file (pyarrow engine)
parquet_options = {"engine": "pyarrow", "compression": "snappy"}
output_file = os.path.join(output_folder, "magnusweb_tidy.parquet")
melted.to_parquet(output_file, **parquet_options)

print("Tidy data saved to:", output_file)

In [None]:
# Distinct (main NACE description, main NACE code) pairs, ordered by code
nace_codes = (
    melted[["main_nace", "main_nace_code"]]
    .drop_duplicates()
    .sort_values(by="main_nace_code")
    .reset_index(drop=True)
)
nace_codes.columns = ["NACE", "NACE_code"]

# Show the first 30 pairs
nace_codes.iloc[:30]


In [None]:
# NACE codes: table of code length (number of characters) -> how many codes have that length
nace_codes["NACE_code"].str.len().value_counts()

In [None]:
# Codes shorter than 6 characters, ordered by code
is_short_code = nace_codes["NACE_code"].str.len() < 6
nace_codes[is_short_code].sort_values(by="NACE_code")


In [None]:
# Display the full NACE description/code table built above
nace_codes