In [3]:
import pandas as pd
import os

# Romanian month name to number
romanian_months = {
    'ianuarie': 1, 'februarie': 2, 'martie': 3, 'aprilie': 4,
    'mai': 5, 'iunie': 6, 'iulie': 7, 'august': 8,
    'septembrie': 9, 'octombrie': 10, 'noiembrie': 11, 'decembrie': 12
}

# Extract date from folder name (format: statistica-aprilie-2017)
def extract_date_from_folder(folder_name):
    parts = folder_name.lower().split('-')
    if len(parts) == 3:
        ro_month = parts[1]
        year = parts[2]
        month_number = romanian_months.get(ro_month)
        if month_number:
            return pd.to_datetime(f"{year}-{month_number}-1")
    return None

# Clean and transform a single cereri.xlsx file
def clean_applications_file(filepath, date):
    # Read with single header, skip 2 rows
    df = pd.read_excel(filepath, header=0, skiprows=2)

    # Drop first column (empty or index)
    df = df.drop(df.columns[0], axis=1)

    # Rename columns
    df.columns = ['county', 'request_type', 'online', 'window']

    # Fill missing county names
    df['county'] = df['county'].ffill()

    # Remove total rows
    df = df[df['county'].str.upper() != 'TOTAL']

    # Melt into long format
    df_melted = df.melt(
        id_vars=['county', 'request_type'],
        value_vars=['online', 'window'],
        var_name='request_method',
        value_name='number_of_requests'
    )

    # Translate request_type into English
    df_melted['request_type'] = df_melted['request_type'].replace({
        'altele': 'other',
        'informare': 'information',
        'inscriere': 'registration',
        'receptie': 'reception',
    })

    # Add date column
    df_melted.insert(0, 'date', date)

    return df_melted

# Loop through all folders and extract data from cereri.xlsx
def process_application_data(data_path='data', output_dir='cleaned_tables'):
    os.makedirs(output_dir, exist_ok=True)
    all_data = []

    for folder_name in os.listdir(data_path):
        folder_path = os.path.join(data_path, folder_name)
        if os.path.isdir(folder_path):
            date = extract_date_from_folder(folder_name)
            if date is None:
                continue

            for filename in os.listdir(folder_path):
                if filename.endswith('.xlsx') and 'cereri' in filename.lower():
                    file_path = os.path.join(folder_path, filename)
                    try:
                        cleaned_df = clean_applications_file(file_path, date)
                        all_data.append(cleaned_df)
                    except Exception as e:
                        print(f"⚠️ Error processing {file_path}: {e}")

    if all_data:
        final_df = pd.concat(all_data, ignore_index=True)
        output_file = os.path.join(output_dir, 'applications_table.csv')
        final_df.to_csv(output_file, index=False)
        print(f"✅ Applications CSV saved to: {output_file}")
    else:
        print("⚠️ No valid applications files were found.")

# Run the full process
process_application_data()


⚠️ Error processing data/statistica-aprilie-2024/2024-ian_aprilie_cereri.xlsx: Length mismatch: Expected axis has 5 elements, new values have 4 elements
✅ Applications CSV saved to: cleaned_tables/applications_table.csv


In [4]:
df = pd.read_csv('cleaned_tables/applications_table.csv')
display(df.head())
display(df.columns.to_list())

Unnamed: 0,date,county,request_type,request_method,number_of_requests
0,2022-06-01,ALBA,other,online,0.0
1,2022-06-01,ALBA,information,online,2654.0
2,2022-06-01,ALBA,registration,online,2398.0
3,2022-06-01,ALBA,reception,online,1145.0
4,2022-06-01,ARAD,other,online,0.0


['date', 'county', 'request_type', 'request_method', 'number_of_requests']