In [1]:
import numpy as np
import pandas as pd
import os
import glob


In [2]:
# Load every CSV found directly under `path` into its own module-level
# DataFrame, named after the file (without extension), and collect the
# distinct LOCATION values seen across all files.
# NOTE(review): frames are injected via globals() because later cells read
# globals()[var]; a file whose stem matches an existing name (e.g. "pd.csv")
# would silently overwrite that name.
path = 'files/'
var_list = []   # one entry per CSV (file name without extension), in load order
countries = []  # distinct LOCATION values, in first-seen order

# glob.glob returns matches in arbitrary order; sort so that var_list and
# countries come out in the same order on every run.
for f in sorted(glob.glob(os.path.join(path, '*.csv'))):
    variable_name = os.path.splitext(os.path.basename(f))[0]
    var_list.append(variable_name)
    # parse the 'TIME' column as datetimes and use it as the index
    globals()[variable_name] = pd.read_csv(f, parse_dates=['TIME'], index_col='TIME')
    # gather this file's LOCATION values (duplicates are removed below)
    countries.extend(globals()[variable_name].LOCATION)

# De-duplicate once after loading, keeping first-seen order. Doing this per
# appended row gives the same result but rebuilds the list for every row.
countries = list(dict.fromkeys(countries))
    



In [3]:
# Build a formal description per variable, used later as the main Excel sheet
# name: the values at column positions 1, 2 and 3 (0-based) of the frame's
# first row, joined by single spaces.
# NOTE(review): presumably these three columns identify the indicator, subject
# and measure; confirm against the CSV layout.
# This reads the frames as loaded, so it has to run before the pivot cell
# replaces them with wide tables.
variable_description = {}

for var in var_list:
    first_row = globals()[var].iloc[0]
    # Positional access goes through .iloc: the row is a Series labelled by
    # column name, and an integer key in plain [] is only treated as a
    # position through a deprecated fallback.
    variable_description[var] = str(first_row.iloc[1] + ' ' + first_row.iloc[2] + ' ' + first_row.iloc[3])

In [4]:
# Reshape each variable to wide format (index kept as is, one column per
# LOCATION, cells taken from 'Value'), then pad it with all-NaN columns so
# that every variable has a column for every country found in any file.
# The pivoted frame replaces the loaded one under the same global name.
for var in var_list:
    wide = globals()[var].pivot(columns='LOCATION', values='Value')
    globals()[var] = wide
    for country in countries:
        if country not in wide.columns:
            # this variable has no rows for the country: add an empty column
            wide[country] = np.nan


In [5]:
# For each variable and country, collect the dates that have no value but lie
# between that country's first and last valid observation (both inclusive).
# Result: all_missing_dates[var][country] -> index of the gap dates.
all_missing_dates = {}

for var in var_list:
    frame = globals()[var]
    all_missing_dates[var] = {}
    for country in countries:
        series = frame[country]
        first_valid = series.first_valid_index()
        if first_valid is None:
            # No observation at all for this country: first/last_valid_index()
            # return None, and slicing None:None would select the whole series
            # and report every date as a gap. There is no valid range, so
            # record an empty set of gaps instead.
            all_missing_dates[var][country] = series.index[:0]
            continue
        # label-based slice, inclusive on both ends
        observed_span = series.loc[first_valid:series.last_valid_index()]
        # keep only the index labels whose value is missing
        all_missing_dates[var][country] = observed_span.index[observed_span.isna().to_numpy()]

In [7]:
# Summarise, per variable, the first and last index label holding a valid
# value for every country. Each entry is a dict of two-element lists
# ([first, last]) keyed by country, plus an 'index' key that labels the rows.
first_last_date = {}

for var in var_list:
    frame = globals()[var]
    bounds = {'index': ['first', 'last']}
    bounds.update({
        country: [frame[country].first_valid_index(), frame[country].last_valid_index()]
        for country in countries
    })
    first_last_date[var] = bounds

In [8]:
# Export one workbook per variable ("<var>.xlsx", relative to the working
# directory) with three sheets: the wide table, the first/last valid dates,
# and the gap dates per country.
# NOTE: `path` is reassigned here, replacing the input folder path set in the
# loading cell.
for var in var_list:
    path = "%s.xlsx" % var
    main_sheet = variable_description[var]
    with pd.ExcelWriter(path) as writer:
        # sheet 1: the wide table, named after the variable's description
        globals()[var].to_excel(writer, sheet_name=main_sheet, index=True)

        # sheet 2: first/last valid date per country
        bounds_table = pd.DataFrame.from_dict(first_last_date[var], orient='columns')
        bounds_table.to_excel(writer, sheet_name="first_last", index=True)

        # sheet 3: gap dates, one column per country, rows numbered from 1
        all_missing = pd.DataFrame.from_dict(all_missing_dates[var], orient='index').T
        all_missing.index = np.arange(1, len(all_missing) + 1)
        all_missing.to_excel(writer, sheet_name="all_missing", index=True)
    