# Extract & Load *Indice des prix* 

## Setup

In [7]:
import os
import pathlib
import warnings
import re

from hydra import compose, initialize
from omegaconf import dictconfig
from omegaconf import OmegaConf
import openpyxl
import pandas as pd

### Project Variables

In [8]:
# Name of the configuration composed by get_config().
CONFIG_NAME = "indice_des_prix"

# Directory holding the source settings.
SOURCE_SETTINGS_PATH = pathlib.Path('/dataplatform_lab') / 'lab' / 'notebooks' / 'sources'

# Root of the data directory, with one sub-folder for the input extracts
# and one for the files written by the load step.
DATA_PATH = pathlib.Path('/dataplatform_lab') / 'lab' / 'dwh_data'
EXTRACTS_PATH = DATA_PATH / 'extracts'
LOAD_PATH = DATA_PATH / 'raw'

### Functions

In [13]:
def get_config() -> dictconfig.DictConfig:
    """Compose and return the Hydra configuration named ``CONFIG_NAME``.

    The configuration is looked up in ``SOURCE_SETTINGS_PATH``. The directory
    is handed to Hydra as an absolute path, so the result does not depend on
    the current working directory (deriving a path relative to
    ``os.getcwd()`` raises ``ValueError`` as soon as the working directory is
    not an ancestor of the settings directory).

    Returns:
        The composed configuration.
    """
    # Function-scope import: keeps this block self-contained with respect to
    # the notebook's import cell.
    from hydra import initialize_config_dir

    with initialize_config_dir(
        version_base=None,
        config_dir=str(SOURCE_SETTINGS_PATH)
    ):
        return compose(config_name=CONFIG_NAME)


def get_available_year(historic_years, year):
    """Return the ``historic_years`` years that precede ``year``, newest first.

    Example: ``get_available_year(4, 2022)`` gives ``[2021, 2020, 2019, 2018]``.
    ``year`` itself is never part of the result.
    """
    return [year - offset for offset in range(1, historic_years + 1)]


def get_years_to_load(historic_years, years):
    """Map each available data year to the file year it should be read from.

    Every file year in ``years`` covers the ``historic_years`` years that
    precede it (see ``get_available_year``). When several files cover the same
    data year, the most recent file wins.

    Args:
        historic_years: number of past years covered by each file.
        years: iterable of file years, in any order. It is not modified.

    Returns:
        A dict ``{data_year: file_year}``.
    """
    dc_years = {}
    # Newest file first so that an overlapping year is claimed by the most
    # recent file. `sorted` works on a copy: the caller's sequence is left
    # untouched (an in-place `.sort()` would reorder it as a side effect).
    for year in sorted(years, reverse=True):
        for available_year in get_available_year(historic_years, year):
            dc_years.setdefault(available_year, year)
    return dc_years


def get_excel_file(folder_path: pathlib.Path, excel_file_name: str) -> pd.ExcelFile:
    """Open the workbook ``excel_file_name`` located in ``folder_path``.

    Returns:
        A ``pd.ExcelFile`` for the joined path.
    """
    return pd.ExcelFile(pathlib.Path(folder_path).joinpath(excel_file_name))


def process_column_names(
    df: pd.DataFrame,
    dimension_cols: None | list = None,
    dimension_cols_rename: None | list  = None
):
    rename_dc = {}
    value_cols = df.columns.difference(dimension_cols)
    if dimension_cols_rename:
        rename_dc.update(
            {
                old:new for old, new in zip(
                    dimension_cols, dimension_cols_rename
                )
            }
        )
        
    for col in value_cols:
        renamed_col = ' '.join([x for x in re.findall(r"[^\W]*",str(col)) if x])
        rename_dc.update({col: renamed_col})

    return (
        df
        .rename(columns=rename_dc)
    )


def merge_multiline_labels(
    df: pd.DataFrame,
    dimension_cols: list
) -> pd.DataFrame:
    """Collapse labels written on two consecutive rows into a single row.

    A row is considered a *label-only* row when its first value column is NaN
    (value columns are all columns that are not in ``dimension_cols``, in the
    order returned by ``Index.difference``). Its labels are prepended to the
    labels of the next row that carries a value. Only the label-only row
    immediately preceding a value row is used: with several label-only rows
    in a row, the earlier ones are dropped.

    Args:
        df: frame with dimension (label) columns and value columns.
        dimension_cols: labels of the dimension columns.

    Returns:
        A frame with one row per value row: the (merged, stripped) labels
        followed by the value columns, re-indexed from 0.
    """
    dimension_cols = list(dimension_cols)
    col_years = list(df.columns.difference(dimension_cols))

    def _fragment(value):
        # Empty cells are NaN; they must not end up as the text "nan" inside
        # a merged label.
        return "" if pd.isna(value) else str(value).strip()

    ls_produits = []
    kept_positions = []
    previous = {}
    for position, (_, row) in enumerate(df.iterrows()):
        row_dc = row.to_dict()
        actual = {col: row_dc[col] for col in dimension_cols}
        if pd.isna(row_dc[col_years[0]]):
            # Label-only row: remember it for the next value row.
            previous = actual
            continue

        if previous:
            # The two fragments are joined without any separator.
            # NOTE(review): confirm that no space is expected between them.
            labels = {
                col: _fragment(previous[col]) + _fragment(actual[col])
                for col in dimension_cols
            }
        else:
            labels = {col: str(actual[col]).strip() for col in dimension_cols}
        ls_produits.append(labels)
        kept_positions.append(position)
        previous = {}

    df_produits = pd.DataFrame(ls_produits)

    # Take the values of exactly the rows that produced a label. Filtering
    # them with an independent criterion (e.g. dropping any row containing a
    # NaN) can yield a different number of rows and shift the values against
    # the labels.
    df_values = df[col_years].iloc[kept_positions].reset_index(drop=True)

    return pd.concat([df_produits, df_values], axis=1)


def prepare_df(config, df:pd.DataFrame) -> pd.DataFrame:
    """Clean a raw sheet: normalise column names, then merge multi-line labels.

    The dimension columns are selected by position through
    ``config.dimension_cols_index`` and renamed to
    ``config.dimension_cols_rename``.
    """
    dimension_positions = config.dimension_cols_index
    renamed_df = process_column_names(
        df,
        dimension_cols=df.columns[dimension_positions],
        dimension_cols_rename=config.dimension_cols_rename,
    )
    # Positions are unchanged by the renaming, so the same index selects the
    # renamed dimension columns.
    renamed_dimension_cols = renamed_df.columns[dimension_positions]
    return merge_multiline_labels(renamed_df, renamed_dimension_cols)


def save_df(df, path):
    """Write ``df`` to ``path`` as a parquet file.

    Missing parent directories of ``path`` are created first.

    Args:
        df: the DataFrame to write.
        path: destination ``pathlib.Path`` of the parquet file.
    """
    # exist_ok=True replaces the former "check, then create" sequence, which
    # could fail if the directory appeared between the two steps.
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(path, index=None)


def get_base_year(
    excel_file: pd.ExcelFile,
    sheet_name: str,
    base_year_cell: str,
) -> str | bool:
    """Read the base year of an index from a cell such as ``"Base 100 : 2017"``.

    Args:
        excel_file: the workbook, passed as-is to ``openpyxl.load_workbook``.
        sheet_name: name of the worksheet holding the cell.
        base_year_cell: cell reference, e.g. ``"A3"``.

    Returns:
        The four-digit year as a string when the cell text starts with
        ``Base 100 : <year>``; ``False`` otherwise (including when the cell
        does not contain text).
    """
    wb = openpyxl.load_workbook(excel_file)
    cell_value = wb[sheet_name][base_year_cell].value
    # A cell without text (empty, numeric, date...) cannot match and would
    # make re.match raise TypeError.
    if not isinstance(cell_value, str):
        return False
    match = re.match(r"Base\s100\s\:\s(?P<base_year>\d{4})", cell_value)
    return match.group("base_year") if match else False

### Classes

In [14]:
class YearFile:
    """One yearly Excel file contributing columns to a dataset version.

    The effective configuration is the dataset-version configuration
    overridden by the optional per-year ``config`` entry.
    """

    def __init__(self, dataset_version, year, year_file_dict=None):
        """Store the file metadata; no file is read here.

        Args:
            dataset_version: the owning ``DatasetVersion``.
            year: year of the file, used as key into the source file names.
            year_file_dict: optional per-year settings; only its ``config``
                entry is used. ``None`` is treated as empty.
        """
        # None instead of a shared mutable `{}` default.
        year_file_dict = year_file_dict or {}
        self.dataset_version = dataset_version
        self.year = year
        self.file_name = dataset_version.dataset.source.file_names_dc[year]
        year_file_config = year_file_dict.get('config', {})
        self.config = OmegaConf.merge(dataset_version.config, year_file_config)
        # Filled by load_data_as_df().
        self.raw_df = None
        self.dimensions_df = None
        self.prepared_df = None
        # Years this file can provide, as strings (the cleaned value-column
        # names are strings as well).
        self.available_years = [
            str(y) for y in get_available_year(self.config.historic_years, self.year)
        ]
        self.years_to_load = []

    def load_data_as_df(self):
        """Read the sheet and keep the columns assigned to this file.

        Sets ``raw_df`` (sheet as read), ``prepared_df`` (the year columns
        listed for this file in ``dataset_version.years_per_file``) and
        ``dimensions_df`` (the renamed dimension columns).
        """
        folder_path = pathlib.Path(EXTRACTS_PATH, f"Annuaire Statistique {self.year}")
        # The context manager closes the workbook handle once the sheet has
        # been read.
        with get_excel_file(folder_path, self.file_name) as excel_file:
            self.raw_df = pd.read_excel(
                excel_file,
                self.config.sheet_name,
                header=0,
                skiprows=self.config.skiprows,
                skipfooter=self.config.skipfooter
            )
        prepared_df_all_cols = prepare_df(self.config, self.raw_df)
        self.years_to_load = self.dataset_version.years_per_file.get(self.year, [])
        self.prepared_df = prepared_df_all_cols[self.years_to_load]
        self.dimensions_df = prepared_df_all_cols[self.config.dimension_cols_rename]



class DatasetVersion:
    """One version of a dataset, assembled from several yearly files.

    The version configuration is the dataset configuration overridden by the
    optional ``config`` entry of ``version_dict``.
    """

    def __init__(self, dataset, version_dict):
        """Build the version and its ``YearFile`` objects (no file is read).

        Args:
            dataset: the owning ``Dataset``.
            version_dict: mapping with the keys ``version`` and ``years`` and
                an optional ``config``.
        """
        self.dataset = dataset
        version_config = version_dict.get('config', {})
        self.config = OmegaConf.merge(dataset.config, version_config)
        self.version_num = version_dict["version"]
        self.historic_years = self.config.historic_years
        self.years_dc = version_dict["years"]
        self.years = list(self.years_dc.keys())
        self.files_to_load = self.get_files_to_load_per_year()
        self.years_config = self.get_years_config()
        self.year_files = []
        self.years_per_file = {}
        self.process_year_files()

    def process_year_files(self):
        """Create one ``YearFile`` per configured year, newest first."""
        for year, year_file_dict in self.years_dc.items():
            # A year may be declared without any settings.
            self.year_files.append(YearFile(self, year, year_file_dict or {}))
        self.year_files.sort(key=lambda x: x.year, reverse=True)
        self.years_per_file = self.get_years_per_file()

    def get_years_per_file(self):
        """Return ``{file_year: [data years to read from that file]}``.

        ``year_files`` is walked in its current order and a data year is
        assigned to the first file that offers it.
        """
        dc_years = {}
        for year_file in self.year_files:
            for available_year in year_file.available_years:
                dc_years.setdefault(available_year, year_file.year)
        dc_years_to_load_per_file = {y: [] for y in self.years}
        for available_year, file_year in dc_years.items():
            dc_years_to_load_per_file[file_year].append(available_year)
        return dc_years_to_load_per_file

    def get_data(self):
        """Load every yearly file and concatenate the result column-wise.

        The dimension columns come from the first entry of ``year_files``.
        NOTE(review): the column-wise concat presumably relies on every file
        listing the same rows in the same order; confirm.
        """
        for year_file in self.year_files:
            year_file.load_data_as_df()
        dimensions_df = self.year_files[0].dimensions_df
        ls_dfs = [dimensions_df, *[y.prepared_df for y in self.year_files]]
        return pd.concat(ls_dfs, axis=1)

    @property
    def fqtn(self):
        """Table name suffixed with the version number, e.g. ``name_v2``."""
        return f"{self.dataset.table_name}_v{self.version_num}"

    def __repr__(self):
        return self.fqtn

    def get_files_to_load_per_year(self):
        """Return ``{data_year: file name to read that year from}``."""
        years_to_load_dc = get_years_to_load(
            self.historic_years,
            self.years
        )
        file_names_dc = self.dataset.source.file_names_dc
        return {k: file_names_dc[v] for k, v in years_to_load_dc.items()}

    def get_years_config(self):
        """Return ``{year: per-year config}``, ``{}`` when none is given."""
        # `.get` (as used for the other optional `config` entries) also
        # covers a year entry that has settings but no `config` key.
        return {
            year: (year_dc.get('config', {}) if year_dc else {})
            for year, year_dc in self.years_dc.items()
        }

    @property
    def load_path(self):
        """Destination: ``LOAD_PATH/<source schema>/<fqtn>.parquet``."""
        return pathlib.Path(
            LOAD_PATH,
            self.dataset.source.schema,
            f"{self.fqtn}.parquet"
        )

    def load(self):
        """Build the version's data and write it to ``load_path``."""
        df = self.get_data()
        save_df(df, self.load_path)


class Dataset:
    """A dataset of a source, holding one ``DatasetVersion`` per entry of
    its ``versions`` setting."""

    def __init__(self, source, dataset_dict):
        """Read the dataset settings and build its versions.

        ``dataset_dict`` must expose ``name`` and ``table_name`` as
        attributes, ``versions`` as a key and, optionally, ``config``.
        """
        self.source = source
        self.name = dataset_dict.name
        self.table_name = dataset_dict.table_name
        self.config = dataset_dict.get('config', {})
        self.type_data = self.config.get('type')
        self.versions_ls = dataset_dict["versions"]
        self.versions = []
        self.process_versions()

    def process_versions(self):
        """Append a ``DatasetVersion`` for every configured version."""
        self.versions.extend(
            DatasetVersion(self, version_dict)
            for version_dict in self.versions_ls
        )

    def extract(self):
        """Load every version of the dataset, in order."""
        for dataset_version in self.versions:
            dataset_version.load()


class Source:
    """A data source: its file names, target schema and datasets."""

    def __init__(self, source_config):
        """Read the source settings and build its datasets.

        ``source_config`` must expose ``name``, ``file_names``, ``schema``
        and ``datasets`` as attributes.
        """
        self.name = source_config.name
        self.file_names_dc = source_config.file_names
        self.schema = source_config.schema
        self.datasets_ls = source_config.datasets
        self.datasets = []
        self.process_datasets()

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name

    def process_datasets(self):
        """Append a ``Dataset`` for every configured dataset."""
        self.datasets.extend(
            Dataset(self, dataset_dict) for dataset_dict in self.datasets_ls
        )

    def extract(self):
        """Run the extraction of every dataset, in order."""
        for source_dataset in self.datasets:
            source_dataset.extract()


## Data Extraction

In [15]:
# Compose the source configuration.
config = get_config()
# Building the Source also builds its datasets (Source.__init__ calls
# process_datasets); nothing is read from disk by the visible constructors.
ipp_source = Source(config)
# Run the extraction of every dataset of the source.
ipp_source.extract()

## Debug

In [19]:
# Read one parquet file back to inspect its content (presumably a file
# written by the extraction above; the path is hard-coded).
pd.read_parquet(
    "/dataplatform_lab/lab/dwh_data/raw/indice_des_prix/ipc_national_v2.parquet"

)

Unnamed: 0,libelle_fr,libelle_ar,2021,2020,2019,2018
0,Produits alimentaires et boissons non alcoolisées,المواد الغذائية والمشروبات غير الكحولية,102.2,101.6,100.6,102.0
1,Produits alimentaires,المواد الغذائية,102.4,101.7,100.7,102.1
2,Boissons non alcoolisées,المشروبات غير الكحولية,99.4,99.4,99.4,99.7
3,Boissons alcoolisées et tabac,المشروبات الكحولية و التبغ,125.5,121.2,120.2,108.4
4,Boissons alcoolisées,المشروبات الكحولية,102.4,99.8,100.6,101.4
5,Tabac,التبغ,126.4,122.1,121.0,108.7
6,Articles d'habillement et chaussures,الملابس والأحذية,104.7,102.7,102.4,101.2
7,Articles d'habillement,الملابس,104.2,102.3,101.9,100.9
8,Chaussures,الأحذية,107.0,104.6,104.6,102.4
9,"Logement, eau, électricité, gaz et autres comb...",السكن والماء والكهرباء والغاز ومحروقات أخرى,102.6,101.8,101.2,100.5
