In [None]:
"""
Expected data layout and required files:

DATA_DIR
├── UK_65
│    ├── *_PERSON.csv
│    ├── *_CONTACT_DIAGNOSTICS.csv
│    ├── *_CONTACT_PRESCRIPTIONS.csv
│    ├── *_MEASURE_CHANGES.csv
│    ├── *_SOCIAL_CHANGES.csv
│    ├── *_ATTRIBUTES_CHANGES.csv
│    └── *_PERSON_MEASURE_CHANGES.csv
│
├── UK_70
├── FR_65
└── FR_70

MAPPING_DIR
├── mapping.csv
├── complementary_icd10_mapping_UK.pkl # dictionary mapping labels to ICD-10 codes
└── complementary_icd10_mapping_FR.pkl # dictionary mapping labels to ICD-10 codes
"""

# Input / output locations, relative to the notebook's working directory.
DATA_DIR = '../../../data/Extractions_EU'
MAPPING_DIR = '../../../data/mapping'
OUTPUT_DIR = '../../../data/datasets'

# Each generated dataset is meant to be stored in a directory named
#   /localdrive10TB/users/charlotte.montaud/datasets/COUNTRY_AGEsuffix
# where COUNTRY is either UK or FR and AGE is either 65 or 70.
# The suffix identifies the processing variant that was applied to the dataset.
# Suffixes currently present in the author's repository:
#   ''          nothing has been done
#   'bis'       dataset that includes diagnosis columns
#   'mci'       dataset that includes diagnosis columns and does not remove
#               patients who had MCI before 2010
#   'from2000'  'mci' dataset that also takes health information from 2000
#               onwards into account (and not only from 2008)
#   'allmeds'   'mci' dataset with all medications taken into account
#               (not only the 50 most frequent)
# NOTE(review): `suffix` is not referenced by any visible cell; the save_dataset
# calls below write to OUTPUT_DIR/COUNTRY_AGE -- confirm whether it should be appended.
# NOTE(review): the value mentions 2005 while the parameters cell sets
# start_year=2008 -- confirm which one is intended.

suffix = 'from2005_withfreq'

In [None]:
from functools import reduce

# ICD-10 codes grouped by disease of interest: label -> set of codes.
# Key insertion order matters because DISEASES_OF_INTEREST is derived from it.
CODES = {
    'alzheimer': {
        'F00',  # "Dementia in Alzheimer disease (G30.- †)"
        'G30',  # "Alzheimer disease"
    },
    'parkinson': {
        'G20',  # "Parkinson disease"
        # 'G23.2', # "Multiple system atrophy, parkinsonian type [MSA-P]"
        # 'G23.3', # "Multiple system atrophy, cerebellar type [MSA-C]"
    },
    'vascular_dementias': {
        'F01',  # "Vascular dementia"
    },
    # Mild cognitive impairment
    'mci': {
        'F06.7',  # "Mild cognitive disorder"
        'R41.8',  # "Other and unspecified symptoms and signs involving cognitive functions and awareness"
        'R41',    # "Other symptoms and signs involving cognitive functions and awareness"
    },
    'alcohol_dementias': {
        'F10.6',  # "Amnesic syndrome due to alcohol"
        # 'F02.8', # "Dementia in other diseases classified elsewhere" (+ alcohol intoxication)
        'G31.2',  # "Degeneration of nervous system due to alcohol"
        'E51.2',  # "Wernicke encephalopathy"
    },
    'frontotemporal_dementias': {
        'G31.0',  # "Circumscribed brain atrophy"
        'F02.0',  # "Dementia in Pick disease (G31.0†)"
    },
    'other_dementias': {
        'F02',    # "Dementia in other diseases classified elsewhere"
        'F03',    # "Unspecified dementia"
        'F04',    # "Organic amnesic syndrome, not induced by alcohol and other psychoactive substances"
        'F05.1',  # "Delirium superimposed on dementia"

        'G31.01',  # "Pick disease"
        'G31.09',  # "Other frontotemporal neurocognitive disorder"
        'G31.1',   # "Senile degeneration of brain, not elsewhere classified"
    },
    'parkinson_dementias': {
        'F02.3',  # "Dementia in Parkinson disease"
        'G31.8',  # "Other specified degenerative diseases of nervous system"
                  #     Lewy body (disease) (dementia) (F02.8 *)
        # 'F02.8', # "Dementia in other diseases classified elsewhere"
    },
}

# Union of every group except plain Parkinson disease and MCI.
all_dementias_codes = [
    codes for label, codes in CODES.items() if label not in ('parkinson', 'mci')
]
CODES['all_dementias'] = reduce(set.union, all_dementias_codes)

# Same union with MCI added. 'all_dementias' is already a key at this point and
# is folded in as well, which does not change the resulting set.
all_dementias_codes = [
    codes for label, codes in CODES.items() if label != 'parkinson'
]
CODES['all_dementias+mci'] = reduce(set.union, all_dementias_codes)


# Disease labels, in the insertion order of CODES.
DISEASES_OF_INTEREST = [*CODES]

# Utils

### Imports

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

from typing import Dict, Set, List, Tuple
import datetime

from glob import glob
import pickle

from IPython.display import display

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# The original author flags the warning as a false positive here.
pd.options.mode.chained_assignment = None

In [None]:
# Type alias for a (DataFrame, {str: int}) pair.
# NOTE(review): not referenced by any visible cell -- presumably kept for
# annotations of helpers defined elsewhere; confirm before removing.
DataFrameStatsTuple = Tuple[pd.DataFrame, Dict[str, int]]

In [None]:
def print_nan_stats(df):
    """Show NaN counts and percentages for the first ten columns of `df`.

    A two-row table ('# NaN' and 'NaN (%)', one column per inspected column)
    is rendered through `display`, then the number of rows of `df` is printed
    as the patient count.
    """
    n_rows = len(df)

    # Only the first ten columns are summarised.
    counts = df.isna().sum().iloc[:10].to_frame('# NaN')
    percentages = (counts * 100 / n_rows).round(2).rename(columns={'# NaN': 'NaN (%)'})

    summary = pd.concat([counts, percentages], axis=1)
    display(summary.T)
    print(f'{n_rows:,} patients\n')

def count_diseases(df):
    """Print, per disease of interest, how many rows of `df` mention it.

    Every value of the 'diseases' column is iterated as a sequence of
    2-element items; a row counts for a disease when the first element of at
    least one of its items equals the disease label.
    """
    def mentions(entries, target):
        # Stop at the first matching item, like `any` does.
        for label, _ in entries:
            if label == target:
                return True
        return False

    for disease in DISEASES_OF_INTEREST:
        n_patients = df['diseases'].apply(mentions, args=(disease,)).sum()
        print(f'\t- {n_patients:,} patients having {disease}')

def processing_nans(df, to_censure, first_medication_col: int = 9)->pd.DataFrame:
    """Drop excluded patients and fill the remaining NaNs of a patient dataset.

    Steps, each reported on stdout:

    1. rows whose 'person_state_code' is NaN are removed;
    2. rows whose 'person_id' is a key of `to_censure` are removed, and the
       labels stored under those keys are tallied per disease of interest;
    3. NaN entries of the 'diseases' column are replaced by empty lists;
    4. NaNs in every column from position `first_medication_col` onwards are
       replaced by 0.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'person_state_code', 'person_id' and
        'diseases'.
    to_censure : dict
        Maps a person id to an iterable of disease labels.
    first_medication_col : int, default 9
        Position of the first column treated as a medication column
        (previously hard-coded to 9).

    Returns
    -------
    pd.DataFrame
        A new, filtered and filled frame; the input frame is left untouched.
    """
    print('* Original dataset:')
    print_nan_stats(df)

    print('* Processing NaNs:')
    # Excluding people that are either dead before the period of interest,
    # always 'Temporaire' (UK) or always inactive
    mask = df['person_state_code'].isna()
    print(f'\t> Excluding {mask.sum():,} patients due to being either dead before the \n\t'
         'period of interest or always Temporaire / Inactive.\n')
    df = df[~mask]

    # Excluding people listed in `to_censure`
    removed_person_ids = set(to_censure.keys()).intersection(df['person_id'])
    print(f"\t> Excluding {len(removed_person_ids):,} patients who had a disease before the end of\n\t"
         "the period of interest.\n")

    removed_diseases = [
        label for person_id in removed_person_ids for label in to_censure[person_id]
    ]
    for disease in DISEASES_OF_INTEREST:
         print(f"\t\t- {removed_diseases.count(disease):,} patients having {disease}")
    # Explicit copy: the assignments below must write into an independent
    # frame instead of relying on the chained-assignment warning being
    # silenced globally.
    df = df[~df['person_id'].isin(removed_person_ids)].copy()

    # We assume that patients that are not present in DIAGNOSTICS table are healthy
    mask = df['diseases'].isna()
    print(f"\n\t> Filling {mask.sum():,} patients' diseases column for patient having no\n\t"
          "diagnostics (assuming no neurodegenerative diseases).\n")
    # One fresh list per row (a shared list object would alias all rows).
    df.loc[mask, 'diseases'] = df.loc[mask, 'diseases'].apply(lambda x: [])

    # We assume that people having no prescriptions do not take the medications.
    # The reported count is the number of NaNs in the last column.
    print(f"\t> Filling {df.iloc[:,-1].isna().sum():,} patients' medications columns for patients having\n\t"
          "no prescriptions (assuming no medications were taken).\n")
    cols_medications = df.columns[first_medication_col:]
    if len(cols_medications):
        df[cols_medications] = df[cols_medications].fillna(0)

    print('* After processing:')
    print_nan_stats(df)

    print('Diseases:')
    count_diseases(df)

    return df

In [None]:
def save_dataset(df, to_censure:dict, inactive_ids:set, stats:dict, country, age, base_dir=OUTPUT_DIR, suffix=''):
    """Write a dataset and its companion objects to `base_dir/COUNTRY_AGE[suffix]`.

    Files created in the target directory:
    - dataset.csv       `df`, written without its index
    - to_censure.pkl    pickle of `to_censure`
    - inactive_ids.pkl  pickle of `inactive_ids`
    - stats.pkl         pickle of `stats`

    Parameters
    ----------
    df : pd.DataFrame
        Dataset to export.
    to_censure : dict
    inactive_ids : set
    stats : dict
        Objects pickled next to the dataset.
    country, age
        Used to build the sub-directory name f"{country}_{age}".
    base_dir : str
        Root output directory; created (with parents) when missing.
    suffix : str, default ''
        Optional text appended to the sub-directory name so several
        processing variants can coexist.

    Raises
    ------
    AssertionError
        If the target directory already exists and is not empty, so a
        previous export is never overwritten.
    """
    ## Manage folders
    sub_dir = os.path.join(base_dir, f"{country}_{age}{suffix}")
    # makedirs creates missing parents and tolerates an existing directory,
    # which removes the exists()/mkdir() race of the two-step version.
    os.makedirs(sub_dir, exist_ok=True)
    # Assert that the subdirectory is empty
    assert not os.listdir(sub_dir), f"Directory {sub_dir} is not empty."

    ## Save objects
    # dataset
    df.to_csv(os.path.join(sub_dir, "dataset.csv"), index=False)

    # companion objects, one pickle file each
    pickled_objects = {
        "to_censure.pkl": to_censure,
        "inactive_ids.pkl": inactive_ids,
        "stats.pkl": stats,
    }
    for filename, obj in pickled_objects.items():
        with open(os.path.join(sub_dir, filename), "wb") as f:
            pickle.dump(obj, f)

### Params

In [None]:
# Run parameters; every value below is forwarded to get_dataset by the cohort cells.
min_before = 1
min_during = 0
min_after = 1

start_year = 2008
end_year = 2010

# Number of most frequent medications kept.
n_most_frequent = 50

# Accepted (min, max) ranges -- value from paper
valid_height_range = (100, 250)
valid_weight_range = (30, 250)

extraction_date = '2023-01-01'

### UK_65

In [None]:
import sys

# Make the local `get_dataset` module importable. The membership test keeps
# sys.path free of duplicate entries when this cell is run more than once.
if 'Clean code' not in sys.path:
    sys.path.append('Clean code')
from get_dataset import get_dataset

# Build the UK / age-65 cohort with the shared notebook parameters.
dataset_UK_65, has_another_disease_before_UK_65, inactive_ids_UK_65, stats_UK_65 = get_dataset(
    country='UK', age=65,
    valid_height_range=valid_height_range, valid_weight_range=valid_weight_range,
    min_before=min_before, min_during=min_during, min_after=min_after,
    n_most_frequent=n_most_frequent,
    start_year=start_year, end_year=end_year,
    extraction_date=extraction_date, mapping_dir=MAPPING_DIR, ref_codes=CODES)

In [None]:
# Filter excluded patients and fill NaNs for the UK / age-65 cohort.
dataset_UK_65 = processing_nans(
    df=dataset_UK_65,
    to_censure=has_another_disease_before_UK_65,
)

In [None]:
# Persist the UK / age-65 cohort and its companion objects.
save_dataset(
    df=dataset_UK_65,
    to_censure=has_another_disease_before_UK_65,
    inactive_ids=inactive_ids_UK_65,
    stats=stats_UK_65,
    country='UK',
    age=65,
)

### UK_70

In [None]:
%%time
# Build the UK / age-70 cohort with the shared notebook parameters.
(
    dataset_UK_70,
    has_another_disease_before_UK_70,
    inactive_ids_UK_70,
    stats_UK_70,
) = get_dataset(
    country='UK',
    age=70,
    valid_height_range=valid_height_range,
    valid_weight_range=valid_weight_range,
    min_before=min_before,
    min_during=min_during,
    min_after=min_after,
    n_most_frequent=n_most_frequent,
    start_year=start_year,
    end_year=end_year,
    extraction_date=extraction_date,
    mapping_dir=MAPPING_DIR,
    ref_codes=CODES,
)

In [None]:
# Filter excluded patients and fill NaNs for the UK / age-70 cohort.
dataset_UK_70 = processing_nans(
    df=dataset_UK_70,
    to_censure=has_another_disease_before_UK_70,
)

In [None]:
# Persist the UK / age-70 cohort and its companion objects.
save_dataset(
    df=dataset_UK_70,
    to_censure=has_another_disease_before_UK_70,
    inactive_ids=inactive_ids_UK_70,
    stats=stats_UK_70,
    country='UK',
    age=70,
)

### FR_65

In [None]:
%%time
# Build the FR / age-65 cohort with the shared notebook parameters.
(
    dataset_FR_65,
    has_another_disease_before_FR_65,
    inactive_ids_FR_65,
    stats_FR_65,
) = get_dataset(
    country='FR',
    age=65,
    valid_height_range=valid_height_range,
    valid_weight_range=valid_weight_range,
    min_before=min_before,
    min_during=min_during,
    min_after=min_after,
    n_most_frequent=n_most_frequent,
    start_year=start_year,
    end_year=end_year,
    extraction_date=extraction_date,
    mapping_dir=MAPPING_DIR,
    ref_codes=CODES,
)

In [None]:
# Filter excluded patients and fill NaNs for the FR / age-65 cohort.
dataset_FR_65 = processing_nans(
    df=dataset_FR_65,
    to_censure=has_another_disease_before_FR_65,
)

In [None]:
# Persist the FR / age-65 cohort and its companion objects.
save_dataset(
    df=dataset_FR_65,
    to_censure=has_another_disease_before_FR_65,
    inactive_ids=inactive_ids_FR_65,
    stats=stats_FR_65,
    country='FR',
    age=65,
)

### FR_70

In [None]:
%%time
# Build the FR / age-70 cohort with the shared notebook parameters.
(
    dataset_FR_70,
    has_another_disease_before_FR_70,
    inactive_ids_FR_70,
    stats_FR_70,
) = get_dataset(
    country='FR',
    age=70,
    valid_height_range=valid_height_range,
    valid_weight_range=valid_weight_range,
    min_before=min_before,
    min_during=min_during,
    min_after=min_after,
    n_most_frequent=n_most_frequent,
    start_year=start_year,
    end_year=end_year,
    extraction_date=extraction_date,
    mapping_dir=MAPPING_DIR,
    ref_codes=CODES,
)

In [None]:
# Filter excluded patients and fill NaNs for the FR / age-70 cohort.
dataset_FR_70 = processing_nans(
    df=dataset_FR_70,
    to_censure=has_another_disease_before_FR_70,
)

In [None]:
# Persist the FR / age-70 cohort and its companion objects.
save_dataset(
    df=dataset_FR_70,
    to_censure=has_another_disease_before_FR_70,
    inactive_ids=inactive_ids_FR_70,
    stats=stats_FR_70,
    country='FR',
    age=70,
)

In [None]:
!ls {OUTPUT_DIR}

In [None]:
# Show the disease labels (the keys of CODES) as the cell output.
DISEASES_OF_INTEREST