In [1]:
from pathlib import Path
from typing import Tuple, List
from collections import defaultdict

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

import seaborn as sns

# TODO
- [X] Drop X and Y chromosomes in gene cols F: XX, H: XY
- [ ] Remove observations with low PFS

# Data description

## Metadata
1 ligne par fichier, des fois 2 ; dans le cas où il y a 2 lignes prendre la ligne où le CPA est le plus élevé (correspond à une seconde analyse où plus de séquences d'ADN ont été analysées)

Les 3 colonnes CPA ont été normalisées par la fonction racine carrée (afin de limiter au maximum l'impact des outliers lors des analyses). Les valeurs CPA que l'on voit sont donc les racines carrées des valeurs initiales : élever ces colonnes au carré permet de retrouver les valeurs initiales, non normalisées.

* CPA : niveau d'altération, c.-à-d. nombre d'altérations détectées par rapport à une baseline. Estimation de la quantité d'ADN de la tumeur qui se trouve dans le sang. C'est une valeur, pas un pourcentage.
* nb.reads : nombre de séquences scannées (valeur qui permet de normaliser). Dans un premier temps on s'en fiche ; si elle est très basse, on fait moins confiance à la mesure.
* sexe
* age
* htum : grosseur de la tumeur au diagnostic (ie. le jour où on fait la radio)
* relapse : 1 rechute, 0 pas rechute
* etat2 : statut du patient (0 vivant, 1 décédé)
* id.x : id patient
* time_point : 3 valeurs possibles (DIAG, AVANT_CHIR, FIN_TT)
* time_OS : durée de suivi totale
* time_PFS : nombre de jours entre le diagnostic et la rechute
* rep_histo : reponse histologique. Bon ou mauvais répondeur. On l'a après le 1er traitement
* puberte : + parlant que l'âge
* meta_bis : présence de métastases au diagnostic
* time_EFS : time_PFS en cas de rechute, sinon time_OS


## Data

En fonction des patients on n'a pas tous les time points
* Diagnostic
* Avant chirurgie (déjà un traitement)
* Fin T : fin traitement

Chaque chromosome est découpé en fenêtres => mêmes fenêtres pour tous les patients
Indicateur de présence d'ADN tumoral (0 : 2 copies, > 0 : copie(s) en trop, < 0 : perte)

# Preprocess data

## Load files

In [2]:
# Folder where the processed tables are written (and re-read at the end of the notebook).
OUTPUT_DIR = Path('..', 'data', 'intermediate')

# Raw inputs: the folder of per-file results and the two metadata exports.
# `metadata_path_old` is not referenced by the cells visible in this notebook.
data_dir = Path('..', 'data', 'raw', 'results')
metadata_path = Path('..', 'data', 'raw', 'metadata_DMLHT_ctDNA.csv')
metadata_path_old = Path('..', 'data', 'raw', 'metadata_OS2006_ctdna.csv')

In [3]:
# The first physical line of the file is skipped here: the next cell reads that line
# separately and uses it as the per-column descriptions. The second line then provides
# the column names of the tab-separated table.
metadata = pd.read_csv(metadata_path, sep='\t', skiprows=1)

In [4]:
# Build a small lookup table: column name -> human readable description.
# The first line of the metadata file carries one tab-separated description per column.
# It is read with plain Python from `metadata_path` (instead of a `!head` shell call on a
# hard-coded path) so the cell works on any OS and always follows the configured path.
# Assumes the file is UTF-8, which is also what pd.read_csv uses by default.
with open(metadata_path, encoding='utf-8') as metadata_file:
    description_line = metadata_file.readline().rstrip('\r\n')

col_desc = pd.concat(
    [pd.Series(metadata.columns), pd.Series(description_line.split('\t'))],
    axis=1,
)
col_desc.columns = ['col_name', 'desc']
col_desc

Unnamed: 0,col_name,desc
0,sex,Genre du patient
1,age,Age du patient (années)
2,pub,Stade pubertaire
3,htum,hauteur de la tumeur (mm)
4,meta_bis,Présence (ou non) de metastase au diagnostic
5,chimgr,Type de chimio
6,rep_histo,réponse histologique : bonne (GR) ou mauvaise ...
7,relapse,"Rechute (1=oui, 0= non)"
8,etat2,Statut vital (1 =décédé)
9,time_OS,Survie globale jusqu’à la dernière date de sui...


In [5]:
def get_infos_from_filename(filename: str) -> Tuple[str, str]:
    """Extract the patient id and the time point encoded in a file name.

    Everything after the first '.' is discarded and the remainder is split on '_'.
    The first two tokens form the patient id. The time point is the third token,
    or the third and fourth tokens joined by '_' when there are exactly four tokens.

    Args:
        filename: bare file name, e.g. ``'OS2006_34_AVANT_CHIR.txt'``.

    Returns:
        ``(id_patient, time_point)``.

    Raises:
        IndexError: if the name holds fewer than three '_'-separated tokens.
    """
    stem, *_ = filename.split('.')
    tokens = stem.split('_')
    id_patient = '_'.join(tokens[:2])
    if len(tokens) == 4:
        # Two-word time points such as AVANT_CHIR or FIN_TT.
        time_point = '_'.join(tokens[2:4])
    else:
        time_point = tokens[2]
    return id_patient, time_point


def load_all_data_patient(data_dir: Path) -> defaultdict:
    """Load every '*.txt' file found directly in ``data_dir`` into a nested mapping.

    Args:
        data_dir: directory containing the tab-separated result files.

    Returns:
        A ``defaultdict(dict)`` where ``result[id_patient][time_point]`` is the
        DataFrame read from the matching file. The patient id and time point are
        derived from the file name by ``get_infos_from_filename``.
    """
    data_patient = defaultdict(dict)

    # sorted(): glob order depends on the file system; sorting keeps the insertion
    # order (and therefore the downstream row order) reproducible across machines.
    for p in sorted(data_dir.glob('*.txt')):
        if not p.is_file():
            continue
        # p.name is the last path component on every OS; splitting str(p) on '/'
        # returned the whole path on Windows.
        id_patient, time_point = get_infos_from_filename(p.name)
        data_patient[id_patient][time_point] = pd.read_csv(p, sep='\t')

    return data_patient

In [6]:
# Load every result file once. Keys are patient ids; each value maps a time point
# to the DataFrame read from the corresponding file.
data_patient = load_all_data_patient(data_dir)
print(f'There are {len(data_patient)} patient')

There are 182 patient


## Organize data

In [7]:
# Keep only the patients for which exactly three time points were loaded.
patient_all_time_points = {
    patient_id: time_points
    for patient_id, time_points in data_patient.items()
    if len(time_points) == 3
}
print(f'There are {len(patient_all_time_points)} patient with data for the 3 time points')

There are 115 patient with data for the 3 time points


In [8]:
# Patients having a DIAG sample, reduced to that single time point.
patient_diag_time_point = {
    patient_id: {'DIAG': time_points['DIAG']}
    for patient_id, time_points in data_patient.items()
    if 'DIAG' in time_points
}
print(f'There are {len(patient_diag_time_point)} patient with data for DIAG time point')

There are 177 patient with data for DIAG time point


In [9]:
# Patients having an AVANT_CHIR sample, reduced to that single time point.
patient_chir_time_point = {
    patient_id: {'AVANT_CHIR': time_points['AVANT_CHIR']}
    for patient_id, time_points in data_patient.items()
    if 'AVANT_CHIR' in time_points
}
print(f'There are {len(patient_chir_time_point)} patient with data for AVANT_CHIR time point')

There are 143 patient with data for AVANT_CHIR time point


In [10]:
# Patients having a FIN_TT sample, reduced to that single time point.
patient_end_time_point = {
    patient_id: {'FIN_TT': time_points['FIN_TT']}
    for patient_id, time_points in data_patient.items()
    if 'FIN_TT' in time_points
}
print(f'There are {len(patient_end_time_point)} patient with data for FIN_TT time point')

There are 143 patient with data for FIN_TT time point


In [11]:
## Sanity check: the number of loaded files must equal the sum of the sizes of the
## three per-time-point subsets (i.e. no file carries an unexpected time point label).
nb_files = sum(len(time_points) for time_points in data_patient.values())
assert nb_files == (
    len(patient_diag_time_point)
    + len(patient_chir_time_point)
    + len(patient_end_time_point)
)

## Transform data
### Transpose data

In [12]:
def transpose_data_one_patient(data: pd.DataFrame) -> pd.DataFrame:
    """Flatten the per-window scores of one file into a single-row frame.

    Args:
        data: frame holding at least the columns 'id', 'ratio' and 'zscore'.
            The 'id' values are concatenated with '_' and the score name, so
            they are expected to be strings.

    Returns:
        A one-row DataFrame (row label 'value') with one column per
        (id, score) pair, named '<id>_ratio' and '<id>_zscore', in the row
        order of ``data``. The input frame is left untouched.
    """
    scores = data[['id', 'ratio', 'zscore']].set_index('id')
    # Long format: one line per (id, score label) pair.
    long_format = scores.stack().reset_index()
    long_format.columns = ['id', 'score_label', 'value']
    long_format['col_name'] = long_format['id'] + '_' + long_format['score_label']
    # Back to wide format: a single row whose columns are the combined names.
    single_row = long_format.set_index('col_name')['value'].to_frame().T
    return single_row


def format_all_data_as_df(patient_data: dict) -> pd.DataFrame:
    """Stack the data of every (patient, time point) pair into one wide frame.

    Args:
        patient_data: mapping ``patient_id -> {time_point -> DataFrame}``.
            Plain dicts and defaultdicts are both accepted. Each DataFrame is
            flattened with ``transpose_data_one_patient``.

    Returns:
        A DataFrame with one row per (patient, time point): the columns
        'patient_id' and 'time_point' followed by the flattened score columns.
        When ``patient_data`` holds no time point at all, an empty frame with
        only the two identifier columns is returned.
    """
    transposed_rows = []
    row_keys = []

    # Distinct names for the two nesting levels; the inner loop used to rebind
    # the outer loop variable.
    for patient_id, time_point_data in patient_data.items():
        for time_point, window_scores in time_point_data.items():
            transposed_rows.append(transpose_data_one_patient(window_scores))
            row_keys.append((patient_id, time_point))

    row_keys_df = pd.DataFrame(row_keys, columns=['patient_id', 'time_point'])
    if not transposed_rows:
        # pd.concat raises ValueError on an empty list of frames.
        return row_keys_df

    transposed_df = pd.concat(transposed_rows, axis=0).reset_index(drop=True)
    return pd.concat([row_keys_df, transposed_df], axis=1)


def pivot_data(patient_all_time_points_df: pd.DataFrame) -> pd.DataFrame:
    """Reshape to one row per patient with one column per (score column, time point).

    Args:
        patient_all_time_points_df: frame with 'patient_id' and 'time_point'
            columns plus the value columns to spread.

    Returns:
        A frame with a 'patient_id' column followed by columns named
        '<value column>_<time_point>'.
    """
    wide = patient_all_time_points_df.pivot(index=['patient_id'], columns=['time_point'])
    # Flatten the two-level (value column, time point) header into plain strings.
    flat_names = []
    for levels in wide.columns.values:
        flat_names.append('_'.join(levels).strip())
    wide.columns = flat_names
    return wide.reset_index()

In [13]:
# One wide frame per single time point (one row per patient having that time point).
# NOTE(review): these three frames are not referenced by any later cell visible in this
# notebook, and the X/Y column removal below is only applied to
# patient_all_time_points_df — confirm whether they are still needed.
patient_diag_time_point_df = format_all_data_as_df(patient_diag_time_point)
patient_chir_time_point_df = format_all_data_as_df(patient_chir_time_point)
patient_end_time_point_df = format_all_data_as_df(patient_end_time_point)

In [14]:
# Wide frame restricted to patients having three time points: one row per (patient, time point).
patient_all_time_points_df = format_all_data_as_df(patient_all_time_points)

In [15]:
# Remove the X and Y chromosomes: every column whose name starts with 'X' or 'Y' is
# dropped. 'patient_id' and 'time_point' do not match and are therefore kept.
cols_to_keep = [
    col for col in patient_all_time_points_df.columns
    if not col.startswith(('X', 'Y'))
]

# The bare expression that used to sit here was computed but never displayed or used.
# This check guards the identifier columns that the pivot step relies on.
assert {'patient_id', 'time_point'} <= set(cols_to_keep), "identifier columns were dropped"

patient_all_time_points_df = patient_all_time_points_df[cols_to_keep]

In [16]:
# One row per patient: every score column is duplicated per time point ('<column>_<time_point>').
patient_all_time_points_pivoted = pivot_data(patient_all_time_points_df)

## Clean metadata

In [17]:
# Harmonise column names with the data frames ('Code' is the patient identifier).
COL_MAPPER = {'Code': 'patient_id', 'EFS': 'time_EFS'}

# When a patient has several metadata rows, keep the one with the highest
# 'cpa_diagnostic': rows are sorted in descending order and the first one is kept.
metadata_df = (
    metadata
    .rename(columns=COL_MAPPER)
    .sort_values('cpa_diagnostic', ascending=False)
    .drop_duplicates(['patient_id'], keep='first')
)

In [18]:
# Preview of the cleaned metadata (rows appear sorted by decreasing cpa_diagnostic).
metadata_df.head()

Unnamed: 0,sex,age,pub,htum,meta_bis,chimgr,rep_histo,relapse,etat2,time_OS,time_PFS,cpa_finTT,cpa_diagnostic,cpa_chirurgie,time_EFS,patient_id
33,Feminin,20.55,0.0,11.6,1,MTX,GR,0,0,2628.0,,0.378365,1.620802,,2628.0,OS2006_34
93,Feminin,12.91,1.0,9.3,0,MTX,GR,1,0,1786.0,1412.0,0.407198,1.255325,0.375167,1412.0,OS2006_424
10,Masculin,4.71,1.0,10.2,0,MTX,,1,1,261.0,59.0,,1.180568,,59.0,OS2006_6
157,Feminin,15.62,0.0,15.0,1,MTX,,1,0,1100.0,63.0,0.42417,1.172536,0.405512,63.0,OS2006_543
70,Feminin,12.56,0.0,11.4,1,MTX,,1,1,258.0,61.0,,1.160646,0.441871,61.0,OS2006_357


# join data and metadata

## one row per patient, all time points

In [19]:
# Outer join: keeps metadata rows without data and data rows without metadata.
merged = metadata_df.merge(patient_all_time_points_pivoted, how='outer', on='patient_id')

In [20]:
# Share of merged rows with no value in one DIAG ratio column. After the outer join,
# a null here presumably means the patient has metadata but no pivoted data — verify.
print(f"proportion of nulls: {merged['1:6000001-7000000_ratio_DIAG'].isna().sum()/len(merged)}")

proportion of nulls: 0.36813186813186816


In [21]:
# Each patient must appear on exactly one row of the merged table.
assert merged['patient_id'].is_unique, "there are duplicated values"

## per time point

In [22]:
def merge_data_and_metadata(
        metadata_df: pd.DataFrame,
        data: pd.DataFrame,
        to_drop: List[str],
        time_points: List[str],
        ) -> pd.DataFrame:
    """Join the metadata with the data of the selected time points.

    Args:
        metadata_df: one row per patient, with a 'patient_id' column.
        data: one row per (patient, time point), with 'patient_id' and
            'time_point' columns.
        to_drop: column names removed from both frames when present
            (missing names are ignored).
        time_points: values of 'time_point' to keep before pivoting.

    Returns:
        The inner join on 'patient_id' of the trimmed metadata and the pivoted
        data (one row per patient). Neither input frame is modified.
    """
    selected_rows = data.drop(columns=to_drop, errors="ignore")
    selected_rows = selected_rows[selected_rows['time_point'].isin(time_points)]
    pivoted = pivot_data(selected_rows)
    trimmed_metadata = metadata_df.drop(columns=to_drop, errors="ignore")
    return trimmed_metadata.merge(pivoted, how='inner', on='patient_id')

In [23]:
# Diagnosis-only dataset: metadata joined with the DIAG rows only.
# NOTE(review): the data passed in is patient_all_time_points_df, i.e. only patients with
# three time points; patient_diag_time_point_df (built earlier) is not used here —
# confirm that this restriction is intended.
TO_DROP = ['cpa_chirurgie', 'cpa_finTT', 'rep_histo']    # dropped to avoid data leakage (presumably only known after diagnosis — confirm)
TIME_POINTS = ['DIAG']    # assumes 'AVANT_CHIR' samples are taken after diagnosis

merged_diag = merge_data_and_metadata(
    metadata_df,
    patient_all_time_points_df,
    to_drop=TO_DROP,
    time_points=TIME_POINTS)

In [24]:
# Diagnosis + pre-surgery dataset: metadata joined with the DIAG and AVANT_CHIR rows.
# TO_DROP and TIME_POINTS are re-bound here; they override the values of the previous cell.
TO_DROP = ['cpa_finTT']    # dropped to avoid data leakage
# NOTE(review): the original comment read "'avant_chir' is after chir", but the label
# means "before surgery"; presumably the intent is "after DIAG" — confirm.
TIME_POINTS = ['DIAG', 'AVANT_CHIR']

merged_chir = merge_data_and_metadata(
    metadata_df,
    patient_all_time_points_df,
    to_drop=TO_DROP,
    time_points=TIME_POINTS)

# Save

In [25]:
# Make sure the target folder exists: DataFrame.to_csv does not create missing directories.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# merged
merged.to_csv(OUTPUT_DIR / 'data_metadata.csv', index=False)    # all time points
merged_diag.to_csv(OUTPUT_DIR / 'data_metadata_diag.csv', index=False)    # diag data only
merged_chir.to_csv(OUTPUT_DIR / 'data_metadata_chir.csv', index=False)    # diag + chir data

# data
patient_all_time_points_pivoted.to_csv(OUTPUT_DIR / 'data_all_time_point_pivoted.csv', index=False)    # one row per patient
patient_all_time_points_df.to_csv(OUTPUT_DIR / 'data_data_all_time_point.csv', index=False)    # one row per patient per time point

# metadata
metadata_df.to_csv(OUTPUT_DIR / "metadata.csv", index=False)

# Load processed data

In [26]:
# OUTPUT_DIR is defined again here, presumably so this section can be run on its own
# (after the imports) without re-running the preprocessing above.
# Every name below is re-bound to the content of the CSV written by the "Save" section,
# replacing any in-memory frame of the same name.
OUTPUT_DIR = Path('..') / 'data' / 'intermediate'

# merged
merged = pd.read_csv(OUTPUT_DIR / 'data_metadata.csv')    # all time points
merged_diag = pd.read_csv(OUTPUT_DIR / 'data_metadata_diag.csv')    # diag data only
merged_chir = pd.read_csv(OUTPUT_DIR / 'data_metadata_chir.csv')    # diag + chir data

# data
patient_all_time_points_pivoted = pd.read_csv(OUTPUT_DIR / 'data_all_time_point_pivoted.csv')    # one row per patient
patient_all_time_points_df = pd.read_csv(OUTPUT_DIR / 'data_data_all_time_point.csv')    # one row per patient per time point

# metadata
metadata_df = pd.read_csv(OUTPUT_DIR / "metadata.csv")