In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local modules (presumably
# `data_cleaning` -- confirm) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd
from data_cleaning import (
    _clean_lipid_intensities,
    _clean_sample_description,
    clean_lipidomic_data,
)

In [3]:
# Absolute path of the current working directory (where the kernel runs).
THIS_DIR = Path().resolve()
# Two levels above the working directory -- presumably the project root.
ROOT = THIS_DIR.joinpath("..", "..").resolve()
# Input data and build-output locations, both relative to ROOT.
DATA_DIR = ROOT.joinpath("src", "data")
BLD_DATA = ROOT.joinpath("bld", "data")
# Create the build-output directory (and missing parents); no error if present.
BLD_DATA.mkdir(exist_ok=True, parents=True)

In [5]:
# Both raw lipidomics CSV files sit in the same sub-directory of DATA_DIR.
lipidomics_dir = DATA_DIR / "lipidomics"
lipid_intensities = pd.read_csv(lipidomics_dir / "lipid_intensities.csv")
# The sample description file is semicolon-separated.
sample_description = pd.read_csv(lipidomics_dir / "sample_description.csv", sep=";")
# Phenotypic data is read from the build directory.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
clean_phenotypic_data = pd.read_pickle(BLD_DATA.joinpath("clean_phenotypic_data.pkl"))

In [9]:
# Inspect the dtype of the "alc_5_drinks" column in the phenotypic data.
clean_phenotypic_data.loc[:, "alc_5_drinks"].dtype

Float32Dtype()

In [45]:
# Clean the raw sample description, then preview the first five rows.
clean_sample_description = sample_description.pipe(_clean_sample_description)
clean_sample_description.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,sex,age
Patient_ID,ind,Unnamed: 2_level_1,Unnamed: 3_level_1
gxyw282,88,F,42
qala449,35,F,51
cfci956,97,F,28
jhod839,11,F,55
nqem946,49,F,24


In [46]:
# List the distinct values present in the cleaned "sex" column.
sex_column = clean_sample_description["sex"]
sex_column.unique()

['F', 'M']
Categories (2, object): ['F', 'M']

In [47]:
# Preview the first five rows of the raw lipid intensity table.
lipid_intensities.head(n=5)

Unnamed: 0,originalMS#,gpeakpos75,gpeakpos366,gpeakpos430,gpeakpos641,gpeakpos654,gpeakpos721,gpeakpos1549,gpeakpos1999,gpeakpos2041,...,gpeakneg5948,gpeakneg5989,gpeakneg5990,gpeakneg6009,gpeakneg6017,gpeakneg6039,gpeakneg6041,gpeakneg6049,gpeakneg6179,gpeakneg6243
0,88,20.010132,23.737193,20.51544,19.896349,20.46376,18.106392,21.021372,19.813893,17.12801,...,20.226886,18.759335,17.025329,21.324577,22.273666,19.997418,17.379238,20.965021,19.558838,23.912203
1,35,20.970653,22.350294,21.841468,18.469919,21.225256,18.825162,19.852056,19.593483,17.1846,...,19.182669,20.071385,16.779131,21.727323,22.571511,20.219298,18.526277,20.914076,18.491121,22.872198
2,97,20.230461,22.168165,21.080258,17.555959,21.735137,20.556331,19.739518,18.459899,17.790985,...,20.026975,19.871664,18.378659,22.762373,22.965926,20.869206,18.382267,21.287581,19.392318,22.591559
3,11,20.3693,25.555585,21.372413,19.436745,20.993355,20.262328,21.049905,18.051902,17.532557,...,19.854519,19.044866,17.495501,22.429385,23.331532,20.304773,18.193191,21.373093,18.262589,23.165153
4,49,20.663592,23.165646,21.275192,19.593324,21.112361,20.037621,18.963551,18.606627,17.680726,...,20.515241,19.22211,16.045029,23.330734,23.590012,20.229432,17.964824,22.645571,19.073267,22.836274


In [51]:
# Clean the raw intensity table; the bare name on the last line displays it.
test_lipids = lipid_intensities.pipe(_clean_lipid_intensities)
test_lipids

Unnamed: 0_level_0,gpeakpos75,gpeakpos366,gpeakpos430,gpeakpos641,gpeakpos654,gpeakpos721,gpeakpos1549,gpeakpos1999,gpeakpos2041,gpeakpos2069,...,gpeakneg5948,gpeakneg5989,gpeakneg5990,gpeakneg6009,gpeakneg6017,gpeakneg6039,gpeakneg6041,gpeakneg6049,gpeakneg6179,gpeakneg6243
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
88,20.010132,23.737193,20.515440,19.896349,20.463760,18.106392,21.021372,19.813893,17.128010,20.501618,...,20.226886,18.759335,17.025329,21.324577,22.273666,19.997418,17.379238,20.965021,19.558838,23.912203
35,20.970653,22.350294,21.841468,18.469919,21.225256,18.825162,19.852056,19.593483,17.184600,21.336785,...,19.182669,20.071385,16.779131,21.727323,22.571511,20.219298,18.526277,20.914076,18.491121,22.872198
97,20.230461,22.168165,21.080258,17.555959,21.735137,20.556331,19.739518,18.459899,17.790985,20.651135,...,20.026975,19.871664,18.378659,22.762373,22.965926,20.869206,18.382267,21.287581,19.392318,22.591559
11,20.369300,25.555585,21.372413,19.436745,20.993355,20.262328,21.049905,18.051902,17.532557,19.728908,...,19.854519,19.044866,17.495501,22.429385,23.331532,20.304773,18.193191,21.373093,18.262589,23.165153
49,20.663592,23.165646,21.275192,19.593324,21.112361,20.037621,18.963551,18.606627,17.680726,21.126692,...,20.515241,19.222110,16.045029,23.330734,23.590012,20.229432,17.964824,22.645571,19.073267,22.836274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,20.688943,23.498907,21.653192,20.598327,21.622449,19.845434,20.417449,17.737270,16.117879,20.355606,...,19.574582,19.071444,17.574223,23.502094,23.840568,20.548831,17.948143,22.646340,18.600169,22.738894
989,20.720792,23.424420,21.566624,18.784941,21.387331,19.750504,20.638996,18.193023,17.706718,20.906125,...,19.573356,19.546061,17.609062,22.902518,23.336611,20.251979,17.857960,22.038750,18.009683,22.508738
998,20.990500,23.998636,21.926036,19.258090,21.886556,19.985877,20.186578,19.450299,18.452317,21.819396,...,20.108513,19.421757,17.910703,21.821249,22.994454,20.348566,18.138418,20.867264,19.650337,23.025880
980,21.099667,23.646367,21.763161,19.839383,20.600685,19.376180,20.618615,20.492938,18.282796,21.109731,...,20.685433,17.980344,17.355729,22.709611,23.803293,20.454478,17.801180,21.791065,17.999105,23.416695


In [52]:
# Inspect the dtype of one intensity column after cleaning.
gpeakpos75_column = test_lipids["gpeakpos75"]
gpeakpos75_column.dtype

dtype('float64')

In [57]:
# Run the combined cleaning entry point on both raw inputs and preview the
# first five rows of the result.
test_complete = clean_lipidomic_data(
    sample_description,
    lipid_intensities,
)
test_complete.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,sex,age,gpeakpos75,gpeakpos366,gpeakpos430,gpeakpos641,gpeakpos654,gpeakpos721,gpeakpos1549,gpeakpos1999,...,gpeakneg5948,gpeakneg5989,gpeakneg5990,gpeakneg6009,gpeakneg6017,gpeakneg6039,gpeakneg6041,gpeakneg6049,gpeakneg6179,gpeakneg6243
Patient_ID,ind,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
gxyw282,88,F,42,20.010132,23.737193,20.51544,19.896349,20.46376,18.106392,21.021372,19.813893,...,20.226886,18.759335,17.025329,21.324577,22.273666,19.997418,17.379238,20.965021,19.558838,23.912203
qala449,35,F,51,20.970653,22.350294,21.841468,18.469919,21.225256,18.825162,19.852056,19.593483,...,19.182669,20.071385,16.779131,21.727323,22.571511,20.219298,18.526277,20.914076,18.491121,22.872198
cfci956,97,F,28,20.230461,22.168165,21.080258,17.555959,21.735137,20.556331,19.739518,18.459899,...,20.026975,19.871664,18.378659,22.762373,22.965926,20.869206,18.382267,21.287581,19.392318,22.591559
jhod839,11,F,55,20.3693,25.555585,21.372413,19.436745,20.993355,20.262328,21.049905,18.051902,...,19.854519,19.044866,17.495501,22.429385,23.331532,20.304773,18.193191,21.373093,18.262589,23.165153
nqem946,49,F,24,20.663592,23.165646,21.275192,19.593324,21.112361,20.037621,18.963551,18.606627,...,20.515241,19.22211,16.045029,23.330734,23.590012,20.229432,17.964824,22.645571,19.073267,22.836274
