## 임상시험 데이터 확인
### 목적
임상시험 데이터 및 포맷, 스펙(스키마)를 확인하고 임상시험 데이터의 최종적 형태인 sas7bdat / xport를 다룬다.

### 준비
임상시험 데이터: cafe.naver.com/dmisimportant/104

### Libraries
sas7bdat / xport를 열기 위해 pandas의 read_sas 사용 (sas7bdat 패키지도 import하지만 아래 코드에서는 쓰지 않음)

In [1]:
import os
import numpy as np
import pandas as pd
import secrets
from sas7bdat import SAS7BDAT as sas7bdat

### Variables
CRF 포맷, LAB 데이터 포맷, 데이터셋 파일 경로 설정

In [2]:
# Separator token printed at the start of every status line.
ornament = "-" * 10
# Character pool from which random name suffixes are drawn.
hangul = "가나다라마바사아자차카파타하"
# Extension markers used to pick dataset files out of the dataset directory.
ext = (
    ".sas7bdat",
    ".xport",
)
# Locations of the CRF format workbook, the LAB format workbook and the
# directory that holds the dataset files.
path_crf = "C:/code/CUBEDEMO2017/CUBEDEMO_2017_dc_or_fmt.xlsx"
path_lab = "C:/code/CUBEDEMO2017/CUBEDEMO_2017_dc_or_lar.xlsx"
path_set = "C:/code/CUBEDEMO2017/SASSET/"

### 포맷 파일 로드
CRF 포맷 파일에서 데이터 도메인 확인

In [4]:
# Read the CRF and LAB format workbooks into DataFrames.
form_crf = pd.read_excel(path_crf)
form_lab = pd.read_excel(path_lab)
# Show the distinct DOMAIN values listed in the CRF format sheet.
print(
    ornament,
    "domain formats:",
    os.linesep,
    form_crf["DOMAIN"].unique(),
)

---------- domain formats: 
 ['AE' 'AY' 'CM' 'CT' 'CY' 'DM' 'DS' 'EF' 'EG' 'ES' 'IE' 'IP' 'LB' 'LC'
 'LY' 'MH' 'MY' 'PD' 'PG' 'RN' 'SU' 'SV' 'VS']


### 데이터셋 확인
데이터셋 경로의 sas7bdat / xport 파일을 os.DirEntry 오브젝트(os.scandir 결과)로 수집하고 불량 파일이 있는지 확인

In [5]:
# Collect the regular files in the dataset directory whose *name* ends with
# one of the configured extensions.  Matching on the name suffix (instead of
# a substring of the whole path) keeps out entries such as "AE.sas7bdat.bak"
# or files that merely live under a directory whose name contains an
# extension.  The `with` block closes the scandir iterator deterministically.
with os.scandir(path_set) as entries:
    sasobj = [
        obj for obj in entries
        if obj.name.lower().endswith(tuple(ext)) and obj.is_file()
    ]
# Files smaller than 3 bytes are treated as broken.
sasbad = [obj for obj in sasobj if obj.stat().st_size < 3]
# A single broken file is already a reason to stop (the previous `> 1`
# comparison let exactly one broken file slip through).
if sasbad:
    raise Exception("exotic file exists")

sas7bdat / xport os.DirEntry 오브젝트를 도메인 이름: 데이터프레임의 딕셔너리로 로드하고 어떤 도메인의 데이터셋이 있는지 확인
- 데이터셋 내 string이 bytes이므로 utf-8로 변환하고, 이 과정에서 누락되는 레코드가 있는지도 확인

In [7]:
def _decode(filepath):
    """Load one SAS dataset and decode its byte-string columns as UTF-8.

    Args:
        filepath: Path of the dataset file handed to ``pandas.read_sas``.

    Returns:
        The DataFrame with every object-dtype column decoded, or ``None``
        (after printing an error line) when the number of non-missing cells
        differs before and after decoding.
    """
    # pandas infers the reader from ".xpt" / ".sas7bdat" in the file name, so
    # a file named "*.xport" needs its format passed explicitly.
    lowered = str(filepath).lower()
    fmt = "xport" if lowered.endswith((".xport", ".xpt")) else None
    data = pd.read_sas(filepath, format=fmt)

    # Count non-missing *cells*.  `notna().value_counts().sum()` only counts
    # rows, which is identical before and after decoding and therefore could
    # never flag a lost value.
    nas = int(data.notna().sum().sum())

    bytecol = data.select_dtypes("object").columns
    for col in bytecol:
        # Elements that are not bytes come back as NaN from `.str.decode`,
        # which is exactly what the comparison below is meant to catch.
        data[col] = data[col].str.decode("utf-8")

    if nas == int(data.notna().sum().sum()):
        return data
    print(ornament, "error:", filepath)
    return None

# Key every decoded dataset by its upper-cased file stem.
data = dict(
    (os.path.splitext(obj.name)[0].upper(), _decode(obj.path))
    for obj in sasobj
)
print(
    ornament, "domain:", os.linesep,
    data.keys(), os.linesep,
    len(data), "domains",
)

---------- domain: 
 dict_keys(['AE', 'AY', 'CM', 'CT', 'CY', 'DA', 'DM', 'DS', 'DY', 'EF', 'EG', 'EN', 'ES', 'IE', 'IP', 'LB', 'LC', 'LY', 'MH', 'MY', 'PD', 'PG', 'RN', 'SN', 'SU', 'SV', 'VS']) 
 27 domains


### 도메인별 데이터셋 형태 확인
SN은 subject 이름, visit site를 포함함을 알 수 있음

In [45]:
# Take the SN dataset out of the domain dictionary and preview 10 random rows.
sn = data["SN"]
sn.sample(n=10)

Unnamed: 0,SUBJID,VISIT,SNNAME,SNDTC
25,S-US-003,5007.0,이상하하,2017-03-23
20,S-MJ-002,5007.0,김민자가,2017-06-20
22,S-MJ-010,3006.0,김민자가,2017-06-14
9,S-2Z-028,5007.0,공우카바,2017-02-20
17,S-4Z-001,3006.0,원유타가,2017-03-07
18,S-4Z-001,5007.0,원유타가,2017-03-07
10,S-2Z-029,3006.0,공우카바,2017-02-20
13,S-2Z-034,3006.0,송나라사,2017-02-20
7,S-2Z-017,5007.0,황태타사,2017-02-21
8,S-2Z-023,5007.0,송도파가,2017-02-20


#### SN: SNNAME 가리기
Subject 이름을 가림

In [44]:
def _aname(name, n=2, chars=hangul):
    """Return the first ``n`` characters of ``name`` plus ``n`` random ones.

    Args:
        name: Value to mask; it is sliced with ``name[:n]``.
        n: Number of leading characters kept and of random characters added.
        chars: Pool the random characters are drawn from with
            ``secrets.choice``.

    Returns:
        The kept prefix concatenated with the random suffix.
    """
    picked = []
    for _ in range(n):
        picked.append(secrets.choice(chars))
    suffix = "".join(picked)
    return name[:n] + suffix

# One alias per distinct subject name; missing names are skipped because
# `_aname` slices its argument and cannot handle NaN.
sn_snname_mapper = {
    name: _aname(name) for name in sn["SNNAME"].dropna().unique()
}
# Assign the masked column back onto the frame.  An in-place `replace` on the
# `sn.SNNAME` accessor is a chained operation: pandas warns about it and, with
# Copy-on-Write enabled, it no longer modifies `sn` at all.
sn["SNNAME"] = sn["SNNAME"].replace(sn_snname_mapper)
data["SN"].sample(10)

Unnamed: 0,SUBJID,VISIT,SNNAME,SNDTC
24,S-US-002,5007.0,이상하하,2017-03-23
21,S-MJ-009,5007.0,김민자가,2017-06-14
8,S-2Z-023,5007.0,송도파가,2017-02-20
10,S-2Z-029,3006.0,공우카바,2017-02-20
18,S-4Z-001,5007.0,원유타가,2017-03-07
7,S-2Z-017,5007.0,황태타사,2017-02-21
17,S-4Z-001,3006.0,원유타가,2017-03-07
5,S-2Z-016,5007.0,황태타사,2017-02-21
6,S-2Z-017,3006.0,황태타사,2017-02-21
15,S-3Z-013,3006.0,김민자가,2017-03-02


#### SN: SUBJID 추출
SN의 SUBJID를 각 도메인의 인덱스로 씀
- SN.SUBJID 개수를 subject 수로 가정함

In [19]:
# Distinct SUBJID values found in SN.
ix = sn["SUBJID"].unique()
print(ix)
print(ornament, "total subjects:", len(ix))

['S-1Z-031' 'S-1Z-034' 'S-2Z-006' 'S-2Z-012' 'S-2Z-013' 'S-2Z-016'
 'S-2Z-017' 'S-2Z-023' 'S-2Z-028' 'S-2Z-029' 'S-2Z-032' 'S-2Z-034'
 'S-2Z-035' 'S-3Z-013' 'S-3Z-048' 'S-4Z-001' 'S-MJ-001' 'S-MJ-002'
 'S-MJ-009' 'S-MJ-010' 'S-US-002' 'S-US-003' 'S-US-007']
---------- total subjects: 23


#### DM: 내용 확인
DM은 인구학적 정보를 포함하며, 왜인지 인덱스(SN.SUBJID)에 부합하지 않는 row가 많음
- 인구학적 정보는 시험 초기에 수집되므로 인덱스에 해당하지 않는 row가 있으면 안됨

In [24]:
# Take the DM dataset and show only the rows whose SUBJID is in `ix`.
dm = data["DM"]
dm.loc[dm["SUBJID"].isin(ix)]

Unnamed: 0,SUBJID,BRTHDTC,AGE,SEX,FERTILE
23,S-1Z-031,1984-09,32.0,2.0,1.0
26,S-1Z-034,1977-11,38.0,2.0,1.0
37,S-2Z-006,1975-04,41.0,2.0,2.0
43,S-2Z-012,1977-10,39.0,2.0,1.0
44,S-2Z-013,1991-07,25.0,1.0,
47,S-2Z-016,1987-02,29.0,1.0,
48,S-2Z-017,1983-03,33.0,1.0,
54,S-2Z-023,1998-04,17.0,1.0,
58,S-2Z-028,1989-04,27.0,2.0,2.0
59,S-2Z-029,1975-04,41.0,1.0,


#### AE: 내용 확인
AE는 시험 기간 중 subject별 adverse event(이상사례) 정보를 포함
- 컬럼(e.g. form)별 숫자 코딩된 categorical data가 있음
- 상기에 따라 MedDRA 코딩된 컬럼이 있음
- 숫자 코딩의 기준은 CRF Format에 정의되어 있음

In [52]:
ae = data["AE"]
# Random preview of the columns whose name starts with "AE".
print(ae[[col for col in ae.columns if col.startswith("AE")]].sample(10))
# Random preview of SUBJID / INV_PT for the rows whose SUBJID is in `ix`.
print(ae.loc[ae["SUBJID"].isin(ix), ["SUBJID", "INV_PT"]].sample(10))
# Random preview of the CRF format rows that belong to the AE domain.
form_crf.loc[form_crf["DOMAIN"] == "AE"].sample(10)

                          AETERM     AESTDTC  AETEAE  AEOUT     AEENDTC  \
63                  Pain burning  2016-06-06     1.0    3.0  2016-07-01   
61                          Pain  2016-04-25     1.0    2.0         NaN   
91   Hand and foot skin reaction  2017-02-08     2.0    3.0  2017-02-15   
48                        Queasy  2017-01-UK     2.0    3.0  2017-02-01   
180    Myelocyte count decreased  2016-05-19     1.0    6.0         NaN   
139                AST increased  2016-02-15     2.0    3.0  2016-02-20   
138                          NaN         NaN     NaN    NaN         NaN   
149                  Eye allergy  2017-02-02     2.0    5.0         NaN   
6                Angina pectoris  2016-06-15     1.0    NaN  2016-06-15   
187              Acute nephritis  2016-07-07     1.0    2.0         NaN   

     AESER  AESEV  AEREL  AEACN  AEACNOTH  
63     1.0    1.0    1.0    2.0       3.0  
61     3.0    2.0    2.0    2.0       2.0  
91     3.0    2.0    1.0    2.0       3.0 

Unnamed: 0,DOMAIN,FMTNAME,VARNAME,START,END,LABEL
26,AE,AESER,AESER,6,6,Significant disability
27,AE,AESER,AESER,7,7,Other medically important event
12,AE,AEOUT,AEOUT,2,2,Not recovered/Not resolved
13,AE,AEOUT,AEOUT,3,3,Recovered/Resolved
2,AE,AEACN,AEACN,3,3,Dose reduced
11,AE,AEOUT,AEOUT,1,1,Fatal
32,AE,AETEAE,AETEAE,2,2,No
0,AE,AEACN,AEACN,1,1,Dose increased
9,AE,AEACNOTH,AEACNOTH,3,3,Non-drug treatment
4,AE,AEACN,AEACN,5,5,Drug withdrawn


#### AE: AE Code Labeling
AE의 숫자 코딩된 categorical 컬럼 값이 CRF 상에서 무엇을 의미하는지 확인하고자 함
- DB Specification 또는 CRF Format에서 컬럼별 코드 및 라벨을 컬럼: 코드: 라벨의 딕셔너리로 추출
- 딕셔너리에 따라 컬럼 코드를 라벨링
- 일견 AESER rate가 상당히 높음

In [49]:
def _get_label(form,var="VARNAME",kvp=["END","LABEL"]):
    label={}
    for varname in form[var].unique():
        kv=form[form[var]==varname].loc[:,kvp].values
        label[varname]={k:v for k,v in kv}
    return label

# Build the variable -> {code: label} lookup from the whole CRF format table
# and print it.
ae_label = _get_label(form=form_crf)
print(ae_label)

{'AEACN': {1: 'Dose increased', 2: 'Dose not changed', 3: 'Dose reduced', 4: 'Drug interrupted', 5: 'Drug withdrawn', 6: 'Not applicable', 7: 'Unknown'}, 'AEACNOTH': {1: 'None', 2: 'Drug treatment', 3: 'Non-drug treatment', 4: 'Drug and non-drug treatment'}, 'AEOUT': {1: 'Fatal', 2: 'Not recovered/Not resolved', 3: 'Recovered/Resolved', 4: 'Recovered/Resolved with sequelae', 5: 'Recovering/Resolving', 6: 'Unknown'}, 'AEREL': {1: 'Not related', 2: 'Unlikely related', 3: 'Possibly related', 4: 'Related'}, 'AESER': {1: 'No', 2: 'Death', 3: 'Hospitalization', 4: 'Life threatening', 5: 'Congenital anomaly or birth defect', 6: 'Significant disability', 7: 'Other medically important event'}, 'AESEV': {1: 'Mild', 2: 'Moderate', 3: 'Severe'}, 'AETEAE': {1: 'Yes', 2: 'No'}, 'AEYN': {1: 'Yes', 2: 'No'}, 'CMINDC': {1: 'Medical history', 2: 'Adverse event', 3: 'Hypertension', 4: 'Prophylaxis', 5: 'Supplement', 6: 'Other'}, 'CMONGO': {1: 'Checked'}, 'CTYN': {1: 'Yes', 2: 'No'}, 'CMYN': {1: 'Yes', 2:

In [59]:
# Evaluate the AE slice of the CRF format once, instead of re-filtering the
# whole format table for every column of `ae`.
_ae_varnames = form_crf.loc[form_crf["DOMAIN"] == "AE", "VARNAME"].values
# AE columns that have a code definition in the CRF format.
ae_col_ambi = [col for col in ae if col in _ae_varnames]
# Show those columns with their numeric codes replaced by labels.
ae[ae_col_ambi].replace(ae_label)

Unnamed: 0,AETEAE,AEOUT,AESER,AESEV,AEREL,AEACN,AEACNOTH
0,Yes,Not recovered/Not resolved,Life threatening,Severe,Unlikely related,Dose reduced,Non-drug treatment
1,Yes,Unknown,Death,Mild,Related,Drug interrupted,
2,,Recovering/Resolving,,,,,
3,Yes,Fatal,Life threatening,Moderate,Unlikely related,Dose not changed,Drug treatment
4,Yes,Recovered/Resolved with sequelae,Hospitalization,Moderate,Possibly related,Dose reduced,Drug treatment
...,...,...,...,...,...,...,...
186,No,Unknown,No,Mild,Not related,Dose not changed,Drug treatment
187,Yes,Not recovered/Not resolved,Hospitalization,Mild,Unlikely related,Drug withdrawn,Drug treatment
188,Yes,Fatal,No,Mild,Unlikely related,Unknown,
189,No,Not recovered/Not resolved,Other medically important event,Mild,Possibly related,Drug withdrawn,Drug and non-drug treatment


#### LB: 내용 확인
LB는 랩 테스트 결과를 포함
- VISIT, SEQ 등 랩 테스트 회차별로 한 행씩 쌓이는 롱 포맷으로 되어 있으며, 인덱스를 정할 때 유의해야 함
- LB.LBTEST에 해당 랩 테스트 명목이 나와 있음
- LB.LBORRES에 랩 결과 수치가 나와 있음
- 일부 categorical data 포함 (e.g. Occult Blood)

In [62]:
# Take the LB dataset out of the domain dictionary and preview 10 random rows.
lb = data["LB"]
lb.sample(n=10)

Unnamed: 0,SUBJID,VISIT,SEQ,LBTEST,LBORRES,LBNOR,LBCLSIG
1645,S-2Z-029,7.0,5.0,Leukocytes,4.0,1.0,
2539,S-4Z-001,1.0,10.0,Bilirubin,9890.035,1.0,1.0
111,S-1Z-018,4.0,19.0,Protein[U],,1.0,
1189,S-2Z-014,1.0,15.0,Specific Gravity[U],2.0,2.0,1.0
847,S-2Z-001,5.0,19.0,Protein[U],,1.0,
2165,S-3Z-015,2.0,11.0,Hemoglobin A1C,6.0,1.0,
2294,S-3Z-030,1.0,18.0,Occult blood[U],,1.0,
991,S-2Z-008,2.0,18.0,Occult blood[U],,1.0,
632,S-1Z-033,1.0,18.0,Occult blood[U],,1.0,
736,S-1Z-033,7.0,13.0,"Glomerular Filtration Rate, Estimated [Automat...",101.48,,


In [66]:
# Per-LBTEST summary statistics of the LBORRES column.
lb_desc = (
    lb.groupby(["LBTEST"])["LBORRES"]
    .agg(["mean", "std", "min", "max", "sum"])
)
lb_desc

Unnamed: 0_level_0,mean,std,min,max,sum
LBTEST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alanine Aminotransferase,336.257126,1409.494132,0.3,9083.352,64225.111
Albumin,335.514705,1577.394172,1.0,9954.012,63747.794
Albumin [U],,,,,0.0
Aspartate Aminotransferase,325.924723,1415.747206,1.0,9930.498,62251.622
Bilirubin,215.007593,1165.893477,0.4,9890.035,40636.435
Creatinine,184.471609,1056.242446,0.377,9596.921,35418.549
Erythrocytes,245.25126,1130.807728,1.0,7803.056,47088.242
"Glomerular Filtration Rate, Estimated [Automatical",5003.627111,3658.624184,782.108,9718.77,45032.644
"Glomerular Filtration Rate, Estimated [Automatically calculated]",82.635866,40.42601,0.08,194.76,14791.82
"Glomerular Filtration Rate, Estimated(Cockcroft-Gault)",37.18,24.580435,7.58,66.11,148.72


#### LB: Mockup Data 만들기
Subject Index와 LB를 이용해서 목업 데이터를 만듦
- SN.SUBJID를 기준으로 피험자당 10번의 랩 테스트를 가정함
- 편의상 DM 데이터를 결합
- 연습용 데이터 특성상 생성된 데이터는 현실성이 없음

In [69]:
def _gen_mockup_value(desc, count):
    """Draw ``count`` normally distributed values for every row of ``desc``.

    Args:
        desc: DataFrame with ``"mean"`` and ``"std"`` columns; each index
            label becomes one key of the result.
        count: Number of samples drawn per index label.

    Returns:
        Dictionary mapping each index label of ``desc`` to the array returned
        by ``np.random.normal(mean, std, count)`` for that row.
    """
    print(ornament, "generating values")
    values = {}
    for label in desc.index:
        center = desc.loc[label]["mean"]
        spread = desc.loc[label]["std"]
        values[label] = np.random.normal(center, spread, count)
    return values

def _gen_mockup(desc, ix, count=10):
    """Build a mock-up table with ``count`` rows for every entry of ``ix``.

    Args:
        desc: Statistics table forwarded to ``_gen_mockup_value``.
        ix: Sequence of identifiers used as the ``SUBJID`` index level.
        count: Number of ``VISIT`` values (1..count) generated per identifier.

    Returns:
        DataFrame indexed by the SUBJID x VISIT product, filled with the
        generated values, with the index moved back into ordinary columns.
    """
    values = _gen_mockup_value(desc, count=len(ix) * count)
    visits = list(range(1, count + 1))
    index = pd.MultiIndex.from_product(
        [ix, visits], names=["SUBJID", "VISIT"]
    )
    frame = pd.DataFrame(values, index=index)
    return frame.reset_index()

# Generate the mock-up from the statistics rows without missing values and
# attach the DM columns to each row by SUBJID.
_gen_mockup(lb_desc.dropna(), ix).merge(dm, how="inner", on="SUBJID")

---------- generating values


Unnamed: 0,SUBJID,VISIT,Alanine Aminotransferase,Albumin,Aspartate Aminotransferase,Bilirubin,Creatinine,Erythrocytes,"Glomerular Filtration Rate, Estimated [Automatical","Glomerular Filtration Rate, Estimated [Automatically calculated]",...,Hemoglobin A1C,Leukocytes,Platelets,Protein,Specific Gravity[U],pH[U],BRTHDTC,AGE,SEX,FERTILE
0,S-1Z-031,1,2634.906749,533.524841,-807.963816,581.388932,600.952135,307.588939,5258.337568,116.928805,...,-64.347504,566.909566,-912.762652,-1998.905348,819.406213,796.693771,1984-09,32.0,2.0,1.0
1,S-1Z-031,2,585.964940,589.470691,1288.162267,-708.573748,331.210101,-390.085265,2817.400402,93.646661,...,-976.113950,2166.951034,13.138443,-872.764852,-897.257543,1306.041228,1984-09,32.0,2.0,1.0
2,S-1Z-031,3,1978.941811,1763.981715,990.436783,-1859.405020,-76.395511,1773.971259,4437.979050,59.455479,...,-1656.555055,-213.669048,-121.663959,1296.157977,278.837654,791.844183,1984-09,32.0,2.0,1.0
3,S-1Z-031,4,113.117333,1442.474993,701.951166,13.758323,1483.851713,1795.280339,5701.201243,80.334033,...,-1144.280670,2162.004948,1396.519662,-73.279230,-1448.939978,2278.581153,1984-09,32.0,2.0,1.0
4,S-1Z-031,5,2964.423192,12.258556,348.837296,626.683202,1045.188406,-278.564498,8430.552700,72.918841,...,219.762879,1218.186209,-52.471319,-1590.563299,834.667785,-730.636529,1984-09,32.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,S-US-007,6,1085.833640,540.821357,1845.934935,337.118280,1215.720402,898.997978,5827.504782,63.854711,...,-848.287788,-479.607242,-986.737467,552.246911,-446.053852,-312.836843,1981-01,36.0,1.0,
226,S-US-007,7,1670.176291,-1610.486892,1424.355967,1961.262145,-832.380560,-1630.307754,6611.334649,92.701876,...,-458.145699,2449.700574,-1162.663616,-1975.669533,1073.546004,-931.718494,1981-01,36.0,1.0,
227,S-US-007,8,-1017.705167,-407.098224,-1696.560014,930.426466,253.735864,1975.833521,2692.734394,109.756303,...,249.038567,-1175.578979,-422.345580,-1681.781566,28.118825,2115.800956,1981-01,36.0,1.0,
228,S-US-007,9,934.931086,-2538.936570,1237.135469,1707.916614,1340.341660,1492.285400,6094.005395,93.938583,...,1674.741847,585.353918,554.552963,717.357028,-59.794714,-153.469840,1981-01,36.0,1.0,


### 후기
- 하나의 임상시험을 구성하는 데이터셋은 많으며, 데이터셋 간 연동을 위한 정의가 일목요연하게 되어 있음
    - 수치형, 범주형 값이 혼재함
    - 일반적이지 않은 범주형 값 (0,1 No,Yes(False,True)가 아닌 1,2 Yes,No)
    - 데이터셋 복잡도의 중요한 요인은 프로토콜의 site visit number로 생각됨
- 분석에 이용되는 실질적 데이터뿐만 아니라 엔드유저에게 해당 데이터가 어떻게 보이고 수집됐는지를 포함
- 프로토콜 이해를 선행하지 않고서는 정확하고 효율적으로 데이터셋을 다루기 어려움
- 도메인 데이터와 DB Specification 내 각 도메인 정의를 연동할 수 있는 클래스 이용 필요