In [1]:
import pandas as pd
from sqlalchemy import create_engine
import json

with open("/home/jovyan/work/.credentials.json", "r") as credential:
    credential = json.load(credential)
    url = credential["url"]


def concat_drug_concept_id(group):
    """return joined rows"""
    group_list = list(map(str, group))
    
    if len(group_list) > 1:
        return f'({", ".join(group_list)})>'
    return f'{group_list[0]}>'


def get_pattern(row):
    """remove consecutive drug_concept_id and return final pattern"""
    pattern_list = row.split(">")
    pattern = pattern_list[0].strip()
    id_before = pattern_list[0].strip()
    
    for drug_concept_id in pattern_list[1:-1]:
        drug_concept_id = drug_concept_id.strip()
        if id_before != drug_concept_id:
            pattern += f' -> {drug_concept_id}'
        id_before = drug_concept_id
    
    return pattern

In [2]:
# DB 연결
engine = create_engine(url, connect_args={'options': '-csearch_path={}'.format('de')})
df_condition = pd.read_sql_table(
    "condition_occurrence",
    con=engine
)

df_drug = pd.read_sql_table(
    "drug_exposure",
    con=engine
)

In [3]:
# 제 2형 당뇨병 환자들의 person_id 가져오기
diabetes2_concept_id = [3191208, 36684827, 3194332, 3193274, 43531010,
                        4130162, 45766052, 45757474, 4099651, 4129519,
                        4063043, 4230254, 4193704, 4304377, 201826,
                        3194082, 3192767]

diabetes2 = df_condition[df_condition["condition_concept_id"].isin(diabetes2_concept_id)]["person_id"]

In [4]:
# digoxin, simvastatin, clopidogrel, naproxen을 처방 받은 경우만 가져오기
drug_interest_concept_id = [19018935, 1539411, 1539463, 19075601, 1115171]
df_dscn = df_drug.loc[df_drug["drug_concept_id"].isin(drug_interest_concept_id)]

In [5]:
# 제 2형 당뇨병을 진단받은 환자들의 기록만 가져오기
df_dscn_diabetes2 = df_dscn.loc[df_dscn["person_id"].isin(diabetes2)]

In [6]:
# 처방일이 같지만 종료일이 다른 경우 제외(같은 약 처방 중복되므로)
df_dscn_diabetes2_rm_duplicates = df_dscn_diabetes2[["person_id", "drug_exposure_start_date", "drug_concept_id"]].drop_duplicates()

In [7]:
# 날짜 별 처방 가져오기
df_pattern = df_dscn_diabetes2_rm_duplicates \
    .groupby(["person_id", "drug_exposure_start_date"]) \
    .drug_concept_id \
    .apply(concat_drug_concept_id) \
    .reset_index()

In [8]:
# 사람별로 처방받은 모든 drug_concept_id 합치기
df_pattern_seq = df_pattern \
    .groupby("person_id") \
    .drug_concept_id \
    .apply(" ".join) \
    .reset_index()

In [9]:
# 연속된 처방 지우기
df_pattern_seq["pattern"] = df_pattern_seq.apply(lambda row: get_pattern(row["drug_concept_id"]), axis=1)

In [10]:
# 제2형 당뇨병을 진단받은 환자들의 digoxin, simvastatin, clodpidogrel, naproxen 처방 패턴에 대한 aggregation
df_pattern_seq \
    .groupby("pattern") \
    .size() \
    .to_frame("person_count") \
    .reset_index() \
    .sort_values("person_count", ascending=False) \
    .reset_index(drop=True)

Unnamed: 0,pattern,person_count
0,1115171,10
1,1539463 -> 1115171 -> 1539463,4
2,1539463,2
3,"(1539411, 19075601)",1
4,"(1539411, 19075601) -> 1539463 -> (1539411, 19...",1
5,"(19075601, 1539411)",1
6,"1115171 -> (19075601, 1539411) -> 1539463 -> (...",1
7,"1115171 -> 19018935 -> (19018935, 1539411)",1
8,1115171 -> 19018935 -> 1539463,1
9,"1539411 -> (1539411, 19075601)",1
