In [1]:
import pandas as pd
from sqlalchemy import create_engine
from datetime import timedelta
import json

with open("/home/jovyan/work/.credentials.json", "r") as credential:
    credential = json.load(credential)
    url = credential["url"]

In [2]:
# DB 연결
engine = create_engine(url, connect_args={'options': '-csearch_path={}'.format('de')})
df = pd.read_sql_table(
    "drug_exposure",
    con=engine
)

In [3]:
# 환자 번호  정보 가져오기
df_interest = df.loc[df.person_id == 1891866]

In [4]:
# 약 종류 별 처음 시작일, 마지막 종료일 구하기
df_drug_dates = df_interest \
    .groupby("drug_concept_id")\
    .agg({"drug_exposure_start_date":"min", 
          "drug_exposure_end_date": "max",
         }) \
    .reset_index()

In [5]:
# 같은 약이며, 복용 시작일이 같지만, 복용 종료일이 다른 경우의 drug_exposure_id 추출 (복용일로부터 제외하기 위해)
criteria = df_interest[["drug_concept_id", "drug_exposure_start_date"]]
drug_exposure_duplicate_id = df_interest[criteria.duplicated(keep="last")]["drug_exposure_id"]

# 추출된 drug_exposure_id 제외
df_interest_duplicate_rm = df_interest.loc[~df_interest["drug_exposure_id"].isin(drug_exposure_duplicate_id)]

In [6]:
"""
1891866 환자의 약 별 복용일 구하기
약을 여러 차례에 걸쳐 복용할 경우, 각 차례 사이에 복용 안 하는 경우도 존재
따라서 각 drug 별 복용일 산출 후, 약물 별 합산 구함
"""
df_interest_duplicate_rm["total_exposure_days"] = df_interest_duplicate_rm["drug_exposure_end_date"] - df_interest_duplicate_rm["drug_exposure_start_date"] + timedelta(days=1)
df_drug_exposure_days = df_interest_duplicate_rm.groupby("drug_concept_id")\
    .total_exposure_days \
    .sum() \
    .to_frame() \
    .reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_interest_duplicate_rm["total_exposure_days"] = df_interest_duplicate_rm["drug_exposure_end_date"] - df_interest_duplicate_rm["drug_exposure_start_date"] + timedelta(days=1)


In [7]:
# Join
df_merged = pd.merge(df_drug_dates, 
                     df_drug_exposure_days, 
                     how="inner", 
                     on="drug_concept_id")

In [8]:
# 복용일이 긴 순으로 정렬(오름차순)
df_merged.sort_values("total_exposure_days")

Unnamed: 0,drug_concept_id,drug_exposure_start_date,drug_exposure_end_date,total_exposure_days
4,40213227,1993-01-05,1993-01-05,1 days
3,40213154,1989-09-12,1998-07-07,10 days
2,19030765,1988-10-18,1998-10-05,1214 days
0,1539463,1990-03-13,1998-03-11,5484 days
1,19009384,1959-12-01,1998-10-06,14421 days


In [9]:
drug_exposure_duplicate_id

29761    62232837
29805    62232899
30828    62233775
Name: drug_exposure_id, dtype: int64