In [34]:
import pandas as pd
import numpy as np

In [35]:
# Load the cleaned dataset that every later cell derives from.
df = pd.read_csv(filepath_or_buffer='data/data_clean.csv')

## 2) Clinical dosing algorithm

In [51]:
# Build the inputs of the clinical dosing algorithm on a copy, so that `df`
# itself is left untouched and this cell can be re-run safely.
alt = df.copy()

# The decade is the first character of the 'Age' string. 'unknown' and any
# non-string entry (e.g. a real NaN) become NaN instead of raising.
alt['Age in decades'] = alt['Age'].apply(
    lambda x: int(x[0]) if isinstance(x, str) and x != 'unknown' else np.nan
)

# Turn the 'unknown' placeholder into a real missing value so that a later
# dropna() can see it.
alt.loc[alt['Height (cm)'] == 'unknown', 'Height (cm)'] = np.nan
alt.loc[alt['Weight (kg)'] == 'unknown', 'Weight (kg)'] = np.nan

# Race indicators as 0.0 / 1.0 floats; any value other than the one tested
# (including a missing value) maps to 0.0.
alt['Asian race'] = alt['Race'].eq('Asian').astype(float)
alt['Black or African American race'] = alt['Race'].eq('Black or African American').astype(float)
# Only the literal 'Unknown' category is flagged here.
alt['Missing or mixed race'] = alt['Race'].eq('Unknown').astype(float)

# Medication flags. The values are parsed with pd.to_numeric before being
# compared with 1, so the test gives the same answer whether pandas read the
# column as numbers (1 / 1.0) or as text ('1', '1.0'). Anything that is not a
# number (e.g. a text placeholder) is coerced to NaN and counts as "not taking".
# Previously the inducer columns were compared with the integer 1 while
# amiodarone was matched against the string '1.0'; both now use one rule.
alt['Enzyme inducer status'] = (
    alt[['Carbamazepine (Tegretol)', 'Phenytoin (Dilantin)', 'Rifampin or Rifampicin']]
    .apply(pd.to_numeric, errors='coerce')
    .eq(1)
    .any(axis=1)   # taking at least one of the three inducers
    .astype(int)
)

alt['Amiodarone status'] = (
    pd.to_numeric(alt['Amiodarone (Cordarone)'], errors='coerce')
    .eq(1)
    .astype(float)
)

In [52]:
# Restrict the frame to the algorithm inputs, the two genotype columns and the
# target label, then discard every row in which any of them is NaN.
# NOTE(review): only real NaN values are dropped here; string placeholders
# such as 'unknown' in the genotype columns are kept - confirm this is intended.
alt = (
    alt.loc[:, [
        'Age in decades',
        'Height (cm)',
        'Weight (kg)',
        'Asian race',
        'Black or African American race',
        'Missing or mixed race',
        'Enzyme inducer status',
        'Amiodarone status',
        'VKORC1_SNP',
        'CYP2C9 consensus',
        'correct_dosage',
    ]]
    .dropna(axis='index', how='any')
    .copy()
)

In [53]:
# Linear formula for the square root of the weekly dose. Height and weight are
# cast to float on the fly because they are still stored as objects.
alt['sqrt_weekly_dose'] = (
    4.0376
    - 0.2546 * alt['Age in decades']
    + 0.0118 * alt['Height (cm)'].astype(float)
    + 0.0134 * alt['Weight (kg)'].astype(float)
    - 0.6752 * alt['Asian race']
    + 0.4060 * alt['Black or African American race']
    + 0.0443 * alt['Missing or mixed race']
    + 1.2799 * alt['Enzyme inducer status']
    - 0.5695 * alt['Amiodarone status']
)

# Bucket the prediction: the cut-offs of 21 and 49 are compared on the
# square-root scale. Values strictly below the first are 'low', strictly
# above the second are 'high', everything else is 'medium'.
alt['pred_weekly_dose'] = np.select(
    [
        alt['sqrt_weekly_dose'] < np.sqrt(21),
        alt['sqrt_weekly_dose'] > np.sqrt(49),
    ],
    ['low', 'high'],
    default='medium',
)

In [54]:
# Share of rows whose predicted dose bucket equals the true bucket.
len(alt[alt['pred_weekly_dose'] == alt['correct_dosage']]) / len(alt)

0.6527587779297765

## 1) Fixed-dose

In [55]:
# Always predicting 'medium' is right exactly as often as the true label is
# 'medium', so the accuracy is simply that share of rows.
len(alt[alt['correct_dosage'].eq('medium')]) / len(alt)

0.6158230734154126

## Save the cleaned data (rows with missing values dropped)


In [58]:
# Write the feature table without the two derived prediction columns; the row
# index is not written to the file.
alt.drop(['sqrt_weekly_dose', 'pred_weekly_dose'], axis=1).to_csv(
    'data/clean_nonull.csv',
    index=False,
)

In [60]:
# Spot-check a single row, selected by position rather than by index label.
alt.iloc[2405, :]

Age in decades                         6.0
Height (cm)                         175.26
Weight (kg)                          101.2
Asian race                             0.0
Black or African American race         0.0
Missing or mixed race                  0.0
Enzyme inducer status                    0
Amiodarone status                      0.0
VKORC1_SNP                             G/G
CYP2C9 consensus                     *1/*1
correct_dosage                      medium
sqrt_weekly_dose                  5.934148
pred_weekly_dose                    medium
Name: 2675, dtype: object

In [61]:
# Show the final frame, which still carries the sqrt_weekly_dose and
# pred_weekly_dose columns added above (drop() for the CSV export returned a
# new frame and did not modify `alt`).
alt

Unnamed: 0,Age in decades,Height (cm),Weight (kg),Asian race,Black or African American race,Missing or mixed race,Enzyme inducer status,Amiodarone status,VKORC1_SNP,CYP2C9 consensus,correct_dosage,sqrt_weekly_dose,pred_weekly_dose
0,6.0,193.04,115.7,0.0,0.0,0.0,0,0.0,A/G,*1/*1,medium,6.338252,medium
1,5.0,176.53,144.2,0.0,0.0,0.0,0,0.0,A/A,*1/*1,medium,6.779934,medium
2,4.0,162.56,77.1,0.0,0.0,0.0,0,0.0,G/G,*1/*1,high,5.970548,medium
3,6.0,182.24,90.7,0.0,0.0,0.0,0,0.0,A/G,*1/*1,medium,5.875812,medium
4,5.0,167.64,72.6,0.0,0.0,0.0,0,0.0,A/G,*1/*3,medium,5.715592,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5523,2.0,185.42,113.6,0.0,0.0,0.0,0,0.0,unknown,*1/*1,high,7.238596,high
5524,7.0,160.02,55.9,0.0,0.0,0.0,0,0.0,unknown,*1/*3,medium,4.892696,medium
5525,6.0,187.96,97.7,0.0,0.0,0.0,0,0.0,unknown,*1/*1,high,6.037108,medium
5526,6.0,177.8,87.3,0.0,0.0,0.0,0,0.0,unknown,unknown,high,5.777860,medium
