In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the warfarin table, then:
#   1. keep only rows that have a therapeutic dose recorded,
#   2. discard columns that are entirely empty,
#   3. replace every remaining missing value with the string 'unknown'.
df = (
    pd.read_csv('data/warfarin.csv')
    .dropna(subset=['Therapeutic Dose of Warfarin'])
    .dropna(how='all', axis=1)
    .fillna('unknown')
    .copy()
)

# Bucket the dose: below 21 -> 'low', above 49 -> 'high', everything else
# (21 through 49 inclusive) -> 'medium'.
df['correct_dosage'] = np.select(
    condlist=[
        df['Therapeutic Dose of Warfarin'] < 21,
        df['Therapeutic Dose of Warfarin'] > 49,
    ],
    choicelist=['low', 'high'],
    default='medium',
)

In [3]:
# Summary statistics of the therapeutic dose within each correct_dosage bucket.
(
    df
    .groupby('correct_dosage')['Therapeutic Dose of Warfarin']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
correct_dosage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high,651.0,63.769478,18.35818,49.42,52.5,59.5,70.0,315.0
low,1495.0,14.447819,4.031703,2.1,11.25,15.0,17.5,20.75
medium,3382.0,31.972874,7.785291,21.0,25.0,31.275,37.5,49.0


In [4]:
def impute_vkorc1(x):
    """Return the VKORC1 -1639 genotype for one row, imputing it when missing.

    If ``x['VKORC1 -1639 consensus']`` is anything other than the placeholder
    ``'unknown'`` it is returned untouched. Otherwise the genotype is inferred
    from other VKORC1 SNP columns, tried in this order:

    1. ``'VKORC1 2255 consensus'`` (rs2359612) -- eligible races only
    2. ``'VKORC1 1173 consensus'`` (rs9934438) -- any race
    3. ``'VKORC1 1542 consensus'`` (rs8050894) -- eligible races only

    A race is eligible unless it is 'Black or African American' or unknown.
    The race comparison ignores case and surrounding whitespace, so a lowercase
    'unknown' placeholder is excluded exactly like the literal 'Unknown'.

    Parameters
    ----------
    x : mapping-like (for example a DataFrame row)
        Must support ``x[key]`` for 'Race' and the VKORC1 consensus columns
        named above.

    Returns
    -------
    The imputed genotype ('G/G', 'A/G' or 'A/A') when a proxy SNP allows it,
    otherwise the original -1639 value (including 'unknown' when the genotype
    cannot be imputed).
    """
    observed = x['VKORC1 -1639 consensus']
    if observed != 'unknown':
        return observed  # already known: never overwrite

    # Normalise before comparing so 'Unknown', 'unknown' and ' Unknown ' are
    # all treated as an unknown race.
    race = str(x['Race']).strip().lower()
    race_eligible = race not in ('black or african american', 'unknown')

    # rs2359612 and rs9934438 share one allele mapping onto -1639.
    c_t_map = {'C/C': 'G/G', 'T/T': 'A/A', 'C/T': 'A/G'}

    if race_eligible and x['VKORC1 2255 consensus'] in c_t_map:  # rs2359612
        return c_t_map[x['VKORC1 2255 consensus']]

    if x['VKORC1 1173 consensus'] in c_t_map:  # rs9934438
        return c_t_map[x['VKORC1 1173 consensus']]

    if race_eligible:
        g_c_map = {'G/G': 'G/G', 'C/C': 'A/A', 'C/G': 'A/G'}
        if x['VKORC1 1542 consensus'] in g_c_map:  # rs8050894
            return g_c_map[x['VKORC1 1542 consensus']]

    return observed  # unimputable: leave as 'unknown'

In [5]:
# Row-wise imputation: impute_vkorc1 receives each row and returns its genotype.
df['VKORC1_SNP'] = df.apply(impute_vkorc1, axis=1)

In [6]:
# Columns to leave out of the cleaned table: every column whose name starts
# with 'VKORC1' except the imputed 'VKORC1_SNP', followed by every column whose
# name mentions CYP2C9 (case-insensitive) except 'CYP2C9 consensus'.
ignore_cols = [
    *(
        col for col in df.columns
        if col.startswith('VKORC1') and col != 'VKORC1_SNP'
    ),
    *(
        col for col in df.columns
        if 'CYP2C9' in col.upper() and col != 'CYP2C9 consensus'
    ),
]

ignore_cols

['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T',
 'VKORC1 QC genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T',
 'VKORC1 genotype: 497T>G (5808); chr16:31013055; rs2884737; A/C',
 'VKORC1 QC genotype: 497T>G (5808); chr16:31013055; rs2884737; A/C',
 'VKORC1 genotype: 1173 C>T(6484); chr16:31012379; rs9934438; A/G',
 'VKORC1 QC genotype: 1173 C>T(6484); chr16:31012379; rs9934438; A/G',
 'VKORC1 genotype: 1542G>C (6853); chr16:31012010; rs8050894; C/G',
 'VKORC1 QC genotype: 1542G>C (6853); chr16:31012010; rs8050894; C/G',
 'VKORC1 genotype: 3730 G>A (9041); chr16:31009822; rs7294;  A/G',
 'VKORC1 QC genotype: 3730 G>A (9041); chr16:31009822; rs7294;  A/G',
 'VKORC1 genotype: 2255C>T (7566); chr16:31011297; rs2359612; A/G',
 'VKORC1 QC genotype: 2255C>T (7566); chr16:31011297; rs2359612; A/G',
 'VKORC1 genotype: -4451 C>A (861); Chr16:31018002; rs17880887; A/C',
 'VKORC1 QC genotype: -4451 C>A (861); Chr16:31018002; rs17880887; A/C',
 'VKORC1 -1639 consensu

In [7]:
# Preview the column names that remain once the ignored columns are removed.
df.drop(labels=ignore_cols, axis='columns').columns

Index(['PharmGKB Subject ID', 'Gender', 'Race', 'Ethnicity', 'Age',
       'Height (cm)', 'Weight (kg)', 'Indication for Warfarin Treatment',
       'Comorbidities', 'Diabetes',
       'Congestive Heart Failure and/or Cardiomyopathy', 'Valve Replacement',
       'Medications', 'Aspirin', 'Acetaminophen or Paracetamol (Tylenol)',
       'Was Dose of Acetaminophen or Paracetamol (Tylenol) >1300mg/day',
       'Simvastatin (Zocor)', 'Atorvastatin (Lipitor)', 'Fluvastatin (Lescol)',
       'Lovastatin (Mevacor)', 'Pravastatin (Pravachol)',
       'Rosuvastatin (Crestor)', 'Cerivastatin (Baycol)',
       'Amiodarone (Cordarone)', 'Carbamazepine (Tegretol)',
       'Phenytoin (Dilantin)', 'Rifampin or Rifampicin',
       'Sulfonamide Antibiotics', 'Macrolide Antibiotics',
       'Anti-fungal Azoles', 'Herbal Medications, Vitamins, Supplements',
       'Target INR', 'Estimated Target INR Range Based on Indication',
       'Subject Reached Stable Dose of Warfarin',
       'Therapeutic Dose of 

In [59]:
# Write the table without the ignored columns (and without the index) to disk.
df.drop(labels=ignore_cols, axis='columns').to_csv('data/data_clean.csv', index=False)