In [63]:
import pandas as pd
import ast
from sklearn.preprocessing import MultiLabelBinarizer

# Load Data

In [64]:
# Product-level NDC records; merged with the label data further down on `spl_id`.
ndc = pd.read_csv(filepath_or_buffer='data/ndc.csv')

In [65]:
# Label records, matched to NDC products on `id` in the merge below.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the leftover warning text under this
# cell looks like the tail of a pandas DtypeWarning (mixed types) -- confirm,
# and consider passing explicit `dtype=` for the affected columns.
label = pd.read_csv('data/label.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


# Data Cleaning

## Join the Files

In [66]:
# Left join: every NDC row is kept, and label fields are attached wherever the
# product's `spl_id` equals a label `id`.
df = pd.merge(ndc, label, how='left', left_on='spl_id', right_on='id')

In [67]:
# Only the merged frame is used from here on; drop the two inputs to free memory.
del ndc, label

## Remove unfinished products because the FDA only reviews finished products. 

In [68]:
# Keep only the rows whose `finished` value equals True.
df = df.query('finished == True')

In [69]:
# Display the columns available after the join and the finished-product filter.
df.columns

Index(['product_id', 'product_ndc', 'spl_id', 'application_number',
       'dea_schedule', 'dosage_form', 'finished', 'marketing_category',
       'marketing_start_date', 'marketing_end_date', 'openfda.pharm_class_cs',
       'openfda.pharm_class_epc', 'openfda.pharm_class_pe',
       'openfda.pharm_class_moa', 'pharm_class', 'product_type', 'route', 'id',
       'set_id', 'version', 'effective_time', 'drug_interactions'],
      dtype='object')

## Select Columns 

Exclude IDs and other columns that won't be used as features due to bias detected during the exploration phase. 

In [70]:
# Columns carried forward into feature engineering, in output order.
cols_to_keep = [
    'application_number',  # consumed later to derive the `approved` target
    'dea_schedule', 'dosage_form', 'marketing_start_date',
    'openfda.pharm_class_cs', 'openfda.pharm_class_epc',
    'openfda.pharm_class_pe', 'openfda.pharm_class_moa',
    'product_type', 'route', 'drug_interactions',
]

In [71]:
# Take an explicit copy of the selected columns. Later cells add, fill and drop
# columns on `df`; doing that on a plain column slice of the merged frame is
# what triggers pandas' SettingWithCopyWarning.
df = df[cols_to_keep].copy()

## Derive Target

Derive target variable from application_number. All products with an application number have been approved. 

In [72]:
# Target: 1 when the product has an application number, 0 when it is missing.
# `notna()` is the vectorised equivalent of the per-row null check, and
# `assign` returns a new frame (with `approved` appended as the last column)
# instead of writing into what may be a slice of the merged frame.
df = df.assign(approved=df['application_number'].notna().astype('int64'))

In [73]:
# `application_number` is no longer needed once the target exists; keeping it
# would leak the target into the features. Rebind rather than use
# `inplace=True`, which can raise SettingWithCopyWarning on a sliced frame.
df = df.drop(columns=['application_number'])

## Replace Nulls 

In [74]:
# Share of missing values per column: the mean of the boolean null mask.
df.isna().mean()

dea_schedule               0.955558
dosage_form                0.000000
marketing_start_date       0.000000
openfda.pharm_class_cs     0.852796
openfda.pharm_class_epc    0.730471
openfda.pharm_class_pe     0.879947
openfda.pharm_class_moa    0.871066
product_type               0.000000
route                      0.018502
drug_interactions          0.648037
approved                   0.000000
dtype: float64

All columns with nulls are categorical, so let's replace their nulls with a single category. 

In [75]:
# Replace every remaining null with the sentinel category 'N/A'.
# Columns without nulls are left unchanged, so this matches filling only the
# null-bearing columns. The previous `df[col].fillna(..., inplace=True)` form
# is chained assignment: it acts on a temporary Series, is deprecated in
# pandas 2.x, and silently does nothing once Copy-on-Write is enabled.
df = df.fillna('N/A')

In [76]:
# Count the nulls left in each column after the fill.
df.isnull().sum()

dea_schedule               0
dosage_form                0
marketing_start_date       0
openfda.pharm_class_cs     0
openfda.pharm_class_epc    0
openfda.pharm_class_pe     0
openfda.pharm_class_moa    0
product_type               0
route                      0
drug_interactions          0
approved                   0
dtype: int64

# Feature Transformation

## Lists to Columns

Some of the columns contain lists of categories. We need to convert these into binary columns, one per category. This is like one-hot encoding, but with a list of values per row instead of a single value. However, these fields are currently stored as strings, so we first need to parse them into Python lists with `ast.literal_eval`. 

In [87]:
# Columns whose cells hold a list of categories serialised as a string.
cols_with_lists = [
    'route',
    'openfda.pharm_class_cs', 'openfda.pharm_class_epc',
    'openfda.pharm_class_pe', 'openfda.pharm_class_moa',
]

In [77]:
def _parse_list_cell(value):
    """Convert one raw cell of a list-valued column into a Python list.

    Parameters
    ----------
    value : str or list
        Either the 'N/A' sentinel used for missing values, the string form of
        a Python list literal, or an already-parsed list.

    Returns
    -------
    list
        ``['N/A']`` for the sentinel; the unchanged list when the cell was
        already parsed; otherwise the result of ``ast.literal_eval``.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the string is not a valid
        Python literal.
    """
    # Already parsed: lets this cell be re-run without literal_eval failing
    # on list input.
    if isinstance(value, list):
        return value
    # Missing values become a one-element list so they get their own
    # indicator column when binarised.
    if value == 'N/A':
        return [value]
    return ast.literal_eval(value)


for col in cols_with_lists:
    df[col] = df[col].apply(_parse_list_cell)

Now let's create a function to binarize each column that contains lists as values.

In [95]:
def binarize_list_column(df, colname):
    """One-hot encode a column whose cells are lists of category labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``colname``; each cell must be an iterable of labels.
    colname : str
        Name of the list-valued column to encode.

    Returns
    -------
    pandas.DataFrame
        One 0/1 column per distinct label, named ``<colname>_<label>``, with
        the same index as ``df`` so it can be concatenated column-wise
        without misaligning rows.
    """
    mlb = MultiLabelBinarizer()
    encoded = mlb.fit_transform(df[colname])
    class_names = [f'{colname}_{label}' for label in mlb.classes_]
    # Reuse df's index: the default RangeIndex would not line up with a frame
    # whose rows were filtered earlier, and concat(axis=1) aligns on index.
    return pd.DataFrame(encoded, columns=class_names, index=df.index)

Apply the function to each column and horizontally concatenate the results to the input dataframe. 

In [98]:
# Build the indicator frame for every list column, then concatenate once.
# pd.concat returns a new frame, so the result must be assigned back to `df`;
# previously it was discarded and no indicator columns were ever added.
binarized_frames = []
for col in cols_with_lists:
    binarized = binarize_list_column(df, col)
    # Force row alignment with df: concat(axis=1) joins on index labels, and
    # df's index is no longer 0..n-1 after the earlier row filter.
    binarized.index = df.index
    binarized_frames.append(binarized)

df = pd.concat([df, *binarized_frames], axis=1)

Now we can drop the original untransformed columns. 

In [99]:
# Remove the raw list-valued columns now that they have been encoded.
# Rebinding instead of `inplace=True` keeps the cell free of hidden in-place
# mutation and avoids copy warnings if `df` is a slice.
df = df.drop(columns=cols_with_lists)