# __Feature Engineering & Encoding__

* Delete all date & name columns (and any other unnecessary columns)
* Create 'days_to_patent_expiry' column
* Combine data in several categorical columns (i.e. te_code)
* One-hot encode all categorical data

In [107]:
import pandas as pd
import numpy as np
import datetime as dt

In [108]:
# Unpickle Top 100 drugs file
import dill
Price_Patent_Data = dill.load(open('data/top_100_drugs.pkd', 'rb'))

In [109]:
Price_Patent_Data.drop('submission_date', axis = 1, inplace = True) # Submission_date isn't diverse enough to be useful

In [110]:
#Convert all to datetimes - not needed if unpickling
Price_Patent_Data['effective_date'] = pd.to_datetime(Price_Patent_Data['effective_date'])
Price_Patent_Data['corresponding_generic_drug_effective_date'] = pd.to_datetime(Price_Patent_Data['corresponding_generic_drug_effective_date'])
Price_Patent_Data['approval_date'] = pd.to_datetime(Price_Patent_Data['approval_date'])
Price_Patent_Data['patent_expire_date_text'] = pd.to_datetime(Price_Patent_Data['patent_expire_date_text'])
# Price_Patent_Data['submission_date'] = pd.to_datetime(Price_Patent_Data['submission_date'])
# Price_Patent_Data['exclusivity_date'] = pd.to_datetime(Price_Patent_Data['exclusivity_date'])

### __Drop ndc_description__
Duplicate information found in drug_names, strength, route, and other columns

In [111]:
Price_Patent_Data.drop_duplicates(keep = 'first', inplace = True)
Price_Patent_Data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214082 entries, 68472 to 6411864
Data columns (total 17 columns):
nadac_per_unit                               214082 non-null float64
drug_names                                   214082 non-null object
ndc                                          214082 non-null float64
effective_date                               214082 non-null datetime64[ns]
classification_for_rate_setting              214082 non-null object
corresponding_generic_drug_effective_date    214082 non-null datetime64[ns]
corresponding_generic_drug_nadac_per_unit    214082 non-null float64
otc                                          214082 non-null object
approval_date                                132995 non-null datetime64[ns]
patent_expire_date_text                      2195 non-null datetime64[ns]
pricing_unit                                 214082 non-null object
ingredient                                   132995 non-null object
applicant                          

### __Create features__

i.e. If a drug has a generic equivalent (i.e. a classification_for_rate_setting value of 'G', and a te_code of 'A')...see notes

In [112]:
# Function combining all feature engineerings

def create_features(df):
    import re
    # Create days_before_patent_expires feature
    df['days_before_patent_expires'] = (df['patent_expire_date_text'] - df['approval_date']).astype('timedelta64[D]')

    # Create drug_age feature
    df['drug_age'] = df.groupby(['ndc'])['approval_date'].transform(lambda x: dt.date.today() - x.min().date()).astype('m8[ns]')/np.timedelta64(1, 'D')

    # Drop two datetime columns now that we're done with them
    df.drop(columns = ['approval_date', 'patent_expire_date_text'], inplace = True)

    # Combine te_code categories
    df['te_code'] = df['te_code'].str.replace(r"(^A.*)", "A", regex = True)
    df['te_code'] = df['te_code'].str.replace(r"(^B.*)", "B", regex = True)
    df.te_code.value_counts(dropna = False)

    # Combine classifications listed in classification_for_rate_setting feature
    df['classification_for_rate_setting'] = df['classification_for_rate_setting'].str.replace(r"(^B.*)", "B", regex = True)
    df['classification_for_rate_setting'].value_counts(dropna = False)

    # Simplify exclusivity_code - Delete everything after the dash
#     df['exclusivity_code'] = df['exclusivity_code'].str.replace(r"(.*)-.*", '\\1', regex = True)
#     df.exclusivity_code.value_counts(dropna = False)

    # Aggregate values in dosage_form feature with less than 1000 counts
    df.loc[df.groupby('dosage_form').dosage_form.transform('count').lt(1000), 'dosage_form'] = 'OTHER'
    df['dosage_form'].value_counts(dropna = False)

    # Aggregate values in route feature with less than 100 counts
    df.loc[df.groupby('route').dosage_form.transform('count').lt(100), 'route'] = 'OTHER'
    df['route'].value_counts(dropna = False)
    
    # Correct the corresponding_generic_drug_effective_date and corresponding_generic_drug_nadac_per_unit data (generics can't have data in these columns)
    df.loc[df.classification_for_rate_setting == 'G', ['corresponding_generic_drug_effective_date', 'corresponding_generic_drug_nadac_per_unit']] = pd.NaT, np.NaN

    # Create a new generic_exists column
    df['generic_exists'] = np.where((df['classification_for_rate_setting'] == 'B') & (df['corresponding_generic_drug_nadac_per_unit'].notnull()), 1, 0)  
    df['generic_exists'].value_counts(dropna = False)

In [113]:
create_features(Price_Patent_Data)

### __Convert dates to day, month, year columns__

(for one-hot encoding later)

In [114]:
# Convert effective_date column values
Price_Patent_Data['effective_date_year'] = (Price_Patent_Data['effective_date'].dt.year).astype('float16')
Price_Patent_Data['effective_date_month'] = (Price_Patent_Data['effective_date'].dt.month).astype('float16')
Price_Patent_Data['effective_date_day'] = (Price_Patent_Data['effective_date'].dt.day).astype('float16')

# Convert corresponding_generic_drug_effective_date column values
Price_Patent_Data['corresponding_generic_drug_effective_year'] = (Price_Patent_Data['corresponding_generic_drug_effective_date'].dt.year).astype('float16')
Price_Patent_Data['corresponding_generic_drug_effective_month'] = (Price_Patent_Data['corresponding_generic_drug_effective_date'].dt.month).astype('float16')
Price_Patent_Data['corresponding_generic_drug_effective_day'] = (Price_Patent_Data['corresponding_generic_drug_effective_date'].dt.day).astype('float16')

# Drop both original columns
Price_Patent_Data.drop(['effective_date', 'corresponding_generic_drug_effective_date'], axis = 1, inplace = True)

### __Fix column data types__

In [115]:
Price_Patent_Data[[
                   'nadac_per_unit', 
                   'corresponding_generic_drug_nadac_per_unit', 
                   'days_before_patent_expires',
                   'drug_age',
                   'ndc', 
                   ]] = Price_Patent_Data[['nadac_per_unit', 'corresponding_generic_drug_nadac_per_unit', 'days_before_patent_expires', 'drug_age', 'ndc']].astype('float32')

In [116]:
# Drop drug_names (carry on with 'ndc')
Price_Patent_Data.drop('drug_names', axis = 1, inplace = True)

In [117]:
# Drop rows without a nadac_per_unit (price)
Price_Patent_Data = Price_Patent_Data[Price_Patent_Data.nadac_per_unit > 0]
Price_Patent_Data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214082 entries, 68472 to 6411864
Data columns (total 21 columns):
nadac_per_unit                                214082 non-null float32
ndc                                           214082 non-null float32
classification_for_rate_setting               214082 non-null object
corresponding_generic_drug_nadac_per_unit     2402 non-null float32
otc                                           214082 non-null object
pricing_unit                                  214082 non-null object
ingredient                                    132995 non-null object
applicant                                     132995 non-null object
te_code                                       125918 non-null object
type                                          132995 non-null object
dosage_form                                   132995 non-null object
route                                         132995 non-null object
days_before_patent_expires                    2195 non-

### __Drop unneeded columns__
Dropping several columns that have low amounts of data (you have to go back to previous notebooks to see this is the case), or have a weak theoretical basis for being included.

In [118]:
Price_Patent_Reg = Price_Patent_Data.drop([
                                         'drug_age',
#                                          'effective_date_year',
#                                          'effective_date_month',
#                                          'effective_date_day',
                                         'classification_for_rate_setting',
                                         'corresponding_generic_drug_nadac_per_unit',
                                         'pricing_unit',
                                         'ingredient',
                                         'applicant',
#                                          'otc',
                                         'type',
#                                          'dosage_form',
                                         'route',
                                         ], axis = 1)

### __Create dummies for the following columns:__

* One-hot encode in the sklearn pipline later (instead of doing this now)

In [119]:
# Replace columns above with binary versions (including ndc_description)
Price_Patent_Reg = pd.get_dummies(data = Price_Patent_Reg, drop_first = True, columns = [
                                                                                        'otc',
                                                                                        'te_code',
                                                                                        'dosage_form', 
                                                                                        ])
Price_Patent_Reg.info(verbose = True, null_counts = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214082 entries, 68472 to 6411864
Data columns (total 15 columns):
nadac_per_unit                                214082 non-null float32
ndc                                           214082 non-null float32
days_before_patent_expires                    2195 non-null float32
generic_exists                                214082 non-null int32
effective_date_year                           214082 non-null float16
effective_date_month                          214082 non-null float16
effective_date_day                            214082 non-null float16
corresponding_generic_drug_effective_year     2402 non-null float16
corresponding_generic_drug_effective_month    2402 non-null float16
corresponding_generic_drug_effective_day      2402 non-null float16
dosage_form_SPRAY                             214082 non-null uint8
dosage_form_TABLET                            214082 non-null uint8
dosage_form_TABLET, CHEWABLE                  214082 non-n

### __Fill NaNs, update dtypes?__

In [120]:
# Fill NaNs with zeros (research better alternative)
Price_Patent_Reg.fillna(0, inplace = True) 

### __Pickle__

In [121]:
# Pickle data
dill.dump(Price_Patent_Reg, open('data/features_created.pkd', 'wb'))

In [122]:
# Save as CSV - create dummy variables and sparse matrix in Regressions notebook
Price_Patent_Reg.to_csv('data/Price_Patent_Reg.csv')