In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from tqdm import tqdm  # Import tqdm for progress bar
# Added notes here.

# Define a function to load data
def load_data(base_path, filenames=None):
    """Load the cleaned training CSV files into a dict of DataFrames.

    Each ``<base_path>/<filename>.csv`` is read with ISO-8859-1 decoding, a
    byte-order-mark artifact is stripped from its column names, missing values
    are replaced with the string ``'N/A'``, and the resulting frame is stored
    under its file name (without the ``.csv`` extension).

    Parameters
    ----------
    base_path : str or path-like
        Directory that contains the CSV files.
    filenames : iterable of str, optional
        File names without the ``.csv`` extension. When omitted, the three
        training tables (target, medical claims, rx claims) are loaded.

    Returns
    -------
    dict
        Maps each file name to its loaded ``pandas.DataFrame``.

    Notes
    -----
    Errors raised by ``pandas.read_csv`` (for example a missing file)
    propagate to the caller unchanged.
    """
    if filenames is None:
        filenames = ['target_train_cleaned9.28', 'medclms_train_cleaned9.29', 'rxclms_train_cleaned9.28']

    # A UTF-8 byte-order mark (bytes EF BB BF) decoded as ISO-8859-1 becomes
    # these three characters glued to the first column name.
    bom_prefix = '\xef\xbb\xbf'

    def _strip_bom(column):
        # Drop the BOM artifact from a header; other labels pass through untouched.
        if isinstance(column, str) and column.startswith(bom_prefix):
            return column[len(bom_prefix):]
        return column

    datasets = {}
    for filename in filenames:
        dataset = pd.read_csv(f"{base_path}/{filename}.csv", low_memory=False, encoding='ISO-8859-1')
        dataset = dataset.rename(columns=_strip_bom)
        # NOTE(review): filling every column with a string turns numeric columns
        # that contain gaps into object dtype -- confirm downstream code expects that.
        dataset = dataset.fillna('N/A')
        datasets[filename] = dataset

    return datasets


In [2]:


# Folder that holds the cleaned training CSV files.
# NOTE(review): this is an absolute, machine-specific path -- adjust it before
# running the notebook anywhere else.
base_path = "/Users/nathanzlomke/Downloads"

# Read every training table into a dict keyed by file name (without extension)
datasets = load_data(base_path=base_path)


  dataset.fillna('N/A', inplace=True)
  dataset.fillna('N/A', inplace=True)
  dataset.fillna('N/A', inplace=True)


In [3]:
# The rx claims file starts with a UTF-8 byte-order mark; decoded as ISO-8859-1
# it shows up as 'ï»¿' in front of the first header, so restore the plain name.
# The result is rebound rather than renamed in place: `inplace` expects a bool,
# and a string such as 'True' is merely truthy. Rebinding is also safe to
# re-run, because rename ignores labels that are not present.
datasets['rxclms_train_cleaned9.28'] = datasets['rxclms_train_cleaned9.28'].rename(
    columns={'ï»¿therapy_id': 'therapy_id'}
)

In [4]:
# Preview the first five rows of the rx claims table (check that the key column reads 'therapy_id')
datasets['rxclms_train_cleaned9.28'].head()

Unnamed: 0,therapy_id,document_key,ndc_id,service_date,process_date,Prescription_Filled_Duration,RX_Process_Duration,pay_day_supply_cnt,rx_cost,tot_drug_cost_accum_amt,...,strength_meas,metric_strength,specialty_ind,clm_type,ddi_ind,anticoag_ind,diarrhea_treat_ind,nausea_treat_ind,seizure_treat_ind,RxGroupings
0,1009508044-TAGRISSO-1,A184611654291011,169266015,3/2/18,12/4/19,-692,642,30,919.78,2830.22,...,UNIT/ML,100.0,NONSPCL,rx,No,No,No,No,No,Diabetes
1,1023838279-TAGRISSO-1,A184877988141011,69097022416,3/28/18,3/13/20,-569,716,28,5.0,1138.66,...,MG,70.0,NONSPCL,rx,No,No,No,No,No,Other
2,1023838279-TAGRISSO-1,A185952462961011,60505257908,7/14/18,3/13/20,-461,608,30,7.3,2693.11,...,MG,20.0,NONSPCL,rx,No,No,No,No,No,Cardio
3,1023838279-TAGRISSO-1,A186219930881011,65862057290,8/9/18,3/17/20,-435,586,30,12.2,2718.31,...,MG,160.0,NONSPCL,rx,No,No,No,No,No,Cardio
4,1071647492-TAGRISSO-1,A194013465121011,16252060144,1/1/19,3/3/20,-364,427,90,9.9,0.0,...,MG,70.0,NONSPCL,rx,No,No,No,No,No,Other


In [9]:

# Left-join the medical claims and then the rx claims onto the target table,
# matching on 'therapy_id'. how='left' keeps every target row, including
# therapies that have no matching claims.
# NOTE(review): the claim tables can hold several rows per therapy_id, so each
# join may multiply rows -- confirm this fan-out is intended.
merged_data = (
    datasets['target_train_cleaned9.28']
    .merge(datasets['medclms_train_cleaned9.29'], how='left', on='therapy_id')
    .merge(datasets['rxclms_train_cleaned9.28'], how='left', on='therapy_id')
)
merged_data.shape

(3896297, 91)

In [10]:

# Encode the target flag numerically: 'Yes' -> 1, 'No' -> 0.
# Values other than these two strings are left as they are.
merged_data['tgt_ade_dc_ind'] = merged_data['tgt_ade_dc_ind'].replace(
    to_replace={'Yes': 1, 'No': 0}
)

# Report summary statistics for the encoded target column
target_summary = merged_data['tgt_ade_dc_ind'].describe()
print("Statistics about 'tgt_ade_dc_ind' column:")
print(target_summary)


Statistics about 'tgt_ade_dc_ind' column:
count    3.896297e+06
mean     1.974128e-01
std      3.980465e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: tgt_ade_dc_ind, dtype: float64


In [32]:
# List every column of the merged frame together with its dtype.
# NOTE(review): the saved output reports 3,878,636 rows while the merge cell
# reported 3,896,297 -- presumably from a different run; restart the kernel and
# run all cells to confirm the outputs belong together.
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3878636 entries, 0 to 3878635
Data columns (total 91 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   therapy_id                    object 
 1   medclm_key                    int64  
 2   clm_unique_key                float64
 3   primary_diag_cd               object 
 4   visit_date                    object 
 5   Visit_Duration                object 
 6   diag_cd2                      object 
 7   diag_cd3                      object 
 8   diag_cd4                      object 
 9   diag_cd5                      object 
 10  diag_cd6                      object 
 11  diag_cd7                      object 
 12  diag_cd8                      object 
 13  diag_cd9                      object 
 14  process_date_x                object 
 15  MedProcess_Duration           int64  
 16  reversal_iNod_x               object 
 17  pot                           object 
 18  util_cat              

In [33]:
# Preview the first five rows of the merged frame.
merged_data.head()

Unnamed: 0,therapy_id,medclm_key,clm_unique_key,primary_diag_cd,visit_date,Visit_Duration,diag_cd2,diag_cd3,diag_cd4,diag_cd5,...,id,therapy_start_date,therapy_end_date,Date Duration,tgt_ade_dc_ind,race_cd,est_age,sex_cd,cms_disabled_ind,cms_low_income_ind
0,1066310426-TAGRISSO-1,35908472910,6.49e+17,I70292,9/10/19,-128.0,I771,I739,R9431,Z7982,...,1066310426,1/16/20,2/15/20,30,0,White,74.0,M,No,No
1,1066310426-TAGRISSO-1,35908472910,6.49e+17,I70292,9/10/19,-128.0,I771,I739,R9431,Z7982,...,1066310426,1/16/20,2/15/20,30,0,White,74.0,M,No,No
2,1066310426-TAGRISSO-1,35908472910,6.49e+17,I70292,9/10/19,-128.0,I771,I739,R9431,Z7982,...,1066310426,1/16/20,2/15/20,30,0,White,74.0,M,No,No
3,1066310426-TAGRISSO-1,35908472910,6.49e+17,I70292,9/10/19,-128.0,I771,I739,R9431,Z7982,...,1066310426,1/16/20,2/15/20,30,0,White,74.0,M,No,No
4,1066310426-TAGRISSO-1,35908472910,6.49e+17,I70292,9/10/19,-128.0,I771,I739,R9431,Z7982,...,1066310426,1/16/20,2/15/20,30,0,White,74.0,M,No,No


In [11]:
# Write the merged frame to disk. index=True is the pandas default, so the row
# index is stored as an unnamed first column of the CSV.
# NOTE(review): the file name says "inner", but the merges in this notebook use
# how='left' -- confirm which join the downstream consumers expect.
merged_data.to_csv(
    path_or_buf="/Users/nathanzlomke/Downloads/CORRECTED_cleaned_humana_inner10-9.csv",
    index=True,
)