In [42]:
import pandas as pd

input_path = "ha_train_set/inputs.csv"
label_path = "ha_train_set/labels.csv"

inputs = pd.read_csv(input_path)
labels = pd.read_csv(label_path)

In [43]:
# Combine the inputs and labels based on PatientID column present in both
data = pd.merge(inputs, labels, on='PatientID')
data.columns

Index(['PatientID', 'State', 'Sex', 'GeneralHealth', 'AgeCategory',
       'HeightInMeters', 'WeightInKilograms', 'BMI', 'HadAngina', 'HadStroke',
       'HadAsthma', 'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder',
       'HadKidneyDisease', 'HadArthritis', 'HadDiabetes',
       'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
       'DifficultyConcentrating', 'DifficultyWalking',
       'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
       'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory',
       'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver',
       'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos', 'HadHeartAttack'],
      dtype='object')

In [31]:
## Type of Columns
data.dtypes

PatientID                      int64
State                         object
Sex                           object
GeneralHealth                 object
AgeCategory                   object
HeightInMeters               float64
WeightInKilograms            float64
BMI                          float64
HadAngina                      int64
HadStroke                      int64
HadAsthma                      int64
HadSkinCancer                  int64
HadCOPD                        int64
HadDepressiveDisorder          int64
HadKidneyDisease               int64
HadArthritis                   int64
HadDiabetes                   object
DeafOrHardOfHearing            int64
BlindOrVisionDifficulty        int64
DifficultyConcentrating        int64
DifficultyWalking              int64
DifficultyDressingBathing      int64
DifficultyErrands              int64
SmokerStatus                  object
ECigaretteUsage               object
ChestScan                      int64
RaceEthnicityCategory         object
A

In [32]:
## Convert all object type columns to category (Int Encoding)

for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = data[col].astype('category')
        data[col] = data[col].cat.codes

data.dtypes

PatientID                      int64
State                           int8
Sex                             int8
GeneralHealth                   int8
AgeCategory                     int8
HeightInMeters               float64
WeightInKilograms            float64
BMI                          float64
HadAngina                      int64
HadStroke                      int64
HadAsthma                      int64
HadSkinCancer                  int64
HadCOPD                        int64
HadDepressiveDisorder          int64
HadKidneyDisease               int64
HadArthritis                   int64
HadDiabetes                     int8
DeafOrHardOfHearing            int64
BlindOrVisionDifficulty        int64
DifficultyConcentrating        int64
DifficultyWalking              int64
DifficultyDressingBathing      int64
DifficultyErrands              int64
SmokerStatus                    int8
ECigaretteUsage                 int8
ChestScan                      int64
RaceEthnicityCategory           int8
A

In [44]:
data.head()

Unnamed: 0,PatientID,State,Sex,GeneralHealth,AgeCategory,HeightInMeters,WeightInKilograms,BMI,HadAngina,HadStroke,...,ChestScan,RaceEthnicityCategory,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,HadHeartAttack
0,1,Alabama,Female,Fair,Age 75 to 79,1.63,84.82,32.099998,1,0,...,1,"White only, Non-Hispanic",0,0,0,1,"No, did not receive any tetanus shot in the pa...",0,1,0
1,2,Alabama,Female,Very good,Age 65 to 69,1.6,71.669998,27.99,0,0,...,0,"White only, Non-Hispanic",0,0,1,1,"Yes, received Tdap",0,0,0
2,3,Alabama,Male,Excellent,Age 60 to 64,1.78,71.209999,22.530001,0,0,...,0,"White only, Non-Hispanic",1,0,0,0,"Yes, received tetanus shot but not sure what type",0,0,0
3,4,Alabama,Male,Very good,Age 70 to 74,1.78,95.25,30.129999,0,0,...,0,"White only, Non-Hispanic",0,0,1,1,"Yes, received tetanus shot but not sure what type",0,0,0
4,8,Alaska,Female,Fair,Age 60 to 64,1.7,87.540001,30.23,0,0,...,0,"Black only, Non-Hispanic",0,0,0,1,"No, did not receive any tetanus shot in the pa...",0,0,0


In [45]:
data["HadHeartAttack"].value_counts()

HadHeartAttack
0    179585
1     10572
Name: count, dtype: int64

In [46]:
## Select all rows where data["HadHeartAttack"] = 1 and equal number of rows where data["HadHeartAttack"] = 0, currently 0 is more than 1

data_1 = data[data["HadHeartAttack"] == 1]
data_0 = data[data["HadHeartAttack"] == 0]
data_0 = data_0.sample(n=len(data_1), random_state=42)

data = pd.concat([data_0, data_1])

## Shuffle the data
data = data.sample(frac=1, random_state=42)

data["HadHeartAttack"].value_counts()

HadHeartAttack
0    10572
1    10572
Name: count, dtype: int64

In [47]:
data.head()

Unnamed: 0,PatientID,State,Sex,GeneralHealth,AgeCategory,HeightInMeters,WeightInKilograms,BMI,HadAngina,HadStroke,...,ChestScan,RaceEthnicityCategory,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,HadHeartAttack
153262,191479,Utah,Male,Excellent,Age 45 to 49,1.83,90.720001,27.120001,0,0,...,0,"White only, Non-Hispanic",0,0,0,0,"Yes, received tetanus shot, but not Tdap",0,0,0
176967,221215,West Virginia,Male,Very good,Age 80 or older,1.68,72.57,25.82,1,1,...,1,"White only, Non-Hispanic",1,0,1,1,"No, did not receive any tetanus shot in the pa...",0,0,0
40724,50911,Illinois,Female,Excellent,Age 55 to 59,1.73,68.040001,22.809999,0,0,...,0,"White only, Non-Hispanic",1,0,0,0,"No, did not receive any tetanus shot in the pa...",0,1,0
112228,140381,New York,Male,Excellent,Age 75 to 79,1.65,76.199997,27.959999,1,0,...,1,"White only, Non-Hispanic",1,1,1,1,"Yes, received Tdap",0,0,1
187846,234749,Puerto Rico,Male,Fair,Age 55 to 59,1.63,63.5,24.030001,1,0,...,1,Hispanic,1,0,0,0,"No, did not receive any tetanus shot in the pa...",0,0,1


In [50]:
# Now pick around 500 samples for data["HadHeartAttack"] = 0 and 500 samples for data["HadHeartAttack"] = 1
data = data.sample(n=5000, random_state=42)
data["HadHeartAttack"].value_counts()

HadHeartAttack
0    2565
1    2435
Name: count, dtype: int64

In [49]:
data.to_csv("ha_train_set/sample_data.csv", index=False)

In [51]:
sample = pd.read_csv("ha_train_set/sample_data.csv")
sample.head()

Unnamed: 0,PatientID,State,Sex,GeneralHealth,AgeCategory,HeightInMeters,WeightInKilograms,BMI,HadAngina,HadStroke,...,ChestScan,RaceEthnicityCategory,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,HadHeartAttack
0,180377,South Dakota,Male,Good,Age 40 to 44,1.85,102.059998,29.68,0,0,...,0,"White only, Non-Hispanic",0,0,0,0,"Yes, received Tdap",0,1,0
1,118633,Montana,Female,Very good,Age 60 to 64,1.65,74.839996,27.459999,0,0,...,0,"White only, Non-Hispanic",1,1,1,1,"Yes, received Tdap",0,1,0
2,208070,Washington,Male,Good,Age 80 or older,1.83,108.860001,32.549999,0,0,...,0,"White only, Non-Hispanic",0,0,1,1,"No, did not receive any tetanus shot in the pa...",0,0,0
3,231621,Puerto Rico,Female,Excellent,Age 30 to 34,1.78,122.470001,38.740002,0,0,...,0,Hispanic,1,1,0,0,"No, did not receive any tetanus shot in the pa...",0,0,0
4,63642,Iowa,Male,Good,Age 30 to 34,1.78,99.790001,31.57,0,0,...,1,"Black only, Non-Hispanic",1,1,0,0,"Yes, received tetanus shot but not sure what type",0,0,0


In [52]:
sample.columns

Index(['PatientID', 'State', 'Sex', 'GeneralHealth', 'AgeCategory',
       'HeightInMeters', 'WeightInKilograms', 'BMI', 'HadAngina', 'HadStroke',
       'HadAsthma', 'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder',
       'HadKidneyDisease', 'HadArthritis', 'HadDiabetes',
       'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
       'DifficultyConcentrating', 'DifficultyWalking',
       'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
       'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory',
       'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver',
       'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos', 'HadHeartAttack'],
      dtype='object')

In [40]:
!pip install scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl.metadata (13 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl (11.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hUsing cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.5.2 threadpoolctl-3.5.0


In [41]:
## Train Test Split 80-20

from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2, random_state=42)


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif

# Load dataset
# df = pd.read_csv('heart_disease.csv')
df = data

# Correlation matrix
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.show()

# Feature importance using Random Forest
X = df.drop('HadHeartAttack', axis=1)  # Features
y = df['HadHeartAttack']  # Target variable
model = RandomForestClassifier()
model.fit(X, y)
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

# Mutual Information
mi = mutual_info_classif(X, y)
mi_df = pd.DataFrame({'Feature': X.columns, 'Mutual Information': mi})
print(mi_df.sort_values(by='Mutual Information', ascending=False))


In [None]:
inputs.columns

In [None]:
"""
Dieseas:

Angina
Stroke
Asthma
Diabetes
Skin Cancer
Arthritis
Depressive Disorder
Kidney Disease
COPD
"""

disease_features = [
    "HadAngina", "HadStroke", "HadAsthma", "HadSkinCancer", "HadCOPD", "HadDepressiveDisorder", "HadKidneyDisease", "HadArthritis", "HadDiabetes"
]

patient_id_features = [
    "PatientID", "State", "Sex", "AgeCategory", "RaceEthnicityCategory"
]

general_health_features = [
    "GeneralHealth", "HeightInMeters", "WeightInKilograms", "BMI"
]

disability_features = [
    "DeafOrHardOfHearing", "BlindOrVisionDifficulty", "DifficultyConcentrating", "DifficultyWalking", "DifficultyDressingBathing", "DifficultyErrands"
]

lifestyle_features = [
    "SmokerStatus", "ECigaretteUsage", "AlcoholDrinkers"
]

preventive_health_features = [
    "ChestScan", "HIVTesting", "FluVaxLast12", "PneumoVaxEver", "TetanusLast10Tdap", "HighRiskLastYear"
]

covid_features = [
    "CovidPos"
]



In [None]:
import pandas as pd
train_data_path = pd.read_csv("train_patient_descriptions.csv")
train_data_path.columns

In [1]:
from datasets import load_dataset

# Specify the files for train and test splits
data_files = {
    "train": "data/train.jsonl",
    "test": "data/test.jsonl"
}

# Load the dataset
dataset = load_dataset('json', data_files=data_files)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def generate_prompt(example):

    prefix_text = """Below is a patient report text that contains the information about patient's health confitions, it is followed up by a question. Answer the question as 'yes' or 'no'."""
    text = f"""<start_of_turn>user {prefix_text} \n <text>:\n {example["text"]} \n<end_of_text>\n <end_of_turn>\n<start_of_turn>model {example["label"]} <end_of_turn>"""
    return text

In [3]:
text_column = [generate_prompt(data_point) for data_point in dataset["train"]]
dataset = dataset["train"].add_column("prompt", text_column)
dataset

Dataset({
    features: ['id', 'text', 'label', 'prompt'],
    num_rows: 152126
})

In [4]:
print(dataset["prompt"][0])

<start_of_turn>user Below is a patient report text that contains the information about patient's health confitions, it is followed up by a question. Answer the question as 'yes' or 'no'. 
 <text>:
 The patient is a female with a BMI of 21.73, weighing 52.16 kg and standing 1.55 meters tall. The patient did not have angina, did not have a stroke, did not have COPD, did not have kidney disease, and did not have a depressive disorder. She does not have difficulty walking and does not have difficulty completing errands alone. The patient drinks alcohol. She did not have a chest scan and was not considered high risk last year. The patient did not have arthritis, has been tested for HIV, did not have asthma, and does not have difficulty concentrating. The patient did not have skin cancer, does not have vision difficulties, and does not have difficulty dressing or bathing. The patient did not test positive for COVID-19, has not received the pneumococcal vaccine, and does not have hearing diff

In [13]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

os.environ["HF_TOKEN"] = "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"

#set the qunatization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
#
#Load the model and Tokenizer
model_id = "google/gemma-2b-it"
#
# model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

In [14]:
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset here
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

Map: 100%|██████████| 152126/152126 [00:13<00:00, 11057.30 examples/s]


In [15]:
dataset

Dataset({
    features: ['id', 'text', 'label', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 152126
})

In [None]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
#
print(model)

#

lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
