In [1]:
  # Transformers installation
# Installs the Hugging Face libraries used throughout this notebook.
# NOTE(review): none of these packages are version-pinned, so a fresh run may
# resolve different releases; the captured output of this cell shows
# datasets 2.15.0 and evaluate 0.4.1 being downloaded.
! pip install transformers datasets evaluate
# -U asks pip to upgrade accelerate to the newest available release.
! pip install accelerate -U
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading r

## Preprocess

In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the prepared dataset and hold out 10% of the rows for evaluation.
df = pd.read_csv('cleaned_shuffled_dataset.csv')
train_ratio = 0.9
# A fixed random_state keeps the split identical from one run to the next.
train_df, test_df = train_test_split(
    df,
    train_size=train_ratio,
    test_size=1 - train_ratio,
    random_state=42,
)

The next step is to load a DistilBERT tokenizer to preprocess the `text` field:

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Create a preprocessing function to tokenize `text` and truncate sequences to be no longer than DistilBERT's maximum input length:

In [4]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled.

    Args:
        examples: A mapping with a "text" entry (a batch of rows when used
            with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [5]:
from datasets import Dataset

# Wrap the pandas train/test splits as Hugging Face Datasets and rename the
# target column 'heart_disease_detected' to 'label'.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame).rename_column('heart_disease_detected', 'label')
    for frame in (train_df, test_df)
)

# Update the 'label' values
def map_text_to_integers(example):
    """Convert a textual ``label`` ('no' / 'yes') into its class id (0 / 1).

    Matching ignores letter case and surrounding whitespace, so variants such
    as 'Yes' or ' NO ' are converted too instead of silently staying strings
    (which would leave the label column with mixed types). Any other value,
    including labels that are already integers, is left untouched.

    Args:
        example: A single dataset row (a mapping) that contains a 'label' key.

    Returns:
        The same ``example`` object, with 'label' replaced in place when it
        matched one of the known strings.
    """
    label = example['label']
    # Only strings can be normalized; ints and other types pass straight through.
    if isinstance(label, str):
        normalized = label.strip().lower()
        if normalized == 'no':
            example['label'] = 0
        elif normalized == 'yes':
            example['label'] = 1
    return example

# Convert the textual labels of both splits to integer class ids.
train_dataset, test_dataset = [
    split.map(map_text_to_integers) for split in (train_dataset, test_dataset)
]

# Show the columns and row count of the converted training split.
print(train_dataset)

Map:   0%|          | 0/1376 [00:00<?, ? examples/s]

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Dataset({
    features: ['index', 'text', 'label', '__index_level_0__'],
    num_rows: 1376
})


In [6]:
# Tokenize both splits; batched=True makes map() call preprocess_function
# on batches of rows rather than on one row at a time.
tokenized_train_dataset, tokenized_test_dataset = (
    train_dataset.map(preprocess_function, batched=True),
    test_dataset.map(preprocess_function, batched=True),
)

Map:   0%|          | 0/1376 [00:00<?, ? examples/s]

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

In [7]:
from transformers import DataCollatorWithPadding

# Collator passed to the Trainer below; it uses the same tokenizer to pad the
# examples of each batch (preprocess_function only truncates, it does not pad).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluate

In [8]:
import evaluate

# Load the "accuracy" metric from the evaluate library; compute_metrics calls
# its .compute() method at evaluation time.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Then create a function that passes your predictions and labels to [compute](https://huggingface.co/docs/evaluate/main/en/package_reference/main_classes#evaluate.EvaluationModule.compute) to calculate the accuracy:

In [9]:
import numpy as np


def compute_metrics(eval_pred):
    """Score a batch of model outputs with the module-level ``accuracy`` metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``; ``predictions`` holds one
            row of per-class scores per example.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of every row.
    """
    scores, references = eval_pred
    # The highest-scoring column along axis 1 is the predicted class id.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

## Train

In [10]:
# Class id -> display name, passed to the model config. The reverse mapping is
# derived from it so the two always agree.
id2label = dict(enumerate(("NO", "YES")))
label2id = {name: class_id for class_id, name in id2label.items()}

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Start from the pretrained DistilBERT checkpoint with one output per label;
# the label mappings are stored in the model config and reused at inference.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Training configuration: 8 epochs, batch size 32 for both training and
# evaluation, with an evaluation pass and a checkpoint at the end of every epoch.
# NOTE(review): load_best_model_at_end is set without metric_for_best_model, so
# "best" presumably falls back to the library default (evaluation loss per the
# TrainingArguments docs) rather than accuracy - confirm that is intended.
# NOTE(review): `evaluation_strategy` is reportedly renamed to `eval_strategy`
# in later transformers releases; verify against the installed version.
training_args = TrainingArguments(
    output_dir="seq_model",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# NOTE(review): the held-out test split doubles as the evaluation set, so the
# per-epoch accuracy drives checkpoint selection and is also the only
# performance estimate; there is no separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Runs fine-tuning; as the cell's last expression, the returned TrainOutput is displayed.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.269067,0.882353
2,No log,0.234206,0.888889
3,No log,0.261562,0.901961
4,No log,0.296553,0.895425
5,No log,0.342391,0.915033
6,No log,0.40052,0.908497
7,No log,0.372178,0.915033
8,No log,0.368254,0.915033


TrainOutput(global_step=344, training_loss=0.10577312735624092, metrics={'train_runtime': 547.2855, 'train_samples_per_second': 20.114, 'train_steps_per_second': 0.629, 'total_flos': 1458201124405248.0, 'train_loss': 0.10577312735624092, 'epoch': 8.0})

## Inference

In [13]:
from transformers import AutoModelForSequenceClassification
import torch
from transformers import AutoTokenizer

# Reload tokenizer and model weights from the checkpoint saved at step 344,
# rebinding the `tokenizer` and `model` names used during training.
# NOTE(review): 344 equals the global_step reported by the training run, i.e.
# the final-epoch checkpoint - not necessarily the one load_best_model_at_end
# selected; confirm this is the intended model.
# NOTE(review): the absolute /content/ prefix presumably assumes a Colab working
# directory, while training wrote to the relative "seq_model" - verify elsewhere.
tokenizer = AutoTokenizer.from_pretrained("/content/seq_model/checkpoint-344")

model = AutoModelForSequenceClassification.from_pretrained("/content/seq_model/checkpoint-344")

In [14]:
# Input CSV to classify and the destination for the file with predictions.
# The inference loop below reads the "text" column of this file.
input_csv_path = "/content/test_dataset1.csv"
output_csv_path = "/content/predicted_test_dataset1.csv"

# Relies on `pd` imported in the preprocessing cell at the top of the notebook.
data = pd.read_csv(input_csv_path)

In [15]:
# Predicted label name ("NO" / "YES") for every input row, in row order.
predicted_texts = []

# Classify each text individually (batch size 1, no padding needed).
for text in data["text"]:
    # truncation=True mirrors the preprocessing used for training and keeps
    # over-long texts within the model's maximum input length; without it such
    # inputs make the forward pass fail.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)

    # Inference only: no gradients required.
    with torch.no_grad():
        logits = model(**inputs).logits
    # Arg-max over the class dimension of the single-row logits.
    predicted_class_id = logits.argmax(dim=-1).item()
    predicted_texts.append(model.config.id2label[predicted_class_id])

# Store the predictions in their own column instead of overwriting "text":
# the input stays intact, re-running this cell gives the same result, and the
# column name matches the manual-sample cell that follows.
data["predicted_text"] = predicted_texts

# Save the DataFrame to the output CSV
data.to_csv(output_csv_path, index=False)

In [16]:
# Manual input of text samples
text_samples = [
    "Normal ef is 55-70%. That means that 57% of the total amount of blood in your left ventricle is pumped out with each heartbeat, which is really good. .... I?m a 44 year old male in good shape with an ejection fraction of 29%, which is really low. it has difficulty pumping efficiently causing severe shortness of breath, fatigue and a host of other symptoms. I also have a faint pulse known as Bigeminy. Sometimes when sleeping my heart rate drops as low as 29 beats per minute",
    "my tounge and mouth feel hurt and i can't eating properly",
    "I'm fine"
]

# Predicted label name ("NO" / "YES") for every sample, in order.
predicted_texts = []

# Classify each sample individually (batch size 1, no padding needed).
for text in text_samples:
    # truncation=True mirrors the preprocessing used for training and keeps
    # over-long samples within the model's maximum input length.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)

    # Inference only: no gradients required.
    with torch.no_grad():
        logits = model(**inputs).logits
    # Arg-max over the class dimension of the single-row logits.
    predicted_class_id = logits.argmax(dim=-1).item()
    predicted_texts.append(model.config.id2label[predicted_class_id])

# Note: this rebinds `data`, replacing the DataFrame loaded from the input CSV.
data = pd.DataFrame({"text": text_samples, "predicted_text": predicted_texts})

print(data)

                                                text predicted_text
0  Normal ef is 55-70%. That means that 57% of th...            YES
1  my tounge and mouth feel hurt and i can't eati...             NO
2                                           I'm fine             NO
