In [1]:
!pip install datasets evaluate transformers[sentencepiece]
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading eval

In [2]:
import plotly.express as px
import numpy as np
import pandas as pd
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import html
from collections import Counter

data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

# Preprocessing the dataset

drug_dataset = drug_dataset.remove_columns(["drugName", "rating", "usefulCount", "date"])
drug_dataset = drug_dataset.rename_column("Unnamed: 0", "patient_id")
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)
drug_dataset = drug_dataset.map(lambda x: {"condition": x["condition"].lower()})
drug_dataset = drug_dataset.map(lambda x: {"review_length": len(x["review"].split())})
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/161297 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53766 [00:00<?, ? examples/s]

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

Filter:   0%|          | 0/160398 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53471 [00:00<?, ? examples/s]

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

In [3]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'condition', 'review', 'review_length'],
        num_rows: 138514
    })
    test: Dataset({
        features: ['patient_id', 'condition', 'review', 'review_length'],
        num_rows: 46108
    })
})

In [4]:
condition_counts_train = Counter(drug_dataset["train"]["condition"])
condition_counts_test = Counter(drug_dataset["test"]["condition"])

drug_dataset["train"] = drug_dataset["train"].filter(lambda x: condition_counts_train[x["condition"]] >= 25)
drug_dataset["test"] = drug_dataset["test"].filter(lambda x: condition_counts_test[x["condition"]] >= 25)

# Koşulları azaltma
df_train = pd.DataFrame(drug_dataset["train"])
df_test = pd.DataFrame(drug_dataset["test"])

def downsample_condition(df, condition, max_samples=200):
    condition_df = df[df['condition'] == condition]
    if len(condition_df) > max_samples:
        condition_df = condition_df.sample(n=max_samples, random_state=42)
    return condition_df

downsampled_df_train_list = [downsample_condition(df_train, condition) for condition in Counter(df_train["condition"]).keys()]
downsampled_df_test_list = [downsample_condition(df_test, condition) for condition in Counter(df_test["condition"]).keys()]

downsampled_df_train = pd.concat(downsampled_df_train_list)
downsampled_df_test = pd.concat(downsampled_df_test_list)

drug_dataset["train"] = Dataset.from_pandas(downsampled_df_train)
drug_dataset["test"] = Dataset.from_pandas(downsampled_df_test)

Filter:   0%|          | 0/138514 [00:00<?, ? examples/s]

Filter:   0%|          | 0/46108 [00:00<?, ? examples/s]

In [5]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'condition', 'review', 'review_length', '__index_level_0__'],
        num_rows: 30295
    })
    test: Dataset({
        features: ['patient_id', 'condition', 'review', 'review_length', '__index_level_0__'],
        num_rows: 15656
    })
})

In [6]:
import plotly.express as px
import pandas as pd
from collections import Counter

# Count occurrences of each condition in the training set
condition_counts = Counter(drug_dataset["train"]["condition"])

# Create a DataFrame for plotting
condition_df = pd.DataFrame(condition_counts.items(), columns=["condition", "count"])

# Sort the DataFrame by count in descending order
condition_df = condition_df.sort_values(by="count", ascending=False)

# Plot the data
fig = px.bar(condition_df, x="condition", y="count", title="Condition Counts in the Dataset",
             labels={"condition": "Condition", "count": "Count"})
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

In [7]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'condition', 'review', 'review_length', '__index_level_0__'],
        num_rows: 30295
    })
    test: Dataset({
        features: ['patient_id', 'condition', 'review', 'review_length', '__index_level_0__'],
        num_rows: 15656
    })
})

In [8]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["review"], padding="max_length", truncation=True, max_length=128)

drug_dataset["train"] = drug_dataset["train"].map(tokenize_function, batched=True)
drug_dataset["test"] = drug_dataset["test"].map(tokenize_function, batched=True)

# Veri bölme
train_test_split = drug_dataset["train"].train_test_split(test_size=0.3, seed=42)
train_val_split = train_test_split["train"].train_test_split(test_size=0.2, seed=42)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/30295 [00:00<?, ? examples/s]

Map:   0%|          | 0/15656 [00:00<?, ? examples/s]

In [9]:
drug_dataset_clean = DatasetDict({
    "train": train_val_split["train"],
    "validation": train_val_split["test"],
    "test": train_test_split["test"]
})


In [10]:
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'condition', 'review', 'review_length', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16964
    })
    validation: Dataset({
        features: ['patient_id', 'condition', 'review', 'review_length', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4242
    })
    test: Dataset({
        features: ['patient_id', 'condition', 'review', 'review_length', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9089
    })
})

In [13]:
drug_dataset_clean = drug_dataset_clean.remove_columns(["patient_id", "review_length"])

In [14]:
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['condition', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16964
    })
    validation: Dataset({
        features: ['condition', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4242
    })
    test: Dataset({
        features: ['condition', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9089
    })
})

In [15]:
label_encoder = LabelEncoder()
label_encoder.fit(drug_dataset_clean["train"]["condition"])

def encode_labels(example):
    return {"labels": label_encoder.transform([example["condition"]])[0]}

drug_dataset_clean = drug_dataset_clean.map(encode_labels)


Map:   0%|          | 0/16964 [00:00<?, ? examples/s]

Map:   0%|          | 0/4242 [00:00<?, ? examples/s]

Map:   0%|          | 0/9089 [00:00<?, ? examples/s]

In [16]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_encoder.classes_))

# Eğitim argümanları
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=drug_dataset_clean["train"],
    eval_dataset=drug_dataset_clean["validation"],
    tokenizer=tokenizer,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

`evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead



In [None]:
drug_dataset_clean

In [17]:
trainer.train()

# Modeli değerlendirme
results = trainer.evaluate()

Epoch,Training Loss,Validation Loss
1,4.5401,4.106776
2,3.5528,3.352134
3,3.0804,3.114471


In [18]:
print(results)

{'eval_loss': 3.1144707202911377, 'eval_runtime': 31.1923, 'eval_samples_per_second': 135.995, 'eval_steps_per_second': 8.528, 'epoch': 3.0}


In [20]:
import torch
def predict_condition(review_text):
    # Tokenize the review text
    inputs = tokenizer(review_text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    inputs = {key: value.to(trainer.model.device) for key, value in inputs.items()}

    # Get the model's predictions
    with torch.no_grad():
        outputs = trainer.model(**inputs)
    logits = outputs.logits

    # Get the predicted label
    predicted_label_id = logits.argmax().item()
    predicted_label = label_encoder.inverse_transform([predicted_label_id])[0]

    return predicted_label

# Örnek bir inceleme metni ile tahmin yapma
example_review = "I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great"
predicted_condition = predict_condition(example_review)
print(f"Predicted condition: {predicted_condition}")

Predicted condition: ibromyalgia
