In [1]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip" | mv drugsCom_raw.zip ../datasets/drugsCom_raw.zip

--2024-03-05 11:52:49--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘drugsCom_raw.zip’

drugsCom_raw.zip        [              <=>   ]  41.00M  13.2MB/s    in 3.3s    

2024-03-05 11:52:53 (12.6 MB/s) - ‘drugsCom_raw.zip’ saved [42989872]



In [55]:
!unzip ../datasets/drugsCom_raw.zip

Archive:  ../datasets/drugsCom_raw.zip
  inflating: drugsComTest_raw.tsv    
  inflating: drugsComTrain_raw.tsv   


In [56]:
from datasets import load_dataset
import os

# Folder that holds the extracted TSV files.
BASE_DIR = "../datasets/"

# Locations of the two raw splits shipped in the archive.
train_path, test_path = (
    os.path.join(BASE_DIR, filename)
    for filename in ("drugsComTrain_raw.tsv", "drugsComTest_raw.tsv")
)

# Split name -> file path, as expected by `load_dataset`.
data_files = dict(train=train_path, test=test_path)

# The files are tab-separated, so the generic CSV loader is given "\t" as delimiter.
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

In [57]:
### analyzing a random sample

# Shuffle the train split with a fixed seed, then keep its first 1000 rows.
drug_sample = (
    drug_dataset['train']
    .shuffle(seed=42)
    .select(range(1000))
)

In [58]:
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

Hypotheses for cleaning:
- The `Unnamed: 0` column looks like an anonymized patient ID
- The `condition` column has a mix of lowercase and uppercase labels
- The `review` column contains texts of varying length, with a mix of line separators (`\r\n`) and HTML character codes (e.g. `&#039;`)

In [59]:
# Check, split by split, that no value of 'Unnamed: 0' occurs twice.
for split_name, split_data in drug_dataset.items():
    n_distinct_ids = len(split_data.unique('Unnamed: 0'))
    assert n_distinct_ids == len(split_data)

In [60]:
### Every value is unique within its split, so the column is treated as an identifier and renamed
drug_dataset = drug_dataset.rename_column("Unnamed: 0", "patient_id")

In [61]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [62]:
### Trying it out - finding the number of unique drugs and conditions
def print_drugs_conditions(drug_dataset):
    """Print, for every split, how many distinct drugs and conditions it holds.

    Args:
        drug_dataset: mapping of split name -> dataset whose ``unique(column)``
            method returns the distinct values of the 'drugName' and
            'condition' columns.

    Returns:
        None. The counts are written to stdout, two lines per split.
    """
    for split_name, split_data in drug_dataset.items():
        n_drugs = len(split_data.unique('drugName'))
        n_conditions = len(split_data.unique('condition'))

        print(f"Unique drugs in {split_name}: {n_drugs}")
        print(f"Unique conditions in {split_name}: {n_conditions}")


In [63]:
print_drugs_conditions(drug_dataset)

Unique drugs in train: 3436
Unique conditions in train: 885
Unique drugs in test: 2637
Unique conditions in test: 709


In [64]:
### lowercasing the conditions
def lowercase_condition(example):
    """Return a column update with the example's condition in lowercase.

    Args:
        example: mapping with a string under the 'condition' key.

    Returns:
        dict with the single key 'condition' holding the lowercased text.

    Raises:
        AttributeError: if the condition has no ``lower`` method (e.g. it is None).
    """
    condition_lower = example['condition'].lower()
    return dict(condition=condition_lower)

# drug_dataset.map(lowercase_condition)

In [65]:
### Dropping the rows having condition = None
def filter_nones(x):
    """Predicate for ``filter``: True when the row has a condition, False when it is None.

    Args:
        x: mapping with a 'condition' key.

    Returns:
        bool. Only ``None`` is rejected; an empty string still counts as present.
    """
    condition = x["condition"]
    return condition is not None

# `filter_nones` already has the (row) -> bool shape that `filter` expects,
# so it is passed directly instead of being wrapped in a pass-through lambda.
filtered_drugs = drug_dataset.filter(filter_nones)

In [66]:
# `filtered_drugs` no longer contains rows whose condition is None,
# so calling .lower() inside `lowercase_condition` is safe for every example.
lowercased_drugs = filtered_drugs.map(lowercase_condition)

In [67]:
lowercased_drugs

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53471
    })
})

In [68]:
print_drugs_conditions(lowercased_drugs)

Unique drugs in train: 3431
Unique conditions in train: 884
Unique drugs in test: 2635
Unique conditions in test: 708


In [69]:
### lambda try-outs
lambda x: x * x

<function __main__.<lambda>(x)>

In [70]:
(lambda x: x * x)(3)

9

In [71]:
(lambda base, height: 0.5 * base * height)(4, 8)

16.0

In [72]:
### eliminating the None values using lambda
# Drop rows without a condition first, then lowercase what is left.
drug_dataset = (
    drug_dataset
    .filter(lambda row: row["condition"] is not None)
    .map(lowercase_condition)
)

In [73]:
drug_dataset['train']['condition'][:3]

['left ventricular dysfunction', 'adhd', 'birth control']

In [74]:
### Cleaning up the reviews


# `review_length` is not an existing column - returning it from the mapped
# function creates the column automatically
def compute_review_length(example):
    """Count the whitespace-separated words of a review.

    Args:
        example: mapping with the review text under the 'review' key.

    Returns:
        dict with the single key 'review_length' holding the word count.
    """
    words = example['review'].split()
    return {'review_length': len(words)}

drug_dataset = drug_dataset.map(compute_review_length)

In [75]:
drug_dataset['train'][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [76]:
drug_dataset['train'].sort('review_length')[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['hepatitis c', 'adhd', 'birth control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [77]:
### keeping only the reviews longer than 30 words
### (`review_length` counts whitespace-separated words, not characters)

drug_dataset = drug_dataset.filter(lambda x: x['review_length'] > 30)
print(drug_dataset.num_rows)

{'train': 138514, 'test': 46108}


In [78]:
### Maximum length reviews
# drug_dataset.sort('review_length', reverse=True)[:3]


drug_dataset.set_format("pandas")

In [79]:
train_df = drug_dataset['train'][:]

In [80]:
train_df['review_length'].max()

1894

In [81]:
# Restore the default output format first. `train_df` was materialised by the
# earlier slice, so it does not depend on the dataset's current format.
drug_dataset.reset_format()

# Kept as the last expression so that Jupyter actually displays the minimum;
# previously it sat above `reset_format()` and its value was never shown.
train_df['review_length'].min()

In [82]:
### decoding HTML character references (e.g. &#039;) - this does not strip HTML tags

import html

text = "I&#039;m a transformer called BERT"
# `html.unescape` turns the numeric reference &#039; back into an apostrophe
html.unescape(text)

"I'm a transformer called BERT"

In [85]:
# using map api
# Applies `html.unescape` to one review at a time (no batching) and overwrites the "review" column
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x['review'])})

Map: 100%|██████████| 138514/138514 [00:23<00:00, 5933.83 examples/s]
Map: 100%|██████████| 46108/46108 [00:07<00:00, 6057.50 examples/s]


In [86]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 138514
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [87]:
### The superpowers of "Map" method

### Using "batched" processing with list comprehensions

# With batched=True the lambda receives a dict of lists, so every review in the
# batch is unescaped in one call.
new_drug_dataset1 = drug_dataset.map(
    lambda batch: {"review": list(map(html.unescape, batch['review']))},
    batched=True,
)

Map: 100%|██████████| 138514/138514 [00:00<00:00, 222524.83 examples/s]
Map: 100%|██████████| 46108/46108 [00:00<00:00, 227918.58 examples/s]


DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 138514
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [90]:
### Using fast tokenizers
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

def tokenizer_function(examples):
    """Tokenize the 'review' field with the module-level `tokenizer`, truncation enabled.

    Args:
        examples: mapping with the review text(s) under the 'review' key; the
            notebook calls it both with single examples and with batches.

    Returns:
        Whatever the tokenizer call returns for the given review(s).
    """
    reviews = examples['review']
    return tokenizer(reviews, truncation=True)

In [91]:
drug_dataset['train'][0]['review']

'"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effective."'

In [92]:
%time tokenized_dataset = drug_dataset.map(tokenizer_function, batched=True)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map: 100%|██████████| 138514/138514 [00:34<00:00, 3970.62 examples/s]
Map: 100%|██████████| 46108/46108 [00:11<00:00, 3948.31 examples/s]

CPU times: user 1min 8s, sys: 682 ms, total: 1min 9s
Wall time: 46.6 s





In [93]:
%time tokenized_not_batched = drug_dataset.map(tokenizer_function)

Map: 100%|██████████| 138514/138514 [01:10<00:00, 1953.95 examples/s]
Map: 100%|██████████| 46108/46108 [00:25<00:00, 1828.79 examples/s]

CPU times: user 1min 31s, sys: 984 ms, total: 1min 32s
Wall time: 1min 36s





In [95]:
### Slow tokenizers will be even slower (AutoTokenizer.from_pretrained(use_fast=False))

In [99]:
### Running fast tokenizer with batching and parallelism
%time tokenized_full_fledged = drug_dataset.map(tokenizer_function, batched=True, num_proc=2)

Map (num_proc=2):   0%|          | 0/138514 [00:00<?, ? examples/s]

Map (num_proc=2): 100%|██████████| 138514/138514 [00:33<00:00, 4193.58 examples/s]
Map (num_proc=2): 100%|██████████| 46108/46108 [00:10<00:00, 4242.22 examples/s]


CPU times: user 483 ms, sys: 201 ms, total: 683 ms
Wall time: 44.1 s


In [100]:
### Extracting multiple features from the same example

def tokenize_and_split(examples):
    """Tokenize reviews with a 128-token limit, returning the overflow as well.

    Args:
        examples: mapping with the review text(s) under the 'review' key.

    Returns:
        The tokenizer output; because `return_overflowing_tokens=True`, a single
        review may contribute more than one entry to 'input_ids'.
    """
    tokenizer_options = dict(
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    return tokenizer(examples['review'], **tokenizer_options)

In [101]:
# Tokenize one review and show the length of every chunk it was split into.
result = tokenize_and_split(drug_dataset['train'][0])
list(map(len, result['input_ids']))

[128, 49]

In [103]:
# removing old columns
# One review can produce several tokenized rows, so the original columns
# (one value per review) would no longer line up with the output rows;
# every pre-existing column is therefore dropped.
tokenized_dataset = drug_dataset.map(
    tokenize_and_split,
    batched=True,
    remove_columns=drug_dataset['train'].column_names
)

Map: 100%|██████████| 138514/138514 [00:44<00:00, 3088.32 examples/s]
Map: 100%|██████████| 46108/46108 [00:14<00:00, 3176.87 examples/s]


In [106]:
len(tokenized_dataset['train']), len(drug_dataset['train'])

(206772, 138514)

In [107]:
### overflow to sample mapping - associating values in the new dataset with old dataset rows - index

def tokenize_and_split(example):
    """Tokenize reviews into chunks of at most 128 tokens and keep the original columns.

    Intended for `map(..., batched=True)`: every value in `example` is indexed
    by sample position, so it must be a sequence with one entry per review.

    Args:
        example: mapping of column name -> per-review values, including 'review'.

    Returns:
        The tokenizer output, extended with every input column repeated once
        per chunk produced from the corresponding review.
    """
    encoded = tokenizer(
        example['review'],
        max_length=128,
        truncation=True,
        return_overflowing_tokens=True,
    )

    # For each produced chunk, the index of the review it came from.
    source_rows = encoded.pop("overflow_to_sample_mapping")

    # Repeat the original values so each chunk carries its source row's data.
    for column, column_values in example.items():
        encoded[column] = [column_values[row] for row in source_rows]

    return encoded

In [108]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

Map: 100%|██████████| 138514/138514 [00:46<00:00, 2979.80 examples/s]
Map: 100%|██████████| 46108/46108 [00:15<00:00, 2934.19 examples/s]


In [109]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 206772
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 68876
    })
})

### Creating a validation set

In [112]:
# 80/20 split of the training data with a fixed seed.
drug_dataset_clean = drug_dataset['train'].train_test_split(seed=42, train_size=0.8)

# The held-out part becomes "validation" (popped first, so the key is free again),
# then the original test split is attached as "test".
drug_dataset_clean.update(
    validation=drug_dataset_clean.pop("test"),
    test=drug_dataset['test'],
)
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

### Save'n'load

In [119]:
# Persist all three splits (train / validation / test) so they can be reloaded later with `load_from_disk`
drug_dataset_clean.save_to_disk('../datasets/drug_arrow_datasets')

Saving the dataset (1/1 shards): 100%|██████████| 110811/110811 [00:01<00:00, 75038.00 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 27703/27703 [00:00<00:00, 69480.08 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 46108/46108 [00:00<00:00, 316887.88 examples/s]


In [120]:
from datasets import load_from_disk

drug_datasets = load_from_disk('../datasets/drug_arrow_datasets/')

In [None]:
### IF CSVs - use these
# for split, dataset in drug_datasets.items():
#     dataset.to_csv(f"my-dataset-{split}.csv", index=None)

# data_files = {
#     "train": "my-dataset-train.csv",
#     "validation": "my-dataset-validation.csv",
#     "test": "my-dataset-test.csv"
# }

# csv_datasets_reloaded = load_dataset("csv", data_files=data_files)
# csv_datasets_reloaded

In [121]:
# The same for JSON and PARQUET formats