**Download and load dataset.**

In [3]:
# Download dataset locally
# !wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
# !unzip drugsCom_raw.zip

# Load dataset function
from datasets import load_dataset

data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
# \t is the tab character in Python
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

**Sample dataset randomly to check for data quality.**

In [4]:
# It is good practice to grab a random sample from the dataset. 
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
# Peek at the first few examples
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

In [5]:
# The Unnamed: 0 column looks suspiciously like an anonymized ID for each patient.
# The condition column includes a mix of uppercase and lowercase labels.
# The reviews are of varying length and contain a mix of Python line separators (\r\n) as well as HTML character codes like &\#039;.

# Let's fix it using Datasets library

**Rename column.**

In [6]:
# We first verify that the number of IDs matches the number of rows in each split:
for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))

In [7]:
# After verifying, we rename the Unnamed: 0 column to something a bit more interpretable. We can use 
# the DatasetDict.rename_column() function to rename the column across both splits in one go:

drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

**Turn all column entry into lowercase letters.**

In [8]:
# Next, let’s normalize all the condition labels using Dataset.map(). As we did with tokenization 
# in Chapter 3, we can define a simple function that can be applied across all the rows of each split in drug_dataset:
def lowercase_condition(example):
    return {"condition": example["condition"].lower()}

drug_dataset.map(lowercase_condition)

Map:   0%|          | 0/161297 [00:00<?, ? examples/s]

AttributeError: 'NoneType' object has no attribute 'lower'

In [9]:
# From the error we can infer that some of the entries in the condition column are None, 
# which cannot be lowercased as they’re not strings. Let’s drop these rows using Dataset.filter()

drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

In [10]:
# With the None entries removed, we can normalize our condition column:

drug_dataset = drug_dataset.map(lowercase_condition)
# Check that lowercasing worked
drug_dataset["train"]["condition"][:3]

['left ventricular dysfunction', 'adhd', 'birth control']

**Creating new columns.**

In [11]:
# Let’s define a simple function that counts the number of words in each review:
def compute_review_length(example):
    return {"review_length": len(example["review"].split())}

# When we map a non-existing column on the dataset, we create a new column "review_length" instead.
drug_dataset = drug_dataset.map(compute_review_length)
# Inspect the first training example
drug_dataset["train"][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [12]:
# We can sort this new column with Dataset.sort() to see what the extreme values look like:
drug_dataset["train"].sort("review_length")[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['hepatitis c', 'adhd', 'birth control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [13]:
# Let’s use the Dataset.filter() function to remove reviews that contain fewer than 30 words. 
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)

{'train': 138514, 'test': 46108}


In [14]:
# We can use Python’s html module to unescape HTML character codes, like so:
import html

drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})

**List Comprehensions** are usually ***faster*** than executing the same code in a ***for loop***, and we also gain some performance by accessing lots of elements at the same time instead of one by one.

In [15]:
# batching the inputs to our map() function through list comprehension. We send in a list of values
# and receive a list of values back:

new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True
)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

In [16]:
# To tokenize all the drug reviews with a fast tokenizer, we could use a function like this:
from transformers import AutoTokenizer # AutoTokenizer uses fast-tokenizer by default

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["review"], truncation=True)

In [17]:
# We can improve the run time by batching it and increasing the num_proc to 8 processes.
# Using num_proc to speed up your processing is usually a great idea, as long as the function you are using is not already doing some kind of multiprocessing of its own.
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True, num_proc=8)

CPU times: user 117 ms, sys: 90.2 ms, total: 207 ms
Wall time: 832 ms


With *Dataset.map()* and *batched=True* you can **change the number of elements in your dataset**. This is super useful in many situations where you want to **create several training features from one example**.

In [18]:
# Truncate the example to max. length of 128 but return all text chunks instead of just the first one
def tokenize_and_split(examples):
    return tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    
# See example
result = tokenize_and_split(drug_dataset["train"][0])

When you change the number of elements in a dataset you have to be careful when an example generates more features than its number. In this case we have overflowing tokens which will result in more features. Now we have mismatched column lengths. To deal with this we have to either remove the columns from the old dataset or make them the same size as they are in the new dataset.

In [23]:
# Removing columns
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names
)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

In [24]:
# Now this works without error. We can check that our new dataset has many more elements 
# than the original dataset by comparing the lengths:
len(tokenized_dataset["train"]), len(drug_dataset["train"])

(206772, 138514)

In [25]:
# Adding more columns. 
def tokenize_and_split(examples):
    result = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # Extract mapping between new and old indices
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 206772
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 68876
    })
})

Interacting with ***Pandas***, *NumPy*, *PyTorch*, *TensorFlow*, and *JAX*.

In [28]:
# Let's try converting our dataset to Pandas, 
# Now when we access elements of the dataset we get a pandas.DataFrame instead of a dictionary:
drug_dataset.set_format("pandas")
drug_dataset["train"][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89


In [29]:
#Lets create a pandas.DataFrame for the whole training set by selecting all the elements of drug_dataset["train"]:
train_df = drug_dataset["train"][:]

In [30]:
# From here we can use all the Pandas functionality that we want. 
# For example, we can do fancy chaining to compute the class distribution among the condition entries:
frequencies = (
    train_df["condition"]
    .value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={"index": "condition", "condition": "frequency"})
)
frequencies.head()

Unnamed: 0,frequency,count
0,birth control,27655
1,depression,8023
2,acne,5209
3,anxiety,4991
4,pain,4744


In [31]:
# And once we’re done with our Pandas analysis, we can always create a new Dataset object by using the Dataset.from_pandas() function as follows:
from datasets import Dataset

freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

Dataset({
    features: ['frequency', 'count'],
    num_rows: 819
})

In [32]:
# let’s create a validation set to prepare the dataset for training a classifier on. First we reset the format.
drug_dataset.reset_format()

**Creating a validation set**

In [33]:
# Create a train:test split
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Add the "test" set to our `DatasetDict`
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [34]:
# Saving dataset (save_to_disk, to_csv, to_json):
drug_dataset_clean.save_to_disk("drug-reviews")

Saving the dataset (0/1 shards):   0%|          | 0/110811 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/27703 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/46108 [00:00<?, ? examples/s]

In [35]:
# Loading dataset:
from datasets import load_from_disk

drug_dataset_reloaded = load_from_disk("drug-reviews")
drug_dataset_reloaded

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [None]:
# Use the techniques from Chapter 3 to train a classifier that can predict the patient condition based on the drug review.
# Use the summarization pipeline from Chapter 1 to generate summaries of the reviews.