In [2]:
from datasets import load_dataset

# Load the gzipped SQuAD-it train/test JSON files straight from GitHub,
# passing field="data" to the JSON loader.
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = dict(
    train=url + "SQuAD_it-train.json.gz",
    test=url + "SQuAD_it-test.json.gz",
)
squad_it_dataset = load_dataset(
    "json",
    data_files=data_files,
    field="data",
)


In [7]:
# shuffle() randomly reorders the rows; the fixed seed makes the draw
# reproducible across runs. select(range(5)) then keeps the first five rows.
sample = squad_it_dataset["train"].shuffle(seed=42).select(range(5))
print(sample)

Dataset({
    features: ['title', 'paragraphs'],
    num_rows: 5
})


In [11]:
# filter() keeps only the rows for which the predicate is true - here, the
# rows whose title starts with "L". The shuffle is seeded so that the five
# selected rows (and the example printed below) are reproducible.
sample2 = (
    squad_it_dataset["train"]
    .filter(lambda x: x["title"].startswith("L"))
    .shuffle(seed=42)
    .select(range(5))
)
print(sample2)
print(sample2[0])

Dataset({
    features: ['title', 'paragraphs'],
    num_rows: 5
})
{'title': 'Lavoro infantile', 'paragraphs': [{'context': "Il lavoro minorile si riferisce all' impiego di bambini in qualsiasi lavoro che priva i bambini della loro infanzia, interferisce con la loro capacità di frequentare una scuola regolare, e ciò è mentalmente, fisicamente, fisicamente, socialmente o moralmente pericoloso e dannoso. Questa pratica è considerata uno sfruttamento da molte organizzazioni internazionali. La legislazione mondiale vieta il lavoro minorile. Queste leggi non considerano tutti i lavori dei bambini come lavoro minorile; le eccezioni includono il lavoro dei bambini artisti, i doveri familiari, la formazione supervisionata, alcune categorie di lavoro come quelle dei bambini Amish, alcune forme di lavoro minorile comuni tra i bambini indigeni americani e altre.", 'qas': [{'answers': [{'answer_start': 80, 'text': 'priva i bambini della loro infanzia'}], 'id': '57275b0a5951b619008f88a7', 'questio

In [15]:
# Keep only the rows with a non-empty title. The predicate returns an explicit
# bool rather than relying on the truthiness of the integer length.
sample2 = squad_it_dataset["train"].filter(lambda x: len(x["title"]) > 0)
print(sample2)

Dataset({
    features: ['title', 'paragraphs'],
    num_rows: 442
})


In [1]:
from datasets import load_dataset

# Local train/test files of the drug review dataset.
data_files = dict(
    train="drugsComTrain_raw.tsv",
    test="drugsComTest_raw.tsv",
)
# The files are tab-separated; "\t" is the tab character in Python.
drug_dataset = load_dataset(
    "csv",
    data_files=data_files,
    delimiter="\t",
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [6]:
# Five Naproxen reviews from the training split, drawn with a seeded shuffle.
drug_sample = (
    drug_dataset["train"]
    .filter(lambda example: example["drugName"] == "Naproxen")
    .shuffle(seed=42)
    .select(range(5))
)
# Peek at the first few examples
drug_sample[:3]

{'Unnamed: 0': [87591, 87760, 87620],
 'drugName': ['Naproxen', 'Naproxen', 'Naproxen'],
 'condition': ['Period Pain', 'Muscle Pain', 'Osteoarthritis'],
 'review': ['"I have tried everything but only with this painkiller I can function on first day of my period. I still feel little pain but bearable. I can go to job. Until I discover this drug I was every month minimum one day in bed vomiting, screaming and crying. Other pills never helped this much."',
  '"I used Naproxen when I tore a tendon in my knee and the swelling would not go down. Naproxen works. It takes some time but you have to be on a regular dose and once you stop taking it, it goes back to the way it was before (at least in my case it did)."',
  '"This is an excellent pain medication as well as an anti inflammatory. I take 500 mg twice a day and had no adverse side effects."'],
 'rating': [10.0, 8.0, 10.0],
 'date': ['July 7, 2015', 'May 12, 2008', 'February 1, 2014'],
 'usefulCount': [24, 33, 62]}

In [7]:
# Sanity check: in every split, the "Unnamed: 0" column must hold as many
# distinct values as the split has rows.
for split, split_ds in drug_dataset.items():
    assert len(split_ds) == len(split_ds.unique("Unnamed: 0"))

In [8]:
# Give the unnamed index column a descriptive name in every split,
# then display the resulting DatasetDict.
drug_dataset = drug_dataset.rename_column("Unnamed: 0", "patient_id")
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [13]:
# Number of distinct drug names in the training split.
train_drug_names = drug_dataset["train"].unique("drugName")
len(train_drug_names)

3436

In [15]:
# Drop the examples whose "condition" is None.
drug_dataset = drug_dataset.filter(
    lambda example: example["condition"] is not None
)

Filter:   0%|          | 0/161297 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53766 [00:00<?, ? examples/s]

In [17]:
# Scratch check of str.lower(), the method lowercase_condition applies to the
# "condition" column.
"abc".lower()

'abc'

In [18]:
def lowercase_condition(example):
    """Return a ``{"condition": ...}`` update holding the lower-cased condition.

    ``example`` must map ``"condition"`` to a string; only that key is read.
    """
    lowered = example["condition"].lower()
    return {"condition": lowered}

In [19]:
# Run lowercase_condition over every example, replacing the "condition" column.
drug_dataset = drug_dataset.map(function=lowercase_condition)
# Check that lowercasing worked
drug_dataset["train"]["condition"][:3]

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

['left ventricular dysfunction', 'adhd', 'birth control']

In [21]:
def compute_review_length(example):
    """Return a ``{"review_length": n}`` update, n being the word count of the review.

    Words are the whitespace-separated pieces of ``example["review"]``.
    """
    words = example["review"].split()
    return {"review_length": len(words)}

In [22]:
# Add a "review_length" column by running compute_review_length on every example.
drug_dataset = drug_dataset.map(function=compute_review_length)
# Inspect the first training example
drug_dataset["train"][0]

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [23]:
# Sort the training split by review_length and show the first three rows.
train_by_length = drug_dataset["train"].sort("review_length")
train_by_length[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['hepatitis c', 'adhd', 'birth control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [24]:
# Keep only the examples whose review_length is greater than 30, then report
# the number of rows left in each split.
drug_dataset = drug_dataset.filter(
    lambda example: example["review_length"] > 30
)
print(drug_dataset.num_rows)

SyntaxError: invalid syntax (2036149770.py, line 2)

In [25]:
# Discard every example with a review_length of 30 or less and print the
# per-split row counts. Applying this to already-filtered data keeps the
# same rows.
drug_dataset = drug_dataset.filter(
    lambda example: 30 < example["review_length"]
)
print(drug_dataset.num_rows)

Filter:   0%|          | 0/160398 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53471 [00:00<?, ? examples/s]

{'train': 138514, 'test': 46108}


In [29]:
import html

# Pass each review through html.unescape, one example at a time.
drug_dataset = drug_dataset.map(
    lambda example: {"review": html.unescape(example["review"])}
)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

In [28]:
# Batched variant of the html.unescape pass: with batched=True the callable
# receives a list of reviews and returns the list of unescaped reviews.
new_drug_dataset = drug_dataset.map(
    lambda batch: {"review": list(map(html.unescape, batch["review"]))},
    batched=True,
)


Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

In [36]:
from transformers import AutoTokenizer

# Tokenizer for the "bert-base-cased" checkpoint, used by tokenize_function.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-cased",
)


def tokenize_function(examples):
    """Tokenize ``examples["review"]`` with the module-level ``tokenizer``.

    Truncation is enabled; the tokenizer's output is returned unchanged.
    """
    reviews = examples["review"]
    return tokenizer(reviews, truncation=True)

In [38]:
# Time the batched tokenization: batched=True hands tokenize_function batches
# of examples instead of single examples.
# NOTE(review): the recorded output shows a progress bar only for the
# 46108-row split, so this timing may not cover both splits - confirm.
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: total: 8.39 s
Wall time: 12.7 s


In [40]:
# Same batched call timed again; the recorded wall time is 46.4 ms with no
# progress bars (presumably the result was reused from a cache - confirm).
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

CPU times: total: 31.2 ms
Wall time: 46.4 ms


In [34]:
# Unbatched run for comparison: tokenize_function is applied one example at a
# time (recorded wall time: 1min 20s).
%time tokenized_dataset = drug_dataset.map(tokenize_function)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: total: 49.5 s
Wall time: 1min 20s


In [42]:
# Tokenizer for the same "bert-base-cased" checkpoint, loaded with
# use_fast=False; used by slow_tokenize_function.
slow_tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-cased",
    use_fast=False,
)


def slow_tokenize_function(examples):
    """Tokenize ``examples["review"]`` with the module-level ``slow_tokenizer``.

    Truncation is enabled; the tokenizer's output is returned unchanged.
    """
    reviews = examples["review"]
    return slow_tokenizer(reviews, truncation=True)
