In [2]:
import pandas as pd

In [None]:
# open the feather dataframe bhf.feather
df = pd.read_feather("./bhf.feather")
display(df.head())

# drop the links that are not from the bhf website
df = df[df.url.str.startswith("https://www.bhf.org.uk")]

# remove useless links
df = df[df.url.str.contains("/heart-matters-magazine") == False]
df = df[df.url.str.contains("/healthy-eating") == False]
df = df[df.url.str.contains("/publications") == False]

keep = ["https://www.bhf.org.uk/informationsupport/how-a-healthy-heart-works"]
remove = ["https://www.bhf.org.uk/informationsupport/conditions",
          "https://www.bhf.org.uk/informationsupport/conditions/az-of-heart-and-circulatory-diseases",
          "https://www.bhf.org.uk/informationsupport/support/cardiac-rehabilitation-at-home", 
          "https://www.bhf.org.uk/informationsupport/support/children-and-young-people"
]

urls = [i for i in df.url.to_list() if (i.split("https://www.bhf.org.uk/informationsupport")[1].count("/") > 1 and i not in remove) or i in keep]
df = df[df.url.isin(urls)]
df.shape

In [None]:
# only keep the text from the web page
from bs4 import BeautifulSoup

soups = [BeautifulSoup(i.decode("utf-8"), "html.parser") for i in df.content]
name = [soup.find("h1", {"itemprop":"headline"}).text for soup in soups]
content = []
for soup in soups:
    r = soup.find("section", {"class":"c-text-component"})
    if r is not None:
        content.append(r.text)
    else:
        content.append(None)

ds = pd.DataFrame({"name":name, "content":content, "url":df.url})
ds = ds.dropna()

In [None]:
t = []
for c, n, in zip(ds.content, ds.name):
    if c.startswith("\n"):
        t.append("Article title: " + n + c)
    else:
        t.append("Article title: " + n + "\n" + c)

ds["full"] = t

#### Test avec le contenu des pages web, non parsé

In [None]:
import re
import json
from sklearn.model_selection import train_test_split

train, test = train_test_split(ds["full"] ,test_size=0.15) 

print("Train dataset length: "+str(len(train)))
print("Test dataset length: "+ str(len(test)))

with open("train.txt", "w", encoding="utf-8") as f:
    f.write("#### START ARTICLE ####\n\n" + "\n#### END ARTICLE ####\n\n#### START ARTICLE ####\n\n".join(train.to_list()) + "\n#### END ARTICLE ####\n")

with open("test.txt", "w", encoding="utf-8") as f:
    f.write("#### START ARTICLE ####\n\n" + "\n#### END ARTICLE ####\n\n#### START ARTICLE ####\n\n".join(test.to_list()) + "\n#### END ARTICLE ####\n")


#### Test avec le dataset parsé manuellement, avec prompt

In [7]:
from sklearn.model_selection import train_test_split

ds = pd.read_feather("./dataset_manual.feather")

prompt = """### Question:
{question}

### Answer:
{answer}"""

data = []

for q, a in zip(ds.questions, ds.answers):
    if q.count("?") > 1:
        q = q.split("?")[1] + "?"
    data.append(prompt.format(question=q, answer=a))

train, test = train_test_split(data ,test_size=0.15)

print("Train dataset length: "+str(len(train)))
print("Test dataset length: "+ str(len(test)))

with open("train.txt", "w", encoding="utf-8") as f:
    f.write("\n### End Answer\n\n".join(train) + "\n### End Answer")

with open("test.txt", "w", encoding="utf-8") as f:
    f.write("\n### End Answer\n\n".join(test) + "\n### End Answer")

Train dataset length: 141
Test dataset length: 26


In [8]:
from transformers import AutoTokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [9]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)
     
    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset,test_dataset,data_collator = load_dataset("./train.txt","./test.txt",tokenizer)



In [26]:
ds.head()

Unnamed: 0,questions,answers,url,name
0,What is angina ?,Angina is a symptom. It’s a pain or an uncomfo...,https://www.bhf.org.uk/informationsupport/cond...,angina - cause symptoms treatments
1,What are the symptoms of angina?,"Angina usually feels like pressure, tightness ...",https://www.bhf.org.uk/informationsupport/cond...,angina - cause symptoms treatments
2,When should I get help if i feel like i have a...,If you’ve not been diagnosed with angina and y...,https://www.bhf.org.uk/informationsupport/cond...,angina - cause symptoms treatments
3,What types of angina are there?,There are several types of angina including:\n...,https://www.bhf.org.uk/informationsupport/cond...,angina - cause symptoms treatments
4,What is Stable angina?,This is the most common type of angina. It ten...,https://www.bhf.org.uk/informationsupport/cond...,angina - cause symptoms treatments


# Initialize `Trainer` with `TrainingArguments` and GPT-2 model

The [Trainer](https://huggingface.co/transformers/main_classes/trainer.html#transformers.Trainer) class provides an API for feature-complete training. It is used in most of the [example scripts](https://huggingface.co/transformers/examples.html) from Huggingface. Before we can instantiate our `Trainer` we need to download our GPT-2 model and create a [TrainingArguments](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments) to access all the points of customization during training. In the `TrainingArguments`, we can define the Hyperparameters we are going to use in the training process like our `learning_rate`, `num_train_epochs`, or  `per_device_train_batch_size`. A complete list can you find [here](https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments).

In [10]:
from transformers import Trainer, TrainingArguments

model = GPT2LMHeadModel.from_pretrained('gpt2')

training_args = TrainingArguments(
    output_dir="/Users/leovi/gpt2-medical", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=5, # number of training epochs
    per_device_train_batch_size=16, # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved 
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    )


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [11]:
trainer.train()



  0%|          | 0/335 [00:00<?, ?it/s]

{'train_runtime': 175.4045, 'train_samples_per_second': 30.358, 'train_steps_per_second': 1.91, 'train_loss': 2.807032525361474, 'epoch': 5.0}


TrainOutput(global_step=335, training_loss=2.807032525361474, metrics={'train_runtime': 175.4045, 'train_samples_per_second': 30.358, 'train_steps_per_second': 1.91, 'train_loss': 2.807032525361474, 'epoch': 5.0})

In [12]:
trainer.save_model()

# Test the model

To test the model we are going to use another [highlight of the transformers library](https://huggingface.co/transformers/main_classes/pipelines.html?highlight=pipelines) called `pipeline`. [Pipelines](https://huggingface.co/transformers/main_classes/pipelines.html?highlight=pipelines) are objects that offer a simple API dedicated to several tasks, among others also `text-generation`

In [24]:
from transformers import pipeline

bot = pipeline('text-generation', model='/Users/leovi/gpt2-medical', tokenizer='gpt2')



In [14]:
questions = pd.read_feather("./dataset_manual.feather")
questions.questions.head()

0                                     What is angina ?
1                     What are the symptoms of angina?
2    When should I get help if i feel like i have a...
3                      What types of angina are there?
4                               What is Stable angina?
Name: questions, dtype: object

In [21]:
def answer(question):
    print("\n" + bot(question)[0]["generated_text"].strip())

In [42]:
q = ds.questions.sample(1).iloc[0]
answer(prompt.format(question=q, answer=""))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




### Question:
What should I do before heart bypass surgery?

### Answer:

Your doctor must tell you when you may need surgery before you go for it. You may need surgery if:

your condition doesn't change


In [31]:
print(train)

['### Question:\nWhat happens during an angiogram?\n\n### Answer:\nThe test is done in a cardiac catheter laboratory or ‘cath lab’. You can expect the test to last around half an hour, although it can sometimes take longer. You will need to lie flat for the procedure.\n- You will be asked not to eat or drink anything for a few hours before your procedure.\n- You’ll be given a local anaesthetic injection in the wrist or groin. The catheter (a thin, flexible tube) will then be passed into an artery.\n- The catheter will be directed through your blood vessels and up to your heart. The doctors will use X-ray to help guide them to the arteries.\n- A special dye called contrast will then be passed through the catheter and a series of images will be taken. It is very common to feel a hot, flushing sensation as the dye enters your bloodstream, but this is completely normal and only lasts for a few seconds. The dye will show up any narrowed areas or blockages in the artery on the X-ray.\n- Duri

In [33]:
ds.questions.head()

0                                     What is angina ?
1                     What are the symptoms of angina?
2    When should I get help if i feel like i have a...
3                      What types of angina are there?
4                               What is Stable angina?
Name: questions, dtype: object