In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer
# from dataset import llama_dataset

In [32]:
# 4-bit quantization settings: NF4 quant type, float16 compute dtype,
# double quantization turned off.
compute_dtype = getattr(torch, "float16")
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
# Local checkpoint path; the model and the tokenizer below are both loaded from it.
base_model = "/root/autodl-tmp/llm_training_outputs_update_wonhs/checkpoint-44000"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}  # presumably places the whole model on device 0 — confirm
)
# Config flags overridden after loading.
model.config.use_cache = False
model.config.pretraining_tp = 1

# NOTE(review): trust_remote_code=True is passed for the tokenizer only, not for the model.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
<s>[INST] <<SYS>>
{{ system_prompt }}
<</SYS>>
{{ user_msg_1 }} [/INST]{{ model_answer_1 }} </s><s>[INST]{{ user_msg_2 }} [/INST]{{ model_answer_2 }} </s><s>[INST]{{ user_msg_3 }} [/INST]

In [6]:
def chatbot_answer(question, history: list = None, system_prompt: str = None):
    """Build the chat-formatted prompt string for ``question``.

    Despite its name, this function does not call the model: it only
    assembles and returns the prompt text.

    Layout produced (no separators are inserted between the pieces):
    ``<s>[INST] <<SYS>>{system}<</SYS>>{msg_1}[/INST]{ans_1}</s><s>[INST]...{question}[/INST]``

    Args:
        question: The new user message (str).
        history: Optional iterable of ``(user_message, model_answer)`` string
            pairs from earlier turns, oldest first. ``None`` or an empty list
            means there are no earlier turns.
        system_prompt: Optional replacement for the built-in system prompt.
            ``None`` (the default) keeps the built-in text, so existing calls
            produce exactly the same string as before.

    Returns:
        str: The full prompt, ending with ``[/INST]``.
    """
    if system_prompt is None:
        # Built-in default, kept byte-for-byte (including its wording).
        system_prompt = "You are a helpful, respectful and honest health acknowledge assistant.\n\n If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."

    pieces = [f"<s>[INST] <<SYS>>{system_prompt}<</SYS>>"]
    if history is not None:
        for msg, ans in history:
            # Close the finished turn and open the next [INST] block.
            pieces.append(msg + "[/INST]" + ans + "</s><s>[INST]")
    pieces.append(question + "[/INST]")
    return "".join(pieces)

In [33]:
# Text-generation pipeline over the `model` / `tokenizer` objects defined in an earlier cell.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=512)
prompt = "What is the weather today"
# Disabled alternative: build a chat-formatted prompt that includes one earlier turn.
# question = chatbot_answer(prompt,[("What is Covid?", hs)])
# Instruction asking for a yes/no decision on whether `prompt` is healthcare related.
question = f"You are an expect classifier to classify healthcare and non-healthcare related questions. Please classify wether this question : {prompt} is healthcare related or not. Only answer yes or no."
# Disabled alternative: send the bare prompt wrapped in [INST] ... [/INST].
# result = pipe(f"<s>[INST] {prompt} [/INST]")
# NOTE(review): `question` is sent as plain text, without the [INST] ... [/INST]
# wrapper that chatbot_answer produces — confirm this is intended.
result = pipe(question)

print(result[0]['generated_text'])



You are an expect classifier to classify healthcare and non-healthcare related questions. Please classify wether this question : What is the weather today is healthcare related or not. Only answer yes or no. [/] no it is not healthcare related so i would classify it as non healthcare related and not expect you to answer it thank you for your help i am a new member and i am still learning how to navigate this site and i am not sure how to classify a question so i will just ask you to classify it for me thank you again for your help i am still learning how to navigate this site and i am not sure how to classify a question so i will just ask you to classify it for me thank you again for your help i am still learning how to navigate this site and i am not sure how to classify a question so i will just ask you to classify it for me thank you again for your help i am still learning how to navigate this site and i am not sure how to classify a question so i will just ask you to classify it fo

In [25]:
# Show the 'generated_text' field of the first pipeline result.
result[0]['generated_text']

"<s>[INST] <<SYS>>You are a helpful, respectful and honest health acknowledge assistant.\n\n If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.<</SYS>>What is Covid?[/INST]Covid-19 is a disease caused by a virus. It is a member of the coronavirus family. Covid-19 is a respiratory illness that can spread from person to person. It is not the same as the common cold or flu. Covid-19 is a serious disease that can cause severe illness. It can also cause death. Covid-19 spreads easily from person to person. It spreads through close contact with an infected person. It can also spread through droplets that are in the air. Covid-19 can also spread through contact with surfaces that have the virus on them. Covid-19 is not spread through food or drink. Covid-19 is not spread through pets. Covid-19 is not spread through mosqu</s><s>[INST]What i

In [34]:
a = result[0]['generated_text']
# Index of the last "[/INST]" marker in the text; str.rfind gives -1 when it is absent.
a.rfind("[/INST]")


-1

In [31]:
# Print only what follows the last "[/INST]" marker in `a`.
# The offset is looked up on the current text rather than hard-coded, so the
# cell stays correct when `a` changes between runs.
inst_marker = "[/INST]"
marker_pos = a.rfind(inst_marker)
# rfind returns -1 when the marker is absent; print the whole text in that case.
print(a[marker_pos + len(inst_marker):] if marker_pos != -1 else a)

The symptoms of Covid-19 are similar to those of other respiratory viruses. They include:
a cough
a fever
a runny nose
a sore throat
headache
muscle or body aches
tiredness
Some people who get Covid-19 have no symptoms at all. Others may have mild symptoms and don't know they have it.
The symptoms of Covid-19 can appear anywhere from 2 to 14 days after you're infected.
The symptoms of Covid-19 can be different in children. Children may have a runny nose, sneezing, a cough, and a fever. They may also have stomach problems, such as diarrhea or vomiting.
The symptoms of Covid-19 can be different in older adults. Older adults may have a cough, fever, and shortness of breath. They may also have confusion, disorientation, and hallucinations.
The symptoms of Covid-19


In [2]:
import xml.etree.ElementTree as ET
import os

def xml_parser(file_path, tag_path=".//Question"):
    """Return the text of every element matching ``tag_path`` in an XML file.

    Args:
        file_path: Path of the XML file, passed straight to ``ET.parse``.
        tag_path: ElementTree path expression selecting the elements to read.
            Defaults to ``".//Question"``, so existing calls keep returning
            the question texts; pass e.g. ``".//Answer"`` for other elements.

    Returns:
        list: ``element.text`` for each matching element, in the order
        ``findall`` yields them. An entry is ``None`` when the element has no
        text. The list is empty when nothing matches.
    """
    root = ET.parse(file_path).getroot()
    return [element.text for element in root.findall(tag_path)]

In [3]:
# Try the parser on a single MedQuAD XML file; the cell output is the returned list.
xml_parser("/root/autodl-tmp/dataset/MedQuAD/1_CancerGov_QA/0000001_1.xml")

['What is (are) Adult Acute Lymphoblastic Leukemia ?',
 'What are the symptoms of Adult Acute Lymphoblastic Leukemia ?',
 'How to diagnose Adult Acute Lymphoblastic Leukemia ?',
 'What is the outlook for Adult Acute Lymphoblastic Leukemia ?',
 'Who is at risk for Adult Acute Lymphoblastic Leukemia? ?',
 'What are the stages of Adult Acute Lymphoblastic Leukemia ?',
 'What are the treatments for Adult Acute Lymphoblastic Leukemia ?']

In [4]:
# Parse the same MedQuAD file by hand and keep its root element in `root`
# so the following cells can query it.
tree = ET.parse("/root/autodl-tmp/dataset/MedQuAD/1_CancerGov_QA/0000001_1.xml")
root = tree.getroot()
root

<Element 'Document' at 0x7f337ac3fc20>

In [8]:
# Print the text of every <Question> element found anywhere under `root`.
questions = root.findall(".//Question")
for question in questions:
    print(question.text)

What is (are) Adult Acute Lymphoblastic Leukemia ?
What are the symptoms of Adult Acute Lymphoblastic Leukemia ?
How to diagnose Adult Acute Lymphoblastic Leukemia ?
What is the outlook for Adult Acute Lymphoblastic Leukemia ?
Who is at risk for Adult Acute Lymphoblastic Leukemia? ?
What are the stages of Adult Acute Lymphoblastic Leukemia ?
What are the treatments for Adult Acute Lymphoblastic Leukemia ?


In [5]:
# Preview only the first <Answer> element under `root`, with newline characters removed.
answers = root.findall(".//Answer")
if answers:
    answer = answers[0]
    print(answer.text.replace("\n", ""))

Key Points                    - Adult acute lymphoblastic leukemia (ALL) is a type of cancer in which the bone marrow makes too many lymphocytes (a type of white blood cell).    - Leukemia may affect red blood cells, white blood cells, and platelets.    - Previous chemotherapy and exposure to radiation may increase the risk of developing ALL.    - Signs and symptoms of adult ALL include fever, feeling tired, and easy bruising or bleeding.     - Tests that examine the blood and bone marrow are used to detect (find) and diagnose adult ALL.    - Certain factors affect prognosis (chance of recovery) and treatment options.                                                    Adult acute lymphoblastic leukemia (ALL) is a type of cancer in which the bone marrow makes too many lymphocytes (a type of white blood cell).                    Adult acute lymphoblastic leukemia (ALL; also called acute lymphocytic leukemia) is a cancer of the blood and bone marrow. This type of cancer usually gets worse

In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer
from dataset import llama_dataset


import argparse

In [2]:
# Load the combined CSV through the `datasets` "csv" loader.
# NOTE(review): the recorded output of this cell is a DatasetGenerationError raised while
# generating the train split; the cause is not visible here.
dataset = load_dataset("csv", data_files="/root/autodl-tmp/dataset/combined_data.csv")
# Apply llama_dataset.transform_conversation via .map, then keep the 'train' split.
# Note that `dataset` is rebound to the mapped split.
dataset = dataset.map(llama_dataset.transform_conversation)['train']

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetGenerationError: An error occurred while generating the dataset

In [11]:
import pandas as pd
# Read the combined CSV with pandas so it can be inspected directly.
df = pd.read_csv("/root/autodl-tmp/dataset/combined_data.csv")

In [12]:
# Summarize the frame: row count, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44332 entries, 0 to 44331
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  44332 non-null  object
 1   answer    44332 non-null  object
dtypes: object(2)
memory usage: 692.8+ KB


In [6]:
# Display the frame; the notebook renders a truncated head/tail view.
# NOTE(review): the recorded row count here differs from the df.info() output, so this
# output appears to come from a different state of `df` — confirm by re-running in order.
df

Unnamed: 0,question,answer
0,What is (are) A guide to clinical trials for c...,
1,what research (or clinical trials) is being do...,
2,what research (or clinical trials) is being do...,
3,what research (or clinical trials) is being do...,
4,what research (or clinical trials) is being do...,
...,...,...
75361,felt a stinging pain then a sharp pain…fang marks,mix baking soda with water put paste on bite d...
75362,is creatine safe for a 9 year old child,i would not recommend or encourage using creat...
75363,what is the best skin care routine to prevent ...,use sun protection every day all year round pr...
75364,is garcinia mangostana safe to take with crestor,in general it is best to discuss with your the...


In [10]:
import pandas as pd
# Rebinds `df` to the MedQuAD CSV (replacing whatever frame it held before).
df = pd.read_csv("/root/autodl-tmp/dataset/string/MedQuAD.csv")
# Show the first 10 rows that contain no missing values; `df` itself is not modified.
df.dropna().head(10)

Unnamed: 0,question,answer
31029,What is (are) Adult Acute Lymphoblastic Leukem...,Key Points\n - Adult acute ...
31030,What are the symptoms of Adult Acute Lymphobla...,"Signs and symptoms of adult ALL include fever,..."
31031,How to diagnose Adult Acute Lymphoblastic Leuk...,Tests that examine the blood and bone marrow a...
31032,What is the outlook for Adult Acute Lymphoblas...,Certain factors affect prognosis (chance of re...
31033,Who is at risk for Adult Acute Lymphoblastic L...,Previous chemotherapy and exposure to radiatio...
31034,What are the stages of Adult Acute Lymphoblast...,Key Points\n - Once adult A...
31035,What are the treatments for Adult Acute Lympho...,Key Points\n - There are di...
31036,What is (are) Adult Acute Myeloid Leukemia ?,Key Points\n - Adult acute ...
31037,Who is at risk for Adult Acute Myeloid Leukemi...,"Smoking, previous chemotherapy treatment, and ..."
31038,What are the symptoms of Adult Acute Myeloid L...,"Signs and symptoms of adult AML include fever,..."


In [8]:
import pandas as pd
# Rebinds `df` to the mquad-v1 CSV (replacing whatever frame it held before).
df = pd.read_csv("/root/autodl-tmp/dataset/string/mquad-v1.csv")
# Summarize the frame: row count, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23802 entries, 0 to 23801
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   question       23802 non-null  object
 1   answer         23802 non-null  object
 2   Q_FFNN_embeds  23802 non-null  object
 3   A_FFNN_embeds  23802 non-null  object
dtypes: object(4)
memory usage: 743.9+ KB


In [None]:
# Display whichever frame `df` currently refers to (this cell has no recorded execution).
df