In [1]:
import pandas as pd
# Read csv data
df = pd.read_csv("mtsamples.csv")
df = df.drop(["Unnamed: 0"],axis=1)

In [None]:
# Import generic packages
import numpy as np 
from textwrap import fill
from IPython.display import Markdown, display 
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Import LLM model-related packages
import torch
from langchain import PromptTemplate
from langchain.llms import HuggingFacePipeline
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

In [4]:
# Specifies the path to the model
path = "./Mistral-7B-Instruct-v0.1/"
# Online loading model
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"

# Configure the quantitative parameters of the model
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Initialize the model's tokenizer
tokenizer = AutoTokenizer.from_pretrained(path, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# Initialize the model's tokenizer
model = AutoModelForCausalLM.from_pretrained(
    path, torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=quantization_config
)

# Configure parameters for text generation tasks
generation_config = GenerationConfig.from_pretrained(path)
generation_config.max_new_tokens = 1024 # maximum number of new tokens that can be generated by the model
generation_config.temperature = 0.7 # randomness of the generated tex
generation_config.top_p = 0.95 # diversity of the generated text
generation_config.do_sample = True # sampling during the generation process
generation_config.repetition_penalty = 1.15 # the degree to which the model should avoid repeating tokens in the generated text

# Create a text generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    generation_config=generation_config,
)

Loading checkpoint shards: 100%|███████████████████████████████████████| 2/2 [00:05<00:00,  2.85s/it]


In [5]:
# Initialize an LLM
llm = HuggingFacePipeline(pipeline=pipe)

In [18]:
# Since the age information is not in a single column, concatenate description and transcription
df['description_transcription'] = df[["description",'transcription']].apply(lambda x: 'Description: %s Transcription: %s'%(x[0],x[1]), axis=1)

In [77]:
# The first use of promot does not work well because there are too many requirements, and this language model will ignore some of the commands.
template = """[INST] Assuming you are a medical expert, extract the age information from text1 and the treatment information from text2. For age, simply output x years(or months, days) old. For treatment, you do not need to include the analysis process, just provide the results. If the patient received multiple treatments, please separate them with semicolons. Show the results like:

AGE: The age of the paitent.

TREATMENT: patient's treatments prescripted by a doctor

The text is: ```{text}```
[/INST]
"""

In [111]:
# Generates a promot class
prompt = PromptTemplate.from_template(template)

In [112]:
# test
result = llm(prompt.format(text=df["description_transcription"].values[1]))

KeyError: 'format_instructions'

In [80]:
# first test result
display(Markdown(f"<p>{result}</p>"))

<p>[INST] Assuming you are a medical expert, extract the age information from text1 and the treatment information from text2. For age, simply output x years(or months, days) old. For treatment, you do not need to include the analysis process, just provide the results. If the patient received multiple treatments, please separate them with semicolons. Show the results like:

AGE: The age of the paitent.

TREATMENT: patient's treatments prescripted by a doctor

The text is: ```Description:  Consult for laparoscopic gastric bypass. Transcription: PAST MEDICAL HISTORY:, He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to public seating, and lifting objects off the floor.  He exercises three times a week at home and does cardio.  He has difficulty walking two blocks or five flights of stairs.  Difficulty with snoring.  He has muscle and joint pains including knee pain, back pain, foot and ankle pain, and swelling.  He has gastroesophageal reflux disease.,PAST SURGICAL HISTORY:, Includes reconstructive surgery on his right hand 13 years ago.  ,SOCIAL HISTORY:, He is currently single.  He has about ten drinks a year.  He had smoked significantly up until several months ago.  He now smokes less than three cigarettes a day.,FAMILY HISTORY:, Heart disease in both grandfathers, grandmother with stroke, and a grandmother with diabetes.  Denies obesity and hypertension in other family members.,CURRENT MEDICATIONS:, None.,ALLERGIES:,  He is allergic to Penicillin.,MISCELLANEOUS/EATING HISTORY:, He has been going to support groups for seven months with Lynn Holmberg in Greenwich and he is from Eastchester, New York and he feels that we are the appropriate program.  He had a poor experience with the Greenwich program.  Eating history, he is not an emotional eater.  Does not like sweets.  He likes big portions and carbohydrates.  He likes chicken and not steak.  He currently weighs 312 pounds.  Ideal body weight would be 170 pounds.  He is 142 pounds overweight.  If ,he lost 60% of his excess body weight that would be 84 pounds and he should weigh about 228.,REVIEW OF SYSTEMS: ,Negative for head, neck, heart, lungs, GI, GU, orthopedic, and skin.  Specifically denies chest pain, heart attack, coronary artery disease, congestive heart failure, arrhythmia, atrial fibrillation, pacemaker, high cholesterol, pulmonary embolism, high blood pressure, CVA, venous insufficiency, thrombophlebitis, asthma, shortness of breath, COPD, emphysema, sleep apnea, diabetes, leg and foot swelling, osteoarthritis, rheumatoid arthritis, hiatal hernia, peptic ulcer disease, gallstones, infected gallbladder, pancreatitis, fatty liver, hepatitis, hemorrhoids, rectal bleeding, polyps, incontinence of stool, urinary stress incontinence, or cancer.  Denies cellulitis, pseudotumor cerebri, meningitis, or encephalitis.,PHYSICAL EXAMINATION:, He is alert and oriented x 3.  Cranial nerves II-XII are intact.  Afebrile.  Vital Signs are stable.```
[/INST]

AGE: Unknown due to lack of information provided in the input text.

TREATMENT: Laparoscopic gastric bypass</p>

In [121]:
# We define each piece of information to be extracted using ResponseSchema
age_schema = ResponseSchema(name="PATIENT AGE",
                             description="the age of the patient")
treatment_schema = ResponseSchema(name="TREATMENT",
                             description="patient's treatments prescripted by a doctor")
# list of schemas
response_schemas = [age_schema, treatment_schema]
# Instance of the parser
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [127]:
template = """[INST] For the following text describe in two lines the pacient situation including his age and treatment proposed by the doctor: 
PATIENT AGE: the age of the patient
TREATMENT: list the patient's treatments prescripted by a doctor
The texs is: ```{text}```
{format_instructions}
[/INST]
"""

In [137]:
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [138]:
prompt = PromptTemplate.from_template(template)

In [160]:
result = llm(prompt.format(text=df.loc[3,'description_transcription'],format_instructions=output_parser.get_format_instructions()))
output_dict = output_parser.parse(result.split("[/INST]")[-1])
output_dict

{'PATIENT AGE': 'Unknown',
 'TREATMENT': 'Echocardiogram with Doppler analysis showing left atrial enlargement with left atrial diameter of 4.7 cm., normal size right and left ventricle., normal LV systolic function with left ventricular ejection fraction of 51%, normal LV diastolic function., no pericardial effusion., normal morphology of aortic valve, mitral valve, tricuspid valve, and pulmonary valve., PA systolic pressure is 36 mmHg., mild mitral and tricuspid regurgitation., trace aortic and pulmonary regurgitation.'}

{'PATIENT AGE': None, 'TREATMENT': None}

In [155]:
import json
 
def is_json(myjson):
    try:
        json.loads(myjson)
    except ValueError:
        return False
    return True

In [None]:
from tqdm import tqdm
i = 53
while i < df.shape[0]:
    try:
        print(i)
        result = llm(prompt.format(text=df.loc[i,'description_transcription'],format_instructions=output_parser.get_format_instructions()))
        i+=1
        output_dict = output_parser.parse(result.split("[/INST]")[-1])
        age = output_dict["PATIENT AGE"]
        treatment = output_dict["TREATMENT"]
        df.loc[i,'patient_age'] = age
        df.loc[i,'treatment'] = treatment
        df.to_csv("./result.csv",index=False)
    except Exception as e:
        i = i-1
    

53
54
55
55
56
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
79
79
80
81
82
83
83
83
84
84
85
86
87
88
89
90
91
92
93
93
94
95
96
96
96
97
98
99
99
100


In [None]:
result = llm(prompt.format(text=df.loc[3,'description_transcription'],format_instructions=output_parser.get_format_instructions()))
output_dict = output_parser.parse(result.split("[/INST]")[-1])