## Environment Setup
**Make sure to set your runtime to use a GPU by going to `Runtime` -> `Change runtime type` -> `Hardware accelerator` -> `T4 GPU`**

In [None]:
# Colab-only bootstrap: mount Drive, install dependencies, and move into the
# course folder. Outside Colab this cell does nothing besides importing `sys`.
import sys
if 'google.colab' in sys.modules:  # True only when running inside Google Colab
    # Mount Google Drive so the data files stored there are accessible
    from google.colab import drive
    drive.mount('/content/drive')
    
    # Install the required packages (all pip output is discarded via `&> /dev/null`)
    !pip install transformers accelerate PyPDF2 &> /dev/null
    
    # Change working directory to the day_5 course folder on Drive
    %cd /content/drive/MyDrive/LLM4BeSci_GSERM2024/day_5

In [ ]:
import os
import PyPDF2
import pandas as pd
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch
from tqdm.notebook import tqdm_notebook as tqdm

## Load Data and Model

In [ ]:
# Load every PDF in the `articles` folder and extract its plain text.
# Uses the PyPDF2 >= 3.0 API (`PdfReader`, `.pages`): the older
# `PdfFileReader` / `numPages` / `getPage` names were removed in 3.0 and raise
# an error with the version an unpinned `pip install PyPDF2` provides.
articles = {}
for article in sorted(os.listdir('articles')):  # sorted -> reproducible row order
    if not article.endswith('.pdf'):
        continue  # Skip anything that is not a PDF
    # `with` guarantees the file handle is closed, even if parsing fails
    with open(f'articles/{article}', 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Concatenate the text of all pages; `or ''` is a defensive guard in
        # case a page yields no text (e.g. scanned, image-only pages)
        articles[article] = ''.join(
            page.extract_text() or '' for page in pdf_reader.pages
        )

# Convert to a DataFrame: one row per article (index = file name) with the
# extracted text in the `text` column
articles = pd.DataFrame.from_dict(articles, orient='index', columns=['text'])
articles

In [ ]:
# Fix the torch RNG seed so repeated runs are reproducible
torch.random.manual_seed(42)

# Hugging Face Hub identifier shared by the model and its tokenizer
checkpoint = "microsoft/Phi-3-mini-128k-instruct"

# Load the model weights onto the GPU
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="cuda",  # Place the whole model on the GPU
    torch_dtype=torch.float16,  # Half-precision weights
    trust_remote_code=True,  # Allow the model code shipped in the Hub repo to run
    # Standard (non-flash) attention implementation.
    # NOTE(review): presumably chosen because FlashAttention-2 is not
    # available on T4 GPUs — confirm against the model card.
    attn_implementation='eager',
)

# Load the matching tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [ ]:
# Wrap the loaded model and tokenizer in a text-generation pipeline
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# Keyword arguments passed to the pipeline on each generation call
generation_args = dict(
    max_new_tokens=10,       # Upper bound on the number of generated tokens
    return_full_text=False,  # Return only the completion, not the prompt
    do_sample=False,         # Greedy decoding (no sampling)
)

In [ ]:
# Ask the model the same question about every article and collect the replies.
question = ""  # Text prepended to each article before it is sent to the model
answers = []
for text in tqdm(articles['text']):
    # Chat-format prompt: a single user message holding the question and the article
    prompt = [{"role": "user", "content": question + text}]

    # Generate response and keep only the generated text of the first result
    answer = pipe(prompt, **generation_args)[0]['generated_text']

    # Store the reply; without this `answers` stays empty and the column
    # assignment below fails with a length mismatch
    answers.append(answer)

# One answer per article, in the same order as the rows of `articles`
articles['answer'] = answers
articles