<a href="https://colab.research.google.com/github/amuzetnoM/Wingman/blob/main/Project_Wingman.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## DEPENDENCIES

In [None]:
# Install dependencies
!pip install transformers
!pip install torch
!pip install torchvision
!pip install pypdf2
!pip install pdfplumber
!pip install -U --force-reinstall pillow
!pip install beautifulsoup4
!pip install textwrap
!pip install mosaicml
!pip install einops
!pip install --upgrade transformers

# Upload Data and Mention Websites


In [2]:
import os
import shutil
import urllib.request
from huggingface_hub import hf_hub_url



# Import required modules
from transformers import GPTJForCausalLM, GPT2Tokenizer
from PIL import Image
from IPython.display import display

# Set up directories
os.makedirs("context_data", exist_ok=True)
os.makedirs("raw_data", exist_ok=True)

# Display successful directory creation
print("Directories 'context_data' and 'raw_data' created successfully.")

# Request user to upload data
print("Please upload the necessary data files to the 'raw_data' directory.")

# Ask user to input up to 5 websites
websites = []
for i in range(5):
    website = input("Enter a website URL (leave blank to skip): ")
    if website:
        websites.append(website)
    else:
        break

# Display the list of websites
print("Websites entered:")
for website in websites:
    print(website)

Directories 'context_data' and 'raw_data' created successfully.
Please upload the necessary data files to the 'raw_data' directory.
Enter a website URL (leave blank to skip): https://www.clinicalanatomy.ca/
Enter a website URL (leave blank to skip): 
Websites entered:
https://www.clinicalanatomy.ca/


# PreProcessing & Training

In [3]:
from bs4 import BeautifulSoup

# Scrape data from the websites
scraped_data = []

for website in websites:
    try:
        # Open the URL
        with urllib.request.urlopen(website) as response:
            html = response.read()

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        # Extract the text content from the HTML
        text = soup.get_text()

        # Append the scraped text to the list
        scraped_data.append(text)

    except Exception as e:
        print(f"Error scraping data from {website}: {str(e)}")

# Display the scraped data
print("Scraped data:")
for data in scraped_data:
    print(data[:500])  # Display the first 100 characters of each scraped data

    # Save scraped data to text files
for i, data in enumerate(scraped_data):
    filename = f"website_{i}.txt"
    filepath = os.path.join("context_data", filename)
    with open(filepath, "w") as file:
        file.write(data)
    print(f"Scraped data saved to {filepath}")




Scraped data:






Clinical Anatomy | Home

















CLINICAL ANATOMY


VISIT OUR LATEST SECTION WITH NEW MODULES: Clinical Approaches













Head & Neck





Thorax





Upper Limb





Pelvis







Anatomy Videos





Clinical Approaches





Anatomical Illustrations





UBC Anatomy Labs







Back & Core





Abdomen





Lower Limb





Embryology



 



ABOUT US

This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License

NEUROANATOMY


Scraped data saved to context_data/website_0.txt


In [4]:
import os
import shutil

# Set the source and destination directories
source_dir = "/content"
destination_dir = "/content/raw_data"

# Get the list of files in the source directory
file_list = os.listdir(source_dir)

# Iterate over the files and move the PDF files to the destination directory
for file_name in file_list:
    if file_name.endswith(".pdf"):
        source_path = os.path.join(source_dir, file_name)
        destination_path = os.path.join(destination_dir, file_name)
        shutil.move(source_path, destination_path)
        print(f"Moved {file_name} to {destination_dir}")


Moved snell clinical anatomy by regions 9th ed 2012.pdf to /content/raw_data


In [5]:
# List all files in the raw_data folder
raw_data_folder = "raw_data"
files = os.listdir(raw_data_folder)

# Display files with their extensions
print("Files in the 'raw_data' folder:")
for file in files:
    filename, extension = os.path.splitext(file)
    print(f"{file} ({extension})")


Files in the 'raw_data' folder:
snell clinical anatomy by regions 9th ed 2012.pdf (.pdf)


In [6]:
# List only PDF files in the raw_data folder
raw_data_folder = "raw_data"
pdf_files = [file for file in os.listdir(raw_data_folder) if file.endswith(".pdf")]

# Display PDF files
print("PDF files in the 'raw_data' folder:")
for file in pdf_files:
    print(file)


PDF files in the 'raw_data' folder:
snell clinical anatomy by regions 9th ed 2012.pdf


In [7]:
import pdfplumber
import textwrap

# List only PDF files in the raw_data folder
raw_data_folder = "raw_data"
pdf_files = [file for file in os.listdir(raw_data_folder) if file.endswith(".pdf")]

# Convert each PDF file to text
converted_text = ""
for pdf_file in pdf_files:
    pdf_path = os.path.join(raw_data_folder, pdf_file)
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            converted_text += text

# Save the converted text to a single file
output_file = os.path.join("context_data", "converted.txt")
with open(output_file, "w") as file:
    file.write(converted_text)

# Display the first 500 words with text wrapping
words = converted_text.split()
wrapped_text = " ".join(words[:500])
wrapped_text = textwrap.fill(wrapped_text, width=80)  # Adjust the width as needed
print("First 500 words (with text wrapping):")
print(wrapped_text)


First 500 words (with text wrapping):
CLINICAL ANATOMY BY REGIONS N I N T H E D I T I O NC L I N I C AL A N A T O MY
BY R E G I O NS Richard ร. Snell, M.R.C.S., L.R.C.P., M.B., B.S., M.D., Ph.D.
Emeritus Professor of Anatomy (formerly Chairman of the Department of Anatomy)
George Washington University School of Medicine and Health Sciences Washington,
District of Columbia Previously Associate Professor of Anatomy and Medicine,
Yale University Medical School Lecturer in Anatomy, King's College, University
of London Visiting Professor of Anatomy, Harvard Medical SchoolAcquisitions
Editor: Crystal Taylor Product Manager: Julie Montalbano Marketing Manager: Joy
Fisher Williams Designer: Steve Druding Compositor: SPi Global 9th Edition
Copyright © 2012, 2008, 2004 Lippincott Williams & Wilkins, a Wolters Kluwer
business. 351 West Camden Street Two Commerce Square Baltimore, MD 21201 2001
Market Street Philadelphia, PA 19103 Printed in China All rights reserved. This
book is protected by cop

In [8]:
import os

# List all .txt files in the context_data folder
context_data_folder = "context_data"
txt_files = [file for file in os.listdir(context_data_folder) if file.endswith(".txt")]

# Combine the contents of the .txt files
combined_text = ""
for txt_file in txt_files:
    txt_path = os.path.join(context_data_folder, txt_file)
    with open(txt_path, "r") as file:
        text = file.read()
        combined_text += text

# Save the combined text to the master.txt file
output_file = os.path.join(context_data_folder, "master.txt")
with open(output_file, "w") as file:
    file.write(combined_text)

# Display success message
print(f"Combined text saved to {output_file}")


Combined text saved to context_data/master.txt


In [9]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

# Load the GPT2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add a new padding token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Read the contents of the master.txt file
file_path = "context_data/master.txt"
with open(file_path, "r") as file:
    text = file.read()

# Tokenize and preprocess the data
tokenized_text = tokenizer.encode(text, truncation=True, max_length=1024, padding=True, return_tensors="pt")

# Create a custom dataset
class TextDataset(Dataset):
    def __init__(self, tokenized_text):
        self.tokenized_text = tokenized_text

    def __len__(self):
        return len(self.tokenized_text)

    def __getitem__(self, idx):
        return self.tokenized_text[idx]

dataset = TextDataset(tokenized_text)

# Create a DataLoader for batching the data
batch_size = 4
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Model Training

In [12]:
# Load the GPT2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Fine-tune the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

for epoch in range(3):
    for batch in dataloader:
        inputs = batch.to(device)
        labels = batch.to(device)

        optimizer.zero_grad()

        outputs = model(inputs, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

# Save the trained model
output_dir = "trained_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('trained_model/tokenizer_config.json',
 'trained_model/special_tokens_map.json',
 'trained_model/vocab.json',
 'trained_model/merges.txt',
 'trained_model/added_tokens.json')

cutoff


# CHAT

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the trained model
model_path = "trained_model"
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Set the model to evaluation mode
model.eval()

# Chat interface
print("Welcome to the chat interface. Type 'exit' to end the conversation.")
while True:
    # Get user input
    user_input = input("User: ")

    # Check if user wants to exit
    if user_input.lower() == "exit":
        print("Chat ended.")
        break

    # Tokenize the user input
    input_ids = tokenizer.encode(user_input, return_tensors="pt")

    # Generate a response
    with torch.no_grad():
        response = model.generate(input_ids.to(model.device))

    # Decode and print the response
    generated_text = tokenizer.decode(response[0], skip_special_tokens=True)
    print("Response:")
    print(generated_text.strip())
    print("=" * 50)


Welcome to the chat interface. Type 'exit' to end the conversation.
User: what is clinical anatomy


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response:
what is clinical anatomy?

The anatomy of the brain is a complex and complex subject. The
User: again


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response:
again, I'm not sure if I'm going to be able to do it. I'm not
