In [74]:
# Import dependencies
import requests
from bs4 import BeautifulSoup as bs

In [75]:
# Get request for the URL
url = "https://studentaffairs.jhu.edu/policies-guidelines/amnesty/"
response = requests.get(url)

# Check if the request was successful
response.status_code

200

In [76]:
# Parse HTML
soup = bs(response.text, "html.parser")
soup

# h2 p ol li ARTICLe

<!DOCTYPE html>
 <!--[if IE 8]><html class="ie8" lang="en"><![endif]--> <!--[if gt IE 8]><!--><html lang="en"><!--<![endif]--> <head> <meta content="IE=edge" http-equiv="X-UA-Compatible"/> <meta charset="utf-8"/> <meta content="width=device-width, initial-scale=1, minimum-scale=1" name="viewport"/> <title>Student Amnesty for Alcohol &amp; Drug Emergencies | Policies &amp; Guidelines</title> <meta content="index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1" name="robots"> <link href="https://studentaffairs.jhu.edu/policies-guidelines/amnesty/" rel="canonical"/> <meta content="en_US" property="og:locale"> <meta content="article" property="og:type"> <meta content="Student Amnesty for Alcohol &amp; Drug Emergencies" property="og:title"/> <meta content="Amnesty Protocol Safety is first and foremost. As a Hopkins community member, we expect students to demonstrate the utmost care and concern for others in matters of medical emergency and/or crisis. To encourage stud

In [77]:
# Find target elements
target = soup.find_all("article")
target

[<article aria-label="Main Content" class="main content__main" role="main"> <h1>Student Amnesty for Alcohol &amp; Drug Emergencies</h1> <h2>Amnesty Protocol</h2> <p>Safety is first and foremost. As a Hopkins community member, we expect students to demonstrate the utmost care and concern for others in matters of medical emergency and/or crisis. To encourage students to immediately seek necessary medical attention for themselves or others, the University will not impose disciplinary action of record for a violation of student alcohol or drug possession or consumption against individual students or Recognized Student Groups/Organizations when they report to or seek assistance from on-duty medical staff or law enforcement for a medical emergency or condition. The University will not impose disciplinary action of record for a violation of alcohol or drug consumption/use against the student who is subject of such medical emergency or condition.</p> <p>To initiate the Amnesty and Responsible 

In [78]:
content = []
for t in target:
    content.append(t.text)
    
# Split content into list of strings
content = content[0].split(".")

# Remove empty strings (Normalize Data)
content = [c for c in content if c != ""]
content


[' Student Amnesty for Alcohol & Drug Emergencies Amnesty Protocol Safety is first and foremost',
 ' As a Hopkins community member, we expect students to demonstrate the utmost care and concern for others in matters of medical emergency and/or crisis',
 ' To encourage students to immediately seek necessary medical attention for themselves or others, the University will not impose disciplinary action of record for a violation of student alcohol or drug possession or consumption against individual students or Recognized Student Groups/Organizations when they report to or seek assistance from on-duty medical staff or law enforcement for a medical emergency or condition',
 ' The University will not impose disciplinary action of record for a violation of alcohol or drug consumption/use against the student who is subject of such medical emergency or condition',
 ' To initiate the Amnesty and Responsible Action Protocol, you must:  Call for help: In the moment of witnessing a medical emergenc

In [79]:
# Put content in a QA format
qa_pairs = []

# Question is first item, answer is second item. For ex: index (0,1) is a pair, (1,2) is a pair, etc.
for i in range(len(content)-1):
    qa_pairs.append({"prompt": content[i], "completion": content[i+1]})

qa_pairs


[{'prompt': ' Student Amnesty for Alcohol & Drug Emergencies Amnesty Protocol Safety is first and foremost',
  'completion': ' As a Hopkins community member, we expect students to demonstrate the utmost care and concern for others in matters of medical emergency and/or crisis'},
 {'prompt': ' As a Hopkins community member, we expect students to demonstrate the utmost care and concern for others in matters of medical emergency and/or crisis',
  'completion': ' To encourage students to immediately seek necessary medical attention for themselves or others, the University will not impose disciplinary action of record for a violation of student alcohol or drug possession or consumption against individual students or Recognized Student Groups/Organizations when they report to or seek assistance from on-duty medical staff or law enforcement for a medical emergency or condition'},
 {'prompt': ' To encourage students to immediately seek necessary medical attention for themselves or others, the 

In [80]:
# Output to JSON
import json

for qa in qa_pairs:
    with open("qa_pairs.json", "a") as f:
        json.dump(qa, f)
        f.write("\n")

In [81]:
# Use bert to train the model
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertForQuestionAnswering, BertTokenizer

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# Define your custom dataset
class QADataset(Dataset):
    def __init__(self, file_path):
        self.data = []
        with open(file_path, 'r') as file:
            for line in file:
                example = eval(line)  # Parse the JSON-like line
                self.data.append(example)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data[idx]
        prompt = example['prompt']
        completion = example['completion']
        return prompt, completion

# Function to preprocess the dataset
def preprocess_dataset(dataset):
    input_ids = []
    attention_masks = []
    for prompt, completion in dataset:
        encoded = tokenizer.encode_plus(prompt, completion, add_special_tokens=True, truncation=True, max_length=512, padding='max_length')
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return torch.tensor(input_ids), torch.tensor(attention_masks)

# Load and preprocess the dataset
dataset = QADataset('qa_pairs.json')
input_ids, attention_masks = preprocess_dataset(dataset)

# Split the dataset into train and validation sets (you can modify this based on your needs)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_input_ids, val_input_ids = input_ids[:train_size], input_ids[train_size:]
train_attention_masks, val_attention_masks = attention_masks[:train_size], attention_masks[train_size:]

# Create data loaders
batch_size = 16
train_data = torch.utils.data.TensorDataset(train_input_ids, train_attention_masks)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

val_data = torch.utils.data.TensorDataset(val_input_ids, val_attention_masks)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Fine-tuning parameters
epochs = 3
learning_rate = 2e-5

# Set the model in training mode
model.train()

# Optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

# Fine-tuning loop
for epoch in range(epochs):
    train_loss = 0.0
    for batch in train_loader:
        batch_input_ids, batch_attention_masks = batch
        batch_labels = torch.zeros_like(batch_input_ids)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        # Compute the loss
        start_positions = torch.argmax(batch_labels, dim=1)
        end_positions = torch.argmax(batch_labels, dim=1)
        loss = (loss_fn(start_logits, start_positions) + loss_fn(end_logits, end_positions)) / 2.0

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        train_loss += loss.item()


    # Validate the model
    val_loss = 0.0
    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            batch_input_ids, batch_attention_masks = batch

            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks)
            loss = outputs.loss

            if loss is not None:
                val_loss += loss.item()

    print(f"Epoch {epoch+1}: Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")
    model.train()

# Save the fine-tuned model
model.save_pretrained("fine_tuned_bert")

# Inference example
question = "What happens when there is a failure to seek assistance?"
context = "Failure to seek appropriate assistance may constitute a violation of the Student Conduct Code."
encoded = tokenizer.encode_plus(question, context, add_special_tokens=True, truncation=True, max_length=512, padding='max_length', return_tensors='pt')
input_ids = encoded['input_ids']
attention_mask = encoded['attention_mask']

# Set the model in evaluation mode
model.eval()

# Forward pass for inference
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Get the predicted answer span
start_index = torch.argmax(start_logits)
end_index = torch.argmax(end_logits)
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[0][start_index:end_index+1]))

print("Question:", question)
print("Answer:", answer)



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

Epoch 1: Train Loss: 5.5085, Val Loss: 0.0000
Epoch 2: Train Loss: 3.4255, Val Loss: 0.0000
Epoch 3: Train Loss: 1.5749, Val Loss: 0.0000
Question: What happens when there is a failure to seek assistance?
Answer: [CLS]
