In [1]:
# Human Anatomy Chatbot with AI - Jupyter Notebook


In [2]:
# Import Required Libraries
import json
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from flask import Flask, request, jsonify


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Dataset Loading and Preprocessing
def load_and_prepare_dataset(folder_path):
    """Load every ``.jsonl`` file in a folder into a single DataFrame.

    Each non-blank line of each file is parsed as one JSON record. A
    ``contents`` column is then added by joining the ``title`` and
    ``content`` columns with a single space.

    Args:
        folder_path: Directory scanned (non-recursively) for ``.jsonl`` files.

    Returns:
        pandas.DataFrame with one row per JSON record plus ``contents``.

    Raises:
        ValueError: If the folder yields no records at all.
        KeyError: If the records have no ``title`` or ``content`` field.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    all_data = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and any positional train/test split built on it) reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.jsonl'):
            continue
        # Explicit UTF-8 so decoding does not depend on the platform default.
        with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as file:
            # Skip blank lines (e.g. a trailing newline) instead of crashing.
            all_data.extend(json.loads(line) for line in file if line.strip())

    if not all_data:
        raise ValueError(f"No JSONL records found in {folder_path!r}")

    # Convert to DataFrame
    df = pd.DataFrame(all_data)
    df['contents'] = df['title'] + " " + df['content']
    return df

# Folder containing the JSONL chunk files. Set the ANATOMY_CHUNK_DIR
# environment variable to point at your own copy; when it is unset the
# original hard-coded location is used.
folder_path = os.environ.get(
    'ANATOMY_CHUNK_DIR',
    'C:\\Users\\alandavis\\Desktop\\Project\\chunk',
)
dataset = load_and_prepare_dataset(folder_path)

# Display Sample Data
print("Dataset Sample:")
print(dataset.head())

Dataset Sample:
               id         title  \
0  Anatomy_Gray_0  Anatomy_Gray   
1  Anatomy_Gray_1  Anatomy_Gray   
2  Anatomy_Gray_2  Anatomy_Gray   
3  Anatomy_Gray_3  Anatomy_Gray   
4  Anatomy_Gray_4  Anatomy_Gray   

                                             content  \
0  What is anatomy? Anatomy includes those struct...   
1  Observation and visualization are the primary ...   
2  How can gross anatomy be studied? The term ana...   
3  This includes the vasculature, the nerves, the...   
4  Each of these approaches has benefits and defi...   

                                            contents  
0  Anatomy_Gray What is anatomy? Anatomy includes...  
1  Anatomy_Gray Observation and visualization are...  
2  Anatomy_Gray How can gross anatomy be studied?...  
3  Anatomy_Gray This includes the vasculature, th...  
4  Anatomy_Gray Each of these approaches has bene...  


In [4]:
# Split Dataset
def split_dataset(df, train_ratio=0.8, shuffle=False, random_state=None):
    """Split a DataFrame into train and test partitions by row position.

    Args:
        df: DataFrame to split.
        train_ratio: Fraction of rows (between 0 and 1 inclusive) placed in
            the train partition; the row count is truncated to an integer.
        shuffle: When True, rows are randomly permuted before splitting.
            The default (False) keeps the original row order, so the test
            partition is simply the tail of ``df``.
        random_state: Seed forwarded to ``DataFrame.sample`` when shuffling.

    Returns:
        Tuple ``(train_data, test_data)``.

    Raises:
        ValueError: If ``train_ratio`` is outside the range [0, 1].
    """
    # A negative or >1 ratio would otherwise slice silently and yield a
    # split that looks valid but has the wrong proportions.
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    if shuffle:
        df = df.sample(frac=1, random_state=random_state)

    train_size = int(len(df) * train_ratio)
    # iloc makes the positional intent explicit regardless of index labels.
    train_data = df.iloc[:train_size]
    test_data = df.iloc[train_size:]
    return train_data, test_data

# Leading 80% of rows (by position) for training, the remainder for testing.
train_data, test_data = split_dataset(dataset, train_ratio=0.8)


In [5]:
# Tokenizer and Model Initialization
# Single source of truth for the checkpoint so the tokenizer and the model
# are always loaded from the same pretrained weights/vocabulary.
MODEL_CHECKPOINT = "bert-base-uncased"
NUM_LABELS = 2

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# The classification head is newly initialised on load (see the warning this
# cell prints), so the model must be fine-tuned before its predictions mean
# anything.
# NOTE(review): the dataset sample printed earlier shows only id/title/
# content/contents columns - confirm where the 2-class labels come from.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=NUM_LABELS)

# Tokenization
def tokenize_function(examples):
    """Tokenize the ``contents`` field of a batch with the notebook's tokenizer.

    Sequences are padded with ``padding="max_length"``, truncated when too
    long, and PyTorch tensors are requested as the return type.

    Args:
        examples: Mapping with a ``contents`` entry holding the text(s).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples['contents']
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encoding_options)

def _tokenize_frame(frame):
    """Wrap a DataFrame in a ``Dataset`` and tokenize it in batches."""
    as_dataset = Dataset.from_pandas(frame)
    return as_dataset.map(tokenize_function, batched=True)


tokenized_train = _tokenize_frame(train_data)
tokenized_test = _tokenize_frame(test_data)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 100677/100677 [00:36<00:00, 2794.63 examples/s]
Map: 100%|██████████| 25170/25170 [00:09<00:00, 2784.33 examples/s]


In [12]:
import torch
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

# Sanity check: confirm the training dependencies import and that the
# TrainingArguments can be constructed, reporting problems as text instead of
# a traceback.
try:
    # Verify installation of required packages
    import transformers
    import accelerate

    training_args = TrainingArguments(
        output_dir="./results",
        # NOTE(review): newer transformers releases appear to rename this
        # argument to `eval_strategy` - confirm against the installed version.
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True
    )
    print("Training arguments initialized successfully")
except ImportError as e:
    # The requirement specifiers are quoted on purpose: unquoted, a shell
    # treats `>=0.26.0` as an output redirect and may glob `[torch]`.
    print(f'Missing dependency ({e}). Please run: pip install --upgrade "transformers[torch]" "accelerate>=0.26.0"')
except Exception as e:
    # Best-effort report only; `training_args` stays undefined on this path.
    print(f"Error initializing training arguments: {e}")

def compute_metrics(pred):
    """Compute accuracy and a confusion matrix for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over axis 1 is taken as the
            predicted class).

    Returns:
        Dict with ``"accuracy"`` (float) and ``"confusion_matrix"`` (nested
        list of ints, rows = true class, columns = predicted class as
        produced by ``sklearn.metrics.confusion_matrix``).
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": float(accuracy_score(labels, preds)),
        # Returned as plain nested lists rather than an ndarray so the
        # metrics dict stays JSON-serialisable when it is logged/saved.
        # Wrap with np.asarray(...) before handing it to plotting helpers.
        "confusion_matrix": confusion_matrix(labels, preds).tolist(),
    }

Training arguments initialized successfully


In [20]:
# Training Arguments and Trainer Initialization
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Evaluate and checkpoint on the same schedule
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation hyper-parameters
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Best-checkpoint selection: higher accuracy wins
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

# Fine-tune, then score the held-out split.
trainer.train()
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

# Plot Accuracy and Loss
training_logs = trainer.state.log_history
# Keep only the log entries that carry evaluation results.
accuracy = [entry['eval_accuracy'] for entry in training_logs if 'eval_accuracy' in entry]
loss = [entry['eval_loss'] for entry in training_logs if 'eval_loss' in entry]

if not (accuracy and loss):
    print("No accuracy or loss logs found to plot.")
else:
    def plot_metrics(accuracy, loss):
        """Draw evaluation accuracy and loss side by side.

        The x axis numbers the entries 1..len(accuracy); both series are
        plotted against that same axis.
        """
        epochs = range(1, len(accuracy) + 1)
        fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

        # Left panel: accuracy
        accuracy_ax.plot(epochs, accuracy, label='Accuracy')
        accuracy_ax.set_title('Model Accuracy')
        accuracy_ax.set_xlabel('Epochs')
        accuracy_ax.set_ylabel('Accuracy')
        accuracy_ax.legend()

        # Right panel: loss
        loss_ax.plot(epochs, loss, label='Loss', color='orange')
        loss_ax.set_title('Model Loss')
        loss_ax.set_xlabel('Epochs')
        loss_ax.set_ylabel('Loss')
        loss_ax.legend()

        fig.tight_layout()
        plt.show()

    plot_metrics(accuracy, loss)

# Confusion Matrix
# Evaluation metrics are keyed with an "eval_" prefix (the plotting code in
# this cell already reads 'eval_accuracy' / 'eval_loss'), so the matrix
# returned by compute_metrics is looked up under the prefixed name first.
cm = metrics.get('eval_confusion_matrix', metrics.get('confusion_matrix'))
if cm is None:
    print("No confusion matrix found in evaluation metrics.")
else:
    # np.asarray accepts both an ndarray and a nested-list matrix.
    ConfusionMatrixDisplay(confusion_matrix=np.asarray(cm)).plot(cmap="Blues")
    plt.show()


IndexError: list index out of range

In [None]:
# Flask API Setup
app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the text sent in a JSON POST body.

    Expects a JSON object of the form ``{"input": "<text>"}``.

    Returns:
        ``{"prediction": <int>}`` holding the arg-max class index, or
        ``{"error": "No input provided"}`` with HTTP 400 when the body is
        missing, is not a JSON object, or has no non-empty string ``input``.
    """
    # Local import so this cell does not rely on another cell having
    # imported torch first.
    import torch

    # silent=True yields None for a missing/malformed JSON body instead of
    # raising, so every bad request gets the same JSON 400 response.
    data = request.get_json(silent=True)
    input_text = data.get("input") if isinstance(data, dict) else None
    if not isinstance(input_text, str) or not input_text:
        return jsonify({"error": "No input provided"}), 400

    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
    # Inputs must live on the same device as the model (e.g. after GPU training).
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    model.eval()  # disable dropout for deterministic inference
    with torch.no_grad():  # no autograd bookkeeping needed when serving
        outputs = model(**inputs)
    # One text in, one row of logits out: take the arg-max over the classes.
    prediction = outputs.logits.argmax(dim=-1)[0].item()

    return jsonify({"prediction": prediction})

if __name__ == '__main__':
    # NOTE(review): host="0.0.0.0" listens on every network interface; confirm
    # that exposure is intended before running outside a trusted network.
    app.run(host="0.0.0.0", port=5000)


In [None]:
# Unity Integration Tab
# One print call; sep="\n" puts each instruction on its own line.
print(
    "\nInstructions for Unity Integration:",
    "1. Set up an HTTP client in Unity (e.g., using UnityWebRequest).",
    "2. Use POST requests to send user input to http://<host>:5000/predict.",
    "3. Parse the JSON response to get predictions.",
    sep="\n",
)


In [None]:
# Additional Table Creation
def create_additional_table(df):
    """Return a copy of ``df`` with empty query-logging columns appended.

    The input frame is left untouched so that re-running the cell (or any
    earlier cell that displays ``df``) still sees the original columns.

    Args:
        df: Source DataFrame.

    Returns:
        New DataFrame with two extra columns, both filled with ``None``:
        ``query_log`` (placeholder for logging user queries) and
        ``response_time`` (placeholder for tracking response times).
    """
    return df.assign(query_log=None, response_time=None)

dataset_with_additional_table = create_additional_table(dataset)
# Heading and preview in a single call; sep="\n" keeps them on separate lines.
print("Updated Dataset Structure:", dataset_with_additional_table.head(), sep="\n")
