### Testing a Baseline BERT Model
#### DSC 672 - Impact Genome - Jack Leniart

In [2]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Function to compute evaluation metrics (full per-class classification report, not just accuracy)
def compute_metrics(p):
    """Build the classification report for one evaluation pass.

    ``p`` is unpacked into ``(predictions, labels)``. The predicted class of
    each row is the argmax over the last axis of ``predictions``. The return
    value is the dictionary form of scikit-learn's ``classification_report``.
    """
    scores, true_labels = p
    predicted_classes = scores.argmax(axis=-1)
    report = classification_report(true_labels, predicted_classes, output_dict=True)
    return report

In [None]:
# Main script
def main(input_csv, output_csv, model_name='bert-base-uncased', epochs=3, batch_size=8, max_length=128):
    """Fine-tune a BERT classifier on a CSV and write predictions for every row.

    The input CSV must contain a ``programdescription`` (text) column and an
    ``outcomeid`` (class label) column. Rows missing either value are dropped
    before training and therefore do not appear in the output file.

    Args:
        input_csv: Path of the CSV file to read.
        output_csv: Path of the CSV file to write; it holds the cleaned input
            rows plus a ``predicted_outcome`` column of predicted outcome ids.
        model_name: Pre-trained checkpoint passed to ``from_pretrained``.
        epochs: Number of training epochs.
        batch_size: Per-device batch size for training and evaluation.
        max_length: Token length every description is padded/truncated to.
    """
    # Load the input data - CSV file
    df = pd.read_csv(input_csv)

    # The tokenizer only accepts strings and the loss needs a label for every
    # row, so discard incomplete rows and coerce the text column to str.
    df = df.dropna(subset=['programdescription', 'outcomeid']).reset_index(drop=True)
    texts = df['programdescription'].astype(str)

    # The classification head expects labels in [0, num_labels). Raw outcome
    # ids are arbitrary, so encode them as contiguous codes and keep the
    # sorted lookup table to translate predictions back to outcome ids.
    label_codes, label_index = pd.factorize(df['outcomeid'], sort=True)
    label_ids = label_index.to_numpy()

    # Split data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        texts, label_codes, test_size=0.2, random_state=42
    )

    # Prepare the datasets using the datasets library
    train_data = Dataset.from_dict({'text': X_train.tolist(), 'label': y_train.tolist()})
    val_data = Dataset.from_dict({'text': X_val.tolist(), 'label': y_val.tolist()})
    # Every row in original order, so predictions line up with df.
    full_data = Dataset.from_dict({'text': texts.tolist(), 'label': label_codes.tolist()})

    # Load tokenizer and encode the text data
    tokenizer = BertTokenizer.from_pretrained(model_name)

    def tokenize_function(examples):
        return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=max_length)

    train_data = train_data.map(tokenize_function, batched=True)
    val_data = val_data.map(tokenize_function, batched=True)
    full_data = full_data.map(tokenize_function, batched=True)

    # Load pre-trained model with one output unit per distinct outcome id
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(label_ids))

    # Define training arguments
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the installed version.
    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=epochs,         # number of training epochs
        per_device_train_batch_size=batch_size,  # batch size for training
        per_device_eval_batch_size=batch_size,   # batch size for evaluation
        evaluation_strategy="epoch",     # evaluate at the end of each epoch
        save_strategy="epoch",           # must match the evaluation strategy for load_best_model_at_end
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
        load_best_model_at_end=True
    )

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()

    # Make predictions on the full dataset (training + validation) so there is
    # exactly one prediction per row of df. Predictions for rows the model was
    # trained on are optimistic; use the validation metrics to judge quality.
    predictions = trainer.predict(full_data)
    preds = predictions.predictions.argmax(axis=-1)

    # Add predictions to the DataFrame, translated back to original outcome ids
    df['predicted_outcome'] = label_ids[preds]

    # Save the results to a new CSV file
    df.to_csv(output_csv, index=False)
    print(f"Predictions saved to {output_csv}")

In [None]:
# Run the main function
if __name__ == "__main__":
    # Ask interactively for the CSV that holds the program descriptions.
    # To run unattended, assign a path string to input_csv here instead of
    # calling input().
    input_csv = input("Enter file name containing the program descriptions: ")

    # Destination CSV that receives the rows together with their predictions.
    output_csv = 'predicted_outcomes.csv'

    main(input_csv=input_csv, output_csv=output_csv)