<a href="https://colab.research.google.com/github/OmdenaAI/RebootRx/blob/main/src/colab_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using the local package in Colab

In [None]:
# get CUDA version (if using GPU)
# `grep -o` prints only the matched text and the look-behind drops the
# "CUDA Version: " label, so the output is just the version number.
# NOTE(review): on a CPU-only runtime this presumably prints nothing useful — confirm.
!nvidia-smi | grep -oP '(?<=CUDA Version: )(\d*\.\d*)'

In [None]:
from getpass import getpass
import os

os.environ['GIT_USER'] = input('Enter the user of your GitHub account: ')
os.environ['PASSWORD'] = getpass('Enter the password (or PAT if 2FA is enabled) of your GitHub account: ')
os.environ['GIT_AUTH'] = os.environ['GIT_USER'] + ':' + os.environ['PASSWORD']

print('Start installing git repo...')
!pip install git+https://$GIT_AUTH@github.com/OmdenaAI/RebootRx.git@main > /dev/null

print('Package installed. Clear sensitive data...')
os.environ['PASSWORD'] = os.environ['GIT_AUTH'] = ""

In [None]:
%%bash
# Install seqeval, which the evaluation cells import.
# Stop at the first failing command so "Done!" is only printed after a
# successful install (pip's stdout is discarded; errors still go to stderr).
set -e
pip install seqeval > /dev/null
echo "Done!"

In [None]:
import os
import warnings
from datetime import datetime

from google.colab import drive
import matplotlib.pyplot as plt
import pandas as pd
from src.data_utils import ner
from src.model_utils import spacy_ner
from src import data_utils
from tqdm import tqdm

# Google Drive locations of the saved-model folder and of the annotated dataset.
MODEL_DIRPATH = "/content/drive/MyDrive/RebootRx/Official Folder of Reboot Rx Challenge/TASK3-MODELING/models/"
ANNOTATIONS_PATH = "/content/drive/MyDrive/RebootRx/Official Folder of Reboot Rx Challenge/Task1 - Annotation/Final_datsets/RCT_Annotations_Final.csv"

# Mount Google Drive so that the paths above resolve.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Enable `progress_apply` on pandas objects (used when building the tagged corpus).
tqdm.pandas()

In [None]:
# Load the annotations CSV from Drive and print a summary of its columns.
data = pd.read_csv(ANNOTATIONS_PATH)
data.info()

In [None]:
# Run the loaded frame through the project's `data_utils.labelbox` step and
# preview the result.
# NOTE: `data` is overwritten with the processed frame, so re-running this cell
# without re-running the load cell passes already-processed data back in.
# NOTE(review): later cells read the "text" and "annotations" columns, so both
# must exist after this step — confirm.
data = data_utils.labelbox(data)
data.head()

In [None]:
# Build the `nlp` object from the annotations; only its tokenizer is used below.
nlp = spacy_ner.create_blank_nlp(data["annotations"]) # specifying the tokenizer makes it much faster

# Pair each text with its annotations. The Series is built on `data`'s own index
# because pandas aligns on index labels when a Series is assigned as a column:
# a fresh 0..n-1 index would misalign rows (or produce NaN) whenever `data` no
# longer carries a default RangeIndex.
data["tagged_corpus"] = pd.Series(
    list(zip(data["text"], data["annotations"])), index=data.index
)

# Turn every (text, annotations) pair into a TaggedCorpus, showing a progress bar.
data["tagged_corpus"] = data["tagged_corpus"].progress_apply(
    lambda pair: ner.TaggedCorpus(text=pair[0], annotations=pair[1], tokenizer=nlp.tokenizer)
)
data.head()

## Modeling

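Use your model here! The later cells expect this section to define `model`, `y_pred`, `training_loss`, `validation_loss`, `n_epochs` and `dropout`.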

In [None]:
# Work on a copy so the split never touches `data` itself.
_df = data.copy()

# Reproducible 90/10 split: sample the training rows with a fixed seed, then
# keep every row whose index label was not sampled as the validation set.
train = _df.sample(frac=0.9, random_state=42)
val = _df.drop(index=train.index)

## Evaluate

Use [seqeval](https://github.com/chakki-works/seqeval) for evaluation

In [None]:
from seqeval.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    performance_measure,
    precision_score,
    recall_score,
)


In [None]:
# Ground-truth docs of the validation split.
# NOTE(review): no `docs` column is created in the visible cells (the column
# built earlier is "tagged_corpus") — confirm where `val.docs` comes from.
docs_true = val.docs.to_list()

# One IOB tag list per validation doc.
y_true = list(map(ner.doc2ents, docs_true))

# get the list of predictions from your model (`y_pred`) and run the evaluation below

In [None]:
# Print seqeval's text report comparing the gold tags with the predictions.
# `y_pred` must be supplied by your model; none of the earlier cells defines it.
print(classification_report(list(y_true), list(y_pred)))

### Log metrics, hyperparameters and models

In [None]:
from dagshub import dagshub_logger

# Scalar seqeval scores, keyed by the name they are logged under.
scorers = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1": f1_score,
}

# Final training/validation losses come from your own training loop.
metrics = {"loss": training_loss[-1], "loss_val": validation_loss[-1]}
metrics.update({name: scorer(y_true, y_pred) for name, scorer in scorers.items()})
# Full report as a nested dict, followed by the counts from performance_measure.
metrics["class_report"] = classification_report(list(y_true), list(y_pred), output_dict=True)
metrics.update(performance_measure(y_true, y_pred))

# Hyperparameters and split sizes of this run.
hp = dict(
    lib="'spacy_ner'",
    n_epochs=n_epochs,
    dropout=dropout,
    n_train=len(train),
    n_test=len(val),
)

# YOUR_METRICS_PATH / YOUR_HYPERPARAMS_PATH are placeholders: set them to the
# files the logger should write to.
with dagshub_logger(
    metrics_path=YOUR_METRICS_PATH, hparams_path=YOUR_HYPERPARAMS_PATH
) as logger:
    logger.log_metrics(metrics)  # metrics first ...
    logger.log_hyperparams(hp)  # ... then hyperparameters

In [None]:
# NOTE(review): MODEL_PATH is not defined in the visible cells (only
# MODEL_DIRPATH is); set it to the target directory for this model first.
# exist_ok=True creates the directory only when it is missing, without a
# separate check-then-create step.
os.makedirs(MODEL_PATH, exist_ok=True)
model.save_to(MODEL_PATH) # this is model-variant
print("Saved model to", MODEL_PATH)

# To load the model again (spaCy example):
# model = spacy.load('model_name')