In [1]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.70.0-py3-none-any.whl (315 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting wandb>=0.10.32 (from simpletransformers)


In [15]:
import numpy as np
import pandas as pd
import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, f1_score


# Result of torch's CUDA availability check, evaluated once at import time;
# train_model passes it to ClassificationModel as `use_cuda`.
cuda_available = torch.cuda.is_available()

def load_data(file_path):
    """
    Load and preprocess the data.
    """
    df = pd.read_csv(file_path)
    df['transcription'] = df['keywords'].fillna('') + ' ' + df['transcription'].fillna('')
    df.dropna(subset=['medical_specialty', 'transcription'], inplace=True)

    # Handling class imbalance by grouping rare specialties as 'others'
    counts = df['medical_specialty'].value_counts()
    others = counts[counts < 100].index
    df.loc[df['medical_specialty'].isin(others), 'medical_specialty'] = 'Others'

    class_dict = {value: idx for idx, value in enumerate(df['medical_specialty'].unique())}
    df['medical_specialty'] = df['medical_specialty'].map(class_dict)

    X_train, X_test, y_train, y_test = train_test_split(df['transcription'], df['medical_specialty'], stratify=df['medical_specialty'], test_size=0.25)

    num_classes = len(class_dict)
    class_weights = [1.0] * num_classes 

    return X_train, X_test, y_train, y_test, num_classes, class_weights, class_dict

def train_model(X_train, y_train, num_classes, class_weights):
    """
    Fine-tune a 'roberta-base' classifier on the given training examples.

    X_train / y_train become the 'text' and 'labels' columns of the training
    frame. num_classes sets the number of output labels and class_weights is
    forwarded as the model's `weight` argument. The trained model is returned.
    """
    # Two-column frame handed to the model's train_model call below.
    training_frame = pd.DataFrame({'text': X_train, 'labels': y_train})

    hyperparameters = ClassificationArgs(
        num_train_epochs=3,
        learning_rate=1e-5,
        overwrite_output_dir=True,
    )

    classifier = ClassificationModel(
        'roberta',
        'roberta-base',
        num_labels=num_classes,
        args=hyperparameters,
        use_cuda=cuda_available,
        weight=class_weights,
    )
    classifier.train_model(training_frame)

    return classifier

def evaluate_model(model, X_test, y_test):
    """
    Predict on the held-out texts and print balanced accuracy and weighted F1.

    The texts and labels are first combined into one frame; the text column is
    passed to model.predict as a plain list and the label column is compared
    with the returned predictions. Nothing is returned.
    """
    holdout = pd.DataFrame({'text': X_test, 'labels': y_test})
    texts = holdout['text'].tolist()
    true_labels = holdout['labels']

    # predict returns a pair; only the predicted labels are scored.
    predictions, _ = model.predict(texts)

    acc = balanced_accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')

    print(f'Balanced Accuracy: {acc}, F1 Score: {f1}')

def analyze_keywords(df, keywords):
    """
    Print how many rows mention each keyword in the 'transcription' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a text column named 'transcription'.
    keywords : iterable of str
        Literal phrases to look for. Matching is case-insensitive and each
        row counts at most once per keyword.
    """
    text = df['transcription']
    for keyword in keywords:
        # regex=False: keywords are literal phrases, so characters such as '.'
        # or '(' must not be interpreted as regular-expression syntax.
        # na=False: rows with missing text count as non-matches; without it the
        # mask contains NaN and cannot be used for counting or filtering.
        matches = text.str.contains(keyword, case=False, regex=False, na=False)
        print(f"Occurrences of '{keyword}': {int(matches.sum())}")





In [16]:
if __name__ == '__main__':
    # Single source of truth for the dataset location: every read below uses
    # this variable, so changing it here is enough.
    file_path = '/content/drive/MyDrive/Colab_Notebooks/FemTherapeutics_Technical_Interview-main/DATA.csv'  # Update this to your dataset location
    X_train, X_test, y_train, y_test, num_classes, class_weights, class_dict = load_data(file_path)

    # Fine-tune, persist the weights, then score on the held-out split.
    model = train_model(X_train, y_train, num_classes, class_weights)
    model.save_model('./trained_model')
    evaluate_model(model, X_test, y_test)

    # Keyword counts are computed over the full dataset (not only one split),
    # using the same keywords + transcription text the model is trained on.
    df = pd.read_csv(file_path)
    df['transcription'] = df['keywords'].fillna('') + ' ' + df['transcription'].fillna('')
    keywords = ['Rectocele', 'Cystocele']
    analyze_keywords(df, keywords)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/7 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/469 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Balanced Accuracy: 0.8208040902456636, F1 Score: 0.8339200750102773
Occurrences of 'Rectocele': 12
Occurrences of 'Cystocele': 9
