In [1]:
# !pip install datasets

In [2]:
# pip install transformers

In [3]:
# pip install textstat

In [4]:
# !pip install torch torchvision torchaudio

In [5]:
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz

In [6]:
# !pip install spacy
# !pip install scispacy

In [7]:
# !pip install typing_extensions

In [8]:
# !pip install -U transformers datasets evaluate accelerate scikit-learn --quiet

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd
import torch
import os
import time
from google.colab import drive

# Mount Google Drive to access folders
drive.mount('/content/drive')

# Load Text Files from Folders
def load_abstracts(folder_path, label):
    data = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read().strip()
                if content:
                    data.append({'abstract_id': filename.replace('.txt', ''),
                                 'abstract': content,
                                 'label': label})
    return data

cancer_folder = "/content/drive/MyDrive/Dataset/Cancer"
non_cancer_folder = "/content/drive/MyDrive/Dataset/Non-Cancer"

cancer_data = load_abstracts(cancer_folder, label='Cancer')
non_cancer_data = load_abstracts(non_cancer_folder, label='Non-Cancer')

df = pd.DataFrame(cancer_data + non_cancer_data)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)



📂 Please mount your Google Drive and place 'Cancer' and 'Non-Cancer' folders inside a known path
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
df.head()

Unnamed: 0,abstract_id,abstract,label
0,29775646,<ID:29775646>\nTitle: The RNA binding protein ...,Non-Cancer
1,26022167,<ID:26022167>\nTitle: Tuberous sclerosis compl...,Non-Cancer
2,25600436,<ID:25600436>\nTitle: Identification of chimer...,Non-Cancer
3,38041113,<ID:38041113>\nTitle: Development of a neutral...,Non-Cancer
4,31367627,<ID:31367627>\nTitle: Multimodality-imaging ma...,Cancer


In [11]:
# Check for Relevance (Cancer vs Non-Cancer)
df['is_cancer_mentioned'] = df['abstract'].str.contains(r'cancer', case=False)
relevance_check = pd.crosstab(df['label'], df['is_cancer_mentioned'])
print(relevance_check)

is_cancer_mentioned  False  True 
label                            
Cancer                 180    320
Non-Cancer             491      9


In [12]:
# Check for Relevance (Cancer vs Non-Cancer)
df['is_cancer_alike_words_mentioned'] = df['abstract'].str.contains(r'tumor|carcinoma|neoplasm|oncology', case=False)
relevance_check = pd.crosstab(df['label'], df['is_cancer_alike_words_mentioned'])
print(relevance_check)

is_cancer_alike_words_mentioned  False  True 
label                                        
Cancer                             156    344
Non-Cancer                         485     15


In [13]:
# Check for Relevance (Cancer vs Non-Cancer)
df['is_cancer_alike_words_mentioned'] = df['abstract'].str.contains(r'tumor|carcinoma|neoplasm|oncology', case=False)
relevance_check = pd.crosstab(df['label'], df['is_cancer_alike_words_mentioned'])
print(relevance_check)


is_cancer_alike_words_mentioned  False  True 
label                                        
Cancer                             156    344
Non-Cancer                         485     15


In [14]:
# Check for Relevance (Cancer vs Non-Cancer)
df['is_cancer_relevant_mentioned'] = df['abstract'].str.contains(r'cancer|tumor|carcinoma|neoplasm|oncology', case=False)
relevance_check = pd.crosstab(df['label'], df['is_cancer_relevant_mentioned'])
print(relevance_check)


is_cancer_relevant_mentioned  False  True 
label                                     
Cancer                           56    444
Non-Cancer                      477     23


In [15]:
# Check the number and consistency of in-text citations

df['citation_count'] = df['abstract'].str.count(r'\[\d+\]|\(\w+ et al\., \d{4}\)')
df['citation_density'] = df['citation_count'] / df['abstract'].str.split().str.len()
print(df[['citation_count', 'citation_density']].describe())


       citation_count  citation_density
count          1000.0            1000.0
mean              0.0               0.0
std               0.0               0.0
min               0.0               0.0
25%               0.0               0.0
50%               0.0               0.0
75%               0.0               0.0
max               0.0               0.0


In [16]:
df['has_id_mentioned'] = df['abstract'].str.contains(r'^<ID:\d+>\n', case=False)
print(df[['has_id_mentioned']].mean())

has_id_mentioned    1.0
dtype: float64


In [17]:
# remove rows with null abstract and clean citation mentions
df = df.dropna(subset=["abstract"])
df["abstract"] = df["abstract"].str.replace(r"\[[^\]]*\]", "", regex=True)  # remove citations like [1], [2]
df['abstract'].iloc[3]


'<ID:38041113>\nTitle: Development of a neutralization monoclonal antibody with a broad neutralizing effect against SARS-CoV-2 variants.\nAbstract: The emergence of severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) variants has challenged the effectiveness of current therapeutic regimens. Here, we aimed to develop a potent SARS-CoV-2 antibody with broad neutralizing effect by screening a scFv library with the spike protein receptor-binding domain (RBD) via phage display.\nSKAI-DS84 was identified through phage display, and we performed pseudovirus neutralization assays, authentic virus neutralization assays, and in vivo neutralization efficacy evaluations. Furthermore, surface plasmon resonance (SPR) analysis was conducted to assess the physical characteristics of the antibody, including binding kinetics and measure its affinity for variant RBDs.\nThe selected clones were converted to human IgG, and among them, SKAI-DS84 was selected for further analyses based on its binding

In [18]:
df['abstract'] = df['abstract'].str.replace(r'^<ID:\d+>\n', '', regex=True)
df['abstract'].iloc[4]

'Title: Multimodality-imaging manifestations of primary renal-allograft synovial sarcoma: First case report and literature review.\nAbstract: BACKGROUND: Primary renal synovial sarcoma (PRSS) is an extremely rare tumor with a poor prognosis. Its imaging and immunohistochemical characteristics may overlap with other renal tumors, which renders its early diagnosis in a dilemma. The diagnosis of primary renal synovial sarcoma requires histopathology and the confirmation of SYT-SSX gene fusion using molecular techniques. Cases of primary renal synovial sarcoma have been previously reported in the literature. However, to our knowledge, primary renal allograft synovial sarcoma was never described. CASE SUMMARY: A 43-year-old male patient who underwent kidney transplantation 9 months ago came to our hospital for regular follow-up. Traditional ultrasonography revealed multiple hypo-echo neoplasms in the renal allograft. Contrast-enhanced computed tomography (CECT) showed slightly hyper-density

In [19]:
df['abstract'] = df['abstract'].str.strip() # remove whitespace from start and end
df['abstract'] = df['abstract'].str.replace('\n', ' ', regex=False) # remove the linechange code
df['abstract'] = df['abstract'].str.replace(r'\s+', ' ', regex=True) #remove multiple whitespace between the sentence

In [20]:
# Remove "Title:" and "Abstract:" tags (case-insensitive)
df['abstract'] = df['abstract'].str.replace(r'\bTitle:\s*', '', regex=True)
df['abstract'] = df['abstract'].str.replace(r'\bAbstract:\s*', '', regex=True)

In [21]:
# check the abstract quality
df['has_title'] = df['abstract'].str.contains(r'^Title:', case=False)
df['has_methods'] = df['abstract'].str.contains(r'\bMethods?:\b', case=False)
df['has_results'] = df['abstract'].str.contains(r'\bResults?:\b', case=False)

print(df[['has_title', 'has_methods', 'has_results']].mean())

#Insight:
# Helps understand how well-formatted your abstracts are.
# Structured abstracts improve NER, topic tagging, and summarization tasks.

has_title      0.0
has_methods    0.0
has_results    0.0
dtype: float64


Find the diseases mentioned in each abstract

In [22]:
# import spacy
# import scispacy
# print("spaCy version:", spacy.__version__)
# print("scispaCy version:", scispacy.__version__)
# # only disease extraction
# import spacy

# # Load disease-specific NER model
# nlp = spacy.load("en_ner_bc5cdr_md")  # includes labels like DISEASE and CHEMICAL
# # Extract only diseases
# df['diseases'] = df['abstract'].apply(
#     lambda x: [ent.text for ent in nlp(x).ents if ent.label_ == "DISEASE"]
# )

# disease_mentioned_in_abstract_dict = dict(zip(df['abstract_id'], df['diseases']))

In [23]:
# !pip install --upgrade typing_extensions

In [24]:
from transformers import AutoTokenizer
import torch
from sklearn.model_selection import train_test_split

tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [25]:
df=df[['abstract_id', 'abstract', 'label']]

In [26]:
df['label']

Unnamed: 0,label
0,Non-Cancer
1,Non-Cancer
2,Non-Cancer
3,Non-Cancer
4,Cancer
...,...
995,Cancer
996,Cancer
997,Non-Cancer
998,Cancer


In [27]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
from sklearn.model_selection import train_test_split
import torch

# Step 1: Encode labels
df['label'] = df['label'].map({'Non-Cancer': 0, 'Cancer': 1})

# Step 2: Train-validation split
df_train, df_val = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# Step 3: Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

# Step 4: Tokenize function
def tokenize_function(examples):
    return tokenizer(
        examples["abstract"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].map({'Non-Cancer': 0, 'Cancer': 1})


In [28]:
# Step 5: Convert DataFrames to Hugging Face Dataset
hf_train = Dataset.from_pandas(df_train.reset_index(drop=True))
hf_val = Dataset.from_pandas(df_val.reset_index(drop=True))

# Step 6: Tokenize the datasets
hf_train = hf_train.map(tokenize_function, batched=True)
hf_val = hf_val.map(tokenize_function, batched=True)

# Step 7: Set format for PyTorch
hf_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
hf_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [29]:
# # Step 8: Load model
# model = AutoModelForSequenceClassification.from_pretrained(
#     "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
#     num_labels=2
# )


In [30]:
# model with LoRa
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForSequenceClassification

base_model = AutoModelForSequenceClassification.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract", num_labels=2)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "key", "value"],
    lora_dropout=0.05,
    bias="all",  # ← critical: allows classifier to train
    task_type=TaskType.SEQ_CLS
)

model = get_peft_model(base_model, lora_config)
print(model.print_trainable_parameters())



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 989,186 || all params: 110,370,052 || trainable%: 0.8962
None


In [31]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/dataset_4",
    eval_strategy="epoch",
    run_name="pubmedbert_cancer_classifier",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    report_to="none",
    logging_strategy="steps",
    logging_steps=50
)

In [32]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)

    # Confusion matrix as counts (TP, FP, FN, TN)
    cm = confusion_matrix(labels, preds)
    cm_dict = {
        "true_negative": int(cm[0][0]),
        "false_positive": int(cm[0][1]),
        "false_negative": int(cm[1][0]),
        "true_positive": int(cm[1][1])
    }

    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        **cm_dict
    }


In [33]:
# Step 1: Install Required Libraries
# !pip install -U transformers datasets evaluate accelerate scikit-learn numpy<2 --quiet

In [34]:
# !pip install -U datasets --quiet

In [None]:
from transformers import Trainer
from transformers import EarlyStoppingCallback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train,
    eval_dataset=hf_val,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Step 8: Train
import time
start = time.time()
# trainer.train()
trainer.train(resume_from_checkpoint=True)
print("⏱ Finished in %.2f minutes" % ((time.time() - start)/60))

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss


In [36]:
trainer.evaluate()

{'eval_loss': 0.54628586769104,
 'eval_accuracy': 0.835,
 'eval_f1': 0.8235294117647058,
 'eval_precision': 0.8850574712643678,
 'eval_recall': 0.77,
 'eval_true_negative': 90,
 'eval_false_positive': 10,
 'eval_false_negative': 23,
 'eval_true_positive': 77,
 'eval_runtime': 459.5459,
 'eval_samples_per_second': 0.435,
 'eval_steps_per_second': 0.054,
 'epoch': 2.0}

In [37]:
# # for train metrices
# from sklearn.metrics import classification_report

# # Predict on training set
# train_predictions = trainer.predict(hf_train)
# train_preds = train_predictions.predictions.argmax(axis=1)
# train_labels = train_predictions.label_ids

# # Classification report
# print("Train classification report:\n", classification_report(train_labels, train_preds, target_names=['Non-Cancer', 'Cancer']))


In [38]:
import torch
import numpy as np
import pandas as pd

predictions = trainer.predict(hf_val)

# Use sigmoid instead of softmax
probs = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()



In [44]:

label_map = {0: "Non-Cancer", 1: "Cancer"}

top_k = 2
predicted_labels = [
    [label_map[j] for j in probs[i].argsort()[-top_k:][::-1]]
    for i in range(len(probs))
]



In [45]:
i = 0
confidence_scores = {label_map[j]: round(probs[i][j], 2) for j in range(len(label_map))}

output = {
    "predicted_labels": predicted_labels[i],
    "confidence_scores": confidence_scores
}

print(output)


{'predicted_labels': ['Cancer', 'Non-Cancer'], 'confidence_scores': {'Non-Cancer': np.float32(0.41), 'Cancer': np.float32(0.59)}}


In [41]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_path = "/content/drive/MyDrive/dataset_4"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()

label_map = {0: "Non-Cancer", 1: "Cancer"}

def predict_cancer_labels(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
        probs = torch.nn.functional.softmax(logits, dim=1).squeeze().numpy()
        pred = int(probs.argmax())
        return {
            "predicted_labels": [label_map[pred]],
            "confidence_scores": {label_map[i]: round(float(p), 3) for i, p in enumerate(probs)}
        }

In [42]:
def predict_on_dataframe(df, text_col="abstract"):
    results = []
    for text in df[text_col]:
        results.append(predict_cancer_labels(text))
    return results


In [49]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import nbformat

# Define input and output paths
input_path = '/content/drive/MyDrive/Colab Notebook/Cancer_Classification_Assignment.ipynb'
output_path = '/content/drive/MyDrive/colab notebook/Cancer_Classification_Assignment_CLEAN.ipynb'

# Load notebook
nb = nbformat.read(input_path, as_version=4)

# Remove 'widgets' metadata if it exists
if 'widgets' in nb['metadata']:
    del nb['metadata']['widgets']

# Save the cleaned notebook
with open(output_path, 'w') as f:
    nbformat.write(nb, f)

print(f"Cleaned notebook saved to: {output_path}")
