## Importing Libraries

In [1]:
import numpy as np
import random
import tensorflow as tf
import json
import os
import pandas as pd
from pathlib import Path
from datetime import datetime
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig
import tensorflow as tf
from sklearn.metrics import classification_report
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import LabelEncoder

# Set random seeds
random.seed(42)  # Python random seed
np.random.seed(42)  # NumPy random seed
tf.random.set_seed(42)  # TensorFlow random seed

  from .autonotebook import tqdm as notebook_tqdm


# Extracting and Processing JSON Files for Paper Metadata

This notebook processes a dataset of JSON files containing metadata for academic papers.



In [2]:
# Directory containing the JSON files
directory = "assignementdataset"

# List to store paper data
papers_data = []

# Process each JSON file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".json"):
        file_path = os.path.join(directory, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                pdf_parse = data.get('pdf_parse', {})

                # Get abstract using multiple methods
                abstract = data.get('abstract', '')

                # If no abstract found, try pdf_parse methods
                if not abstract and pdf_parse:
                    # Try abstract from pdf_parse
                    abstract = ' '.join([a.get('text', '')
                                        for a in pdf_parse.get('abstract', [])])

                    # If still no abstract, check body_text
                    if not abstract:
                        body_text = pdf_parse.get('body_text', [])
                        # First try to find a section explicitly marked as abstract
                        for section in body_text:
                            if section.get('section', '').lower() == 'abstract':
                                abstract = section.get('text', '')
                                break

                        # If still no abstract, use the first section if it looks like an abstract
                        if not abstract and body_text:
                            first_section = body_text[0].get('text', '')
                            # Use first section if it's not too long and doesn't look like references
                            if (len(first_section.split()) < 500 and
                                not any(ref_word in first_section.lower()
                                        for ref_word in ['references', 'bibliography', 'works cited'])):
                                abstract = first_section

                # If still no abstract, use first body text section regardless
                if not abstract and pdf_parse.get('body_text'):
                    abstract = pdf_parse['body_text'][0].get('text', '')

                # Extract author information
                authors = []
                for author in data.get('authors', []):
                    name_parts = []
                    if author.get("first"):
                        name_parts.append(author["first"])
                    if author.get("middle"):
                        name_parts.extend(author["middle"])
                    if author.get("last"):
                        name_parts.append(author["last"])
                    authors.append(" ".join(name_parts))
                authors_str = "; ".join(authors)

                # Extract relevant information
                paper_info = {
                    'file': filename,
                    'title': data.get('title', ''),
                    'abstract': abstract,
                    'authors': authors_str,
                    'keywords': '; '.join(pdf_parse.get('keywords', []))
                }

                papers_data.append(paper_info)
                print(f"Processed: {filename}")

        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

# Create DataFrame and save to CSV
df = pd.DataFrame(papers_data)

# Save to CSV with proper encoding for special characters
output_file = "papers_dataset.csv"
df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"\nProcessing complete. Data saved to: {output_file}")
print(f"Total papers processed: {len(papers_data)}")


Processed: -em-Fertility-and-Sterility--em--top-videos-from-2.grobid.tei.json
Processed: A-&#x201c;first&#x201d;-on-the-horizon--the-expans.grobid.tei.json
Processed: A-behind-the-scenes-look-at-retroperitoneal-ectopi.grobid.tei.json
Processed: A-call-to-action-to-reproductive-endocrinologists-.grobid.tei.json
Processed: A-case-report-of-retroperitoneal-ectopic-pregnancy.grobid.tei.json
Processed: A-combination-of-two-novel-ligation-techniques-for.grobid.tei.json
Processed: A-diagnosis-of-diminished-ovarian-reserve-does-not.grobid.tei.json
Processed: A-framework-approach-for-hysteroscopic-uterine-sep.grobid.tei.json
Processed: A-higher-number-of-oocytes-retrieved-is-associated.grobid.tei.json
Processed: A-nail-in-the-coffin--the-antim&#xfc;llerian-hormo.grobid.tei.json
Processed: A-novel-approach-using-vaginal-natural-orifice-tra.grobid.tei.json
Processed: Abdallah_et_al-2021-Cochrane_Database_of_Systematic_Reviews.grobid.tei.json
Processed: Abdel-Aleem_et_al-2022-Cochrane_Database_of_

In [3]:
df

Unnamed: 0,file,title,abstract,authors,keywords
0,-em-Fertility-and-Sterility--em--top-videos-fr...,Fertility and Sterility top videos from 2021,Objective: To objectively grade all video publ...,Joseph Findley; Zaraq Khan; John Preston Parry...,Reproductive surgery; top 10; video publicatio...
1,A-&#x201c;first&#x201d;-on-the-horizon--the-ex...,A “first” on the horizon: the expansion of ute...,Since the first live birth from uterus transpl...,Elliott G Richards; Cecile A Ferrando; Ruth M ...,
2,A-behind-the-scenes-look-at-retroperitoneal-ec...,A behind-the-scenes look at retroperitoneal ec...,Although ectopic pregnancies are commonly enco...,Sarah Simko; Sadikah Behbehani,
3,A-call-to-action-to-reproductive-endocrinologi...,A call to action to reproductive endocrinologi...,"I began life as a Catholic; however, by the ag...",Nanette Santoro,
4,A-case-report-of-retroperitoneal-ectopic-pregn...,A case report of retroperitoneal ectopic pregn...,Objective: To present a rare case of retroperi...,Jing Yuan; Hui Xie; Muyu Chen; Fei Zeng; Songs...,Infertility; IVF-ET; laparoscopy; lymphatic mi...
5,A-combination-of-two-novel-ligation-techniques...,A combination of two novel ligation techniques...,Objective: To perform laparoscopic myomectomy ...,Shengke Wang; Dongdong Wang; Fujie Zhao,Intraligamental myoma; laparoscopy; loop ligat...
6,A-diagnosis-of-diminished-ovarian-reserve-does...,A diagnosis of diminished ovarian reserve does...,Objective: To estimate the aneuploidy rates in...,Yuval Fouks; Alan Penzias; Werner Neuhausser; ...,Diminished ovarian reserve; poor ovarian respo...
7,A-framework-approach-for-hysteroscopic-uterine...,A framework approach for hysteroscopic uterine...,Objective: To demonstrate safe and efficient t...,Phillip A Romanski; Pietro Bortoletto; Samanth...,Uterine septum; partial septum; complete septu...
8,A-higher-number-of-oocytes-retrieved-is-associ...,A higher number of oocytes retrieved is associ...,Objective: To investigate the association betw...,Michael Fanton; Justina Hyunjii Cho; Valerie L...,Retrospective study. Setting: Retrieval cycles...
9,A-nail-in-the-coffin--the-antim&#xfc;llerian-h...,A nail in the coffin: the antimüllerian hormon...,Imagine a blood test so powerful that it can p...,Bradley S Hurst,


## Checking for Missing Abstracts and Keywords

To ensure the dataset is complete and suitable for analysis, we check for any missing entries in key fields such as `abstract` and `keywords`.

In [4]:
# Check for empty strings
empty_abstracts = df[df['abstract'] == '']
empty_keywords = df[df['keywords'] == '']

# Print the count of empty entries
print(f"Empty abstracts: {empty_abstracts.shape[0]}")
print(f"Empty keywords: {empty_keywords.shape[0]}")


Empty abstracts: 0
Empty keywords: 29


In [5]:
df

Unnamed: 0,file,title,abstract,authors,keywords
0,-em-Fertility-and-Sterility--em--top-videos-fr...,Fertility and Sterility top videos from 2021,Objective: To objectively grade all video publ...,Joseph Findley; Zaraq Khan; John Preston Parry...,Reproductive surgery; top 10; video publicatio...
1,A-&#x201c;first&#x201d;-on-the-horizon--the-ex...,A “first” on the horizon: the expansion of ute...,Since the first live birth from uterus transpl...,Elliott G Richards; Cecile A Ferrando; Ruth M ...,
2,A-behind-the-scenes-look-at-retroperitoneal-ec...,A behind-the-scenes look at retroperitoneal ec...,Although ectopic pregnancies are commonly enco...,Sarah Simko; Sadikah Behbehani,
3,A-call-to-action-to-reproductive-endocrinologi...,A call to action to reproductive endocrinologi...,"I began life as a Catholic; however, by the ag...",Nanette Santoro,
4,A-case-report-of-retroperitoneal-ectopic-pregn...,A case report of retroperitoneal ectopic pregn...,Objective: To present a rare case of retroperi...,Jing Yuan; Hui Xie; Muyu Chen; Fei Zeng; Songs...,Infertility; IVF-ET; laparoscopy; lymphatic mi...
5,A-combination-of-two-novel-ligation-techniques...,A combination of two novel ligation techniques...,Objective: To perform laparoscopic myomectomy ...,Shengke Wang; Dongdong Wang; Fujie Zhao,Intraligamental myoma; laparoscopy; loop ligat...
6,A-diagnosis-of-diminished-ovarian-reserve-does...,A diagnosis of diminished ovarian reserve does...,Objective: To estimate the aneuploidy rates in...,Yuval Fouks; Alan Penzias; Werner Neuhausser; ...,Diminished ovarian reserve; poor ovarian respo...
7,A-framework-approach-for-hysteroscopic-uterine...,A framework approach for hysteroscopic uterine...,Objective: To demonstrate safe and efficient t...,Phillip A Romanski; Pietro Bortoletto; Samanth...,Uterine septum; partial septum; complete septu...
8,A-higher-number-of-oocytes-retrieved-is-associ...,A higher number of oocytes retrieved is associ...,Objective: To investigate the association betw...,Michael Fanton; Justina Hyunjii Cho; Valerie L...,Retrospective study. Setting: Retrieval cycles...
9,A-nail-in-the-coffin--the-antim&#xfc;llerian-h...,A nail in the coffin: the antimüllerian hormon...,Imagine a blood test so powerful that it can p...,Bradley S Hurst,


## Categorizing Papers into Balanced Categories

This section assigns each paper to one of four predefined categories using titles, abstracts, and keyword matching and scoring.

In [6]:


BALANCED_CATEGORIES = {
    'ASSISTED_REPRODUCTION': [
        'ivf', 'embryo', 'fertility treatment', 'sperm', 'ovarian', 
        'oocyte', 'antral follicle', 'assisted reproductive', 'fertilization', 
        'preimplantation', 'insemination', 'gonadotrophin', 'amh', 
        'antimüllerian', 'follicle', 'ovarian reserve', 'infertility',
        'subfertility', 'reproductive technology', 'icsi', 'blastocyst',
        'in vitro', 'fertility care', 'fertility center'
    ],
    
    'SURGICAL_PROCEDURES': [
        'laparoscopy', 'hysteroscopy', 'surgical', 'surgery',
        'catheter', 'transplantation', 'ligation', 'myomectomy', 
        'adhesion', 'perioperative', 'postoperative', 
        'procedure', 'operative', 'resection', 'excision', 'dissection',
        'minimally invasive', 'surgical approach'
    ],
    
    'REPRODUCTIVE_CONDITIONS': [
        'endometriosis', 'adenomyosis', 'ectopic pregnancy', 'uterine', 
        'fibroids', 'cervical', 'preterm', 'pregnancy loss',
        'diminished ovarian', 'reproductive disorders', 'miscarriage',
        'disorder', 'pathology', 'complication', 'placenta',
        'gynecologic', 'obstetric', 'maternal', 'fetal', 'pelvic pain',
        'ovarian cyst', 'reproductive health', 'pregnancy', 'birth',
        'gestational', 'prenatal', 'postnatal'
    ],
    
    'CLINICAL_RESEARCH': [
        'systematic review', 'clinical trial', 'meta-analysis', 'assessment',
        'evaluation', 'cohort study', 'research', 'randomized',
        'evidence-based', 'outcomes', 'protocol', 'intervention',
        'clinical practice', 'guidelines', 'database', 'retrospective',
        'cochrane', 'review', 'evidence', 'controlled trial',
        'study design', 'statistical', 'analysis'
    ]
}

def calculate_score(text, keywords):
    if pd.isna(text):
        return 0
    text = str(text).lower()
    score = 0
    for keyword in keywords:
        if keyword in text:
            score += 1
    return score

def assign_category(row):
    scores = {}
    
    # Calculate scores for each category
    for category, keywords in BALANCED_CATEGORIES.items():
        # Weight different fields
        title_score = calculate_score(row['title'], keywords) * 3
        abstract_score = calculate_score(row['abstract'], keywords) * 2
        keyword_score = calculate_score(row['keywords'], keywords) * 2
        
        total_score = title_score + abstract_score + keyword_score
        scores[category] = total_score
    
    # If no scores, assign to least represented category
    if all(score == 0 for score in scores.values()):
        if 'category' in df.columns:
            counts = df['category'].value_counts()
            return min(BALANCED_CATEGORIES.keys(), key=lambda x: counts.get(x, float('inf')))
        return np.random.choice(list(BALANCED_CATEGORIES.keys()))
    
    # Get max score
    max_score = max(scores.values())
    candidates = [cat for cat, score in scores.items() if score == max_score]
    
    # If multiple categories have the same score, use current distribution
    if len(candidates) > 1 and 'category' in df.columns:
        counts = df['category'].value_counts()
        return min(candidates, key=lambda x: counts.get(x, float('inf')))
    
    return candidates[0]

# Initialize categories column
df['category'] = None

# First pass: basic categorization
for idx, row in df.iterrows():
    df.at[idx, 'category'] = assign_category(row)

# Second pass: balance categories
target_count = len(df) // len(BALANCED_CATEGORIES)
current_counts = df['category'].value_counts()

# Identify overrepresented and underrepresented categories
over_represented = [cat for cat in current_counts.index if current_counts[cat] > target_count]
under_represented = [cat for cat in BALANCED_CATEGORIES.keys() if cat not in current_counts or current_counts[cat] < target_count]

# Rebalance if needed
for over_cat in over_represented:
    excess = current_counts[over_cat] - target_count
    if excess > 0 and under_represented:
        # Find articles that could be recategorized
        candidates = df[df['category'] == over_cat].index
        for idx in candidates[:excess]:
            # Recategorize to most underrepresented category
            new_cat = min(under_represented, key=lambda x: current_counts.get(x, 0))
            df.at[idx, 'category'] = new_cat
            # Update counts
            current_counts[over_cat] -= 1
            current_counts[new_cat] = current_counts.get(new_cat, 0) + 1

# Display results
print("\nCategory Distribution:")
print(df['category'].value_counts())
print("\nPercentage Distribution:")
print((df['category'].value_counts() / len(df) * 100).round(1), "%")

# Display sample of categorized articles
print("\nSample of Categorized Articles:")
print(df[['title', 'category']].head(10))

# Save categorized data
df.to_csv('categorized_articles.csv', index=False)


Category Distribution:
category
SURGICAL_PROCEDURES        13
CLINICAL_RESEARCH          13
REPRODUCTIVE_CONDITIONS    12
ASSISTED_REPRODUCTION      12
Name: count, dtype: int64

Percentage Distribution:
category
SURGICAL_PROCEDURES        26.0
CLINICAL_RESEARCH          26.0
REPRODUCTIVE_CONDITIONS    24.0
ASSISTED_REPRODUCTION      24.0
Name: count, dtype: float64 %

Sample of Categorized Articles:
                                               title             category
0       Fertility and Sterility top videos from 2021  SURGICAL_PROCEDURES
1  A “first” on the horizon: the expansion of ute...  SURGICAL_PROCEDURES
2  A behind-the-scenes look at retroperitoneal ec...  SURGICAL_PROCEDURES
3  A call to action to reproductive endocrinologi...    CLINICAL_RESEARCH
4  A case report of retroperitoneal ectopic pregn...  SURGICAL_PROCEDURES
5  A combination of two novel ligation techniques...  SURGICAL_PROCEDURES
6  A diagnosis of diminished ovarian reserve does...  SURGICAL_PROCEDURES
7  

In [7]:
df.head()

Unnamed: 0,file,title,abstract,authors,keywords,category
0,-em-Fertility-and-Sterility--em--top-videos-fr...,Fertility and Sterility top videos from 2021,Objective: To objectively grade all video publ...,Joseph Findley; Zaraq Khan; John Preston Parry...,Reproductive surgery; top 10; video publicatio...,SURGICAL_PROCEDURES
1,A-&#x201c;first&#x201d;-on-the-horizon--the-ex...,A “first” on the horizon: the expansion of ute...,Since the first live birth from uterus transpl...,Elliott G Richards; Cecile A Ferrando; Ruth M ...,,SURGICAL_PROCEDURES
2,A-behind-the-scenes-look-at-retroperitoneal-ec...,A behind-the-scenes look at retroperitoneal ec...,Although ectopic pregnancies are commonly enco...,Sarah Simko; Sadikah Behbehani,,SURGICAL_PROCEDURES
3,A-call-to-action-to-reproductive-endocrinologi...,A call to action to reproductive endocrinologi...,"I began life as a Catholic; however, by the ag...",Nanette Santoro,,CLINICAL_RESEARCH
4,A-case-report-of-retroperitoneal-ectopic-pregn...,A case report of retroperitoneal ectopic pregn...,Objective: To present a rare case of retroperi...,Jing Yuan; Hui Xie; Muyu Chen; Fei Zeng; Songs...,Infertility; IVF-ET; laparoscopy; lymphatic mi...,SURGICAL_PROCEDURES


## Encoding Categories into Numeric Labels

To prepare the data for machine learning models, the categorical `category` column is converted into numeric labels using `LabelEncoder`.


In [8]:
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])

# Print label mapping for reference
print("Label Mapping:")
for i, category in enumerate(label_encoder.classes_):
    print(f"{category}: {i}")

Label Mapping:
ASSISTED_REPRODUCTION: 0
CLINICAL_RESEARCH: 1
REPRODUCTIVE_CONDITIONS: 2
SURGICAL_PROCEDURES: 3


In [9]:
df.head()

Unnamed: 0,file,title,abstract,authors,keywords,category,label
0,-em-Fertility-and-Sterility--em--top-videos-fr...,Fertility and Sterility top videos from 2021,Objective: To objectively grade all video publ...,Joseph Findley; Zaraq Khan; John Preston Parry...,Reproductive surgery; top 10; video publicatio...,SURGICAL_PROCEDURES,3
1,A-&#x201c;first&#x201d;-on-the-horizon--the-ex...,A “first” on the horizon: the expansion of ute...,Since the first live birth from uterus transpl...,Elliott G Richards; Cecile A Ferrando; Ruth M ...,,SURGICAL_PROCEDURES,3
2,A-behind-the-scenes-look-at-retroperitoneal-ec...,A behind-the-scenes look at retroperitoneal ec...,Although ectopic pregnancies are commonly enco...,Sarah Simko; Sadikah Behbehani,,SURGICAL_PROCEDURES,3
3,A-call-to-action-to-reproductive-endocrinologi...,A call to action to reproductive endocrinologi...,"I began life as a Catholic; however, by the ag...",Nanette Santoro,,CLINICAL_RESEARCH,1
4,A-case-report-of-retroperitoneal-ectopic-pregn...,A case report of retroperitoneal ectopic pregn...,Objective: To present a rare case of retroperi...,Jing Yuan; Hui Xie; Muyu Chen; Fei Zeng; Songs...,Infertility; IVF-ET; laparoscopy; lymphatic mi...,SURGICAL_PROCEDURES,3


## Combining Text Features into a Single Column

To streamline text processing, the `title`, `abstract`, and `keywords` columns are merged into a new column called `combined_text`.

In [10]:
# Combine title, abstract, and keywords into a single column using .loc
df.loc[:, 'combined_text'] = df['title'] + ' ' + df['abstract'] + ' ' + df['keywords']

# Replace NaN values in the combined text with an empty string using .loc
df.loc[:, 'combined_text'] = df['combined_text'].fillna('')

# Verify the new column
df.head()

Unnamed: 0,file,title,abstract,authors,keywords,category,label,combined_text
0,-em-Fertility-and-Sterility--em--top-videos-fr...,Fertility and Sterility top videos from 2021,Objective: To objectively grade all video publ...,Joseph Findley; Zaraq Khan; John Preston Parry...,Reproductive surgery; top 10; video publicatio...,SURGICAL_PROCEDURES,3,Fertility and Sterility top videos from 2021 O...
1,A-&#x201c;first&#x201d;-on-the-horizon--the-ex...,A “first” on the horizon: the expansion of ute...,Since the first live birth from uterus transpl...,Elliott G Richards; Cecile A Ferrando; Ruth M ...,,SURGICAL_PROCEDURES,3,A “first” on the horizon: the expansion of ute...
2,A-behind-the-scenes-look-at-retroperitoneal-ec...,A behind-the-scenes look at retroperitoneal ec...,Although ectopic pregnancies are commonly enco...,Sarah Simko; Sadikah Behbehani,,SURGICAL_PROCEDURES,3,A behind-the-scenes look at retroperitoneal ec...
3,A-call-to-action-to-reproductive-endocrinologi...,A call to action to reproductive endocrinologi...,"I began life as a Catholic; however, by the ag...",Nanette Santoro,,CLINICAL_RESEARCH,1,A call to action to reproductive endocrinologi...
4,A-case-report-of-retroperitoneal-ectopic-pregn...,A case report of retroperitoneal ectopic pregn...,Objective: To present a rare case of retroperi...,Jing Yuan; Hui Xie; Muyu Chen; Fei Zeng; Songs...,Infertility; IVF-ET; laparoscopy; lymphatic mi...,SURGICAL_PROCEDURES,3,A case report of retroperitoneal ectopic pregn...


## Splitting the Dataset into Training and Testing Sets

To prepare the data for model training and evaluation, the dataset is divided into training and testing sets.

In [11]:
# Select the important features
df_selected = df[['combined_text', 'label']]

# Split the data into training and validation sets (80% train, 20% validation)
train_df, test_df = train_test_split(
    df, 
    test_size=0.2, 
    random_state=42
)
# Preview the resulting splits
print(f"Training set size: {len(train_df)}")
print(f"Testing set size: {len(test_df)}")

Training set size: 40
Testing set size: 10


In [12]:
train_df.head()

Unnamed: 0,file,title,abstract,authors,keywords,category,label,combined_text
12,Abdel-Aleem_et_al-2022-Cochrane_Database_of_Sy...,Cervical pessary for preventing preterm birth ...,"Preterm birth (PTB), defined as birth prior to...",Hany Abdel-Aleem; Omar M Shaaban; Mahmoud A Ab...,,REPRODUCTIVE_CONDITIONS,2,Cervical pessary for preventing preterm birth ...
4,A-case-report-of-retroperitoneal-ectopic-pregn...,A case report of retroperitoneal ectopic pregn...,Objective: To present a rare case of retroperi...,Jing Yuan; Hui Xie; Muyu Chen; Fei Zeng; Songs...,Infertility; IVF-ET; laparoscopy; lymphatic mi...,SURGICAL_PROCEDURES,3,A case report of retroperitoneal ectopic pregn...
37,Barrowclough_et_al-2022-Cochrane_Database_of_S...,Maternal postures for fetal malposition in lab...,Fetal malposition (occipito-posterior and pers...,Jennifer A Barrowclough; Luling Lin; Bridget K...,,REPRODUCTIVE_CONDITIONS,2,Maternal postures for fetal malposition in lab...
8,A-higher-number-of-oocytes-retrieved-is-associ...,A higher number of oocytes retrieved is associ...,Objective: To investigate the association betw...,Michael Fanton; Justina Hyunjii Cho; Valerie L...,Retrospective study. Setting: Retrieval cycles...,CLINICAL_RESEARCH,1,A higher number of oocytes retrieved is associ...
3,A-call-to-action-to-reproductive-endocrinologi...,A call to action to reproductive endocrinologi...,"I began life as a Catholic; however, by the ag...",Nanette Santoro,,CLINICAL_RESEARCH,1,A call to action to reproductive endocrinologi...


In [13]:
test_df.head()

Unnamed: 0,file,title,abstract,authors,keywords,category,label,combined_text
13,Absolute-uterine-infertility-a-cornelian-dilem...,Absolute uterine infertility a cornelian dilem...,"Absolute uterine factor infertility (AUFI), be...",Mats Brännström; Catherine Racowsky; Elliott G...,,REPRODUCTIVE_CONDITIONS,2,Absolute uterine infertility a cornelian dilem...
39,Bergman_et_al-2019-Cochrane_Database_of_System...,Comparison of different human papillomavirus (...,Comparison of di erent human papillomavirus (HPV,Hanna Bergman; Brian S Buckley; Gemma Villanue...,,CLINICAL_RESEARCH,1,Comparison of different human papillomavirus (...
30,Association-of-endometriosis-and-adenomyosis-w...,Association of endometriosis and adenomyosis w...,We performed a comprehensive narrative synthes...,Paolo Vercellini; Paola Viganò; Veronica Bandi...,Endometriosis; adenomyosis; infertility; pregn...,REPRODUCTIVE_CONDITIONS,2,Association of endometriosis and adenomyosis w...
45,Boomsma_et_al-2019-Cochrane_Database_of_System...,Semen preparation techniques for intrauterine ...,3 Multiple pregnancy rate per couple.....,Carolien M Boomsma; Ben J Cohlen; Cindy Farquhar,,REPRODUCTIVE_CONDITIONS,2,Semen preparation techniques for intrauterine ...
17,Ahmad_et_al-2020-Cochrane_Database_of_Systemat...,Barrier agents for adhesion prevention after g...,No studies reported our primary outcomes of pe...,Gaity Ahmad; Kyungmin Kim; Matthew Thompson; P...,Informed decisions; Better health Trusted evid...,CLINICAL_RESEARCH,1,Barrier agents for adhesion prevention after g...


## Loading BioBERT for Sequence Classification

In this step, we load the BioBERT model, which is specifically fine-tuned for biomedical text, for sequence classification.


In [14]:
# Load BioBERT tokenizer and TensorFlow model
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-v1.1')

# Load configuration for the BioBERT model with specific parameters
config = BertConfig.from_pretrained(
    'dmis-lab/biobert-v1.1',
    num_labels=4,
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3
)

# Load model with config
model = TFBertForSequenceClassification.from_pretrained(
    'dmis-lab/biobert-v1.1', 
    config=config,
    from_pt=True
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Freezing Initial Layers of BioBERT

To speed up training and reduce computational requirements, we freeze the first 6 layers of the BioBERT model. Freezing these layers prevents them from being updated during training, allowing the model to focus on learning from the later layers.


In [15]:
for layer in model.bert.encoder.layer[:6]:  # Freeze first 6 layers
    layer.trainable = False

## Tokenizing Text Data and Preparing TensorFlow Datasets

In this step, we tokenize the text data and convert it into a format suitable for training a TensorFlow model.


In [16]:
# Tokenize the text data
train_encodings = tokenizer(train_df['combined_text'].tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_df['combined_text'].tolist(), truncation=True, padding=True, max_length=128)

# Convert to TensorFlow Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_df['label'].values
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_df['label'].values
))

## Batching and Shuffling the Dataset

To improve training efficiency and model generalization, the datasets are shuffled and batched.


In [17]:
# Batch and shuffle the dataset
train_dataset = train_dataset.shuffle(buffer_size=1000).batch(4)
test_dataset = test_dataset.batch(4)

## Compiling the Model

In this step, we compile the BioBERT model with an optimizer, loss function, and evaluation metrics.


In [18]:
# Compile the model
model.compile(
    optimizer=Adam(learning_rate=5e-5),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

## Defining Callbacks for Model Training

To improve training efficiency and prevent overfitting, we define two callbacks: `EarlyStopping` and `ReduceLROnPlateau`.


In [19]:
# Define the callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',       # Metric to monitor
    patience=5,               # Number of epochs with no improvement to stop training
    restore_best_weights=True # Restore weights of the best model
)

reduce_lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss',       # Metric to monitor
    factor=0.5,               # Factor to reduce the learning rate
    patience=2,               # Number of epochs with no improvement to reduce learning rate
    min_lr=1e-7               # Minimum learning rate
)

## Training the Model with Callbacks

The model is trained using the previously defined training and validation datasets, with the added benefit of callbacks for early stopping and learning rate adjustment.


In [20]:
# Train the model with callbacks
history = model.fit(
    train_dataset,
    epochs=20,
    validation_data=test_dataset,
    callbacks=[early_stopping, reduce_lr_on_plateau],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20


## Evaluating Model Performance with Classification Report

After training the model, we evaluate its performance using a modified classification report that includes precision, recall, and F1-score for each class.


In [21]:
# Get predictions and true labels with modified classification report
y_pred = []
y_true = []

for batch in test_dataset:
    logits = model.predict(batch[0], verbose=0)['logits']
    predictions = tf.argmax(logits, axis=1)
    y_pred.extend(predictions.numpy())
    y_true.extend(batch[1].numpy())

# classification report with zero_division parameter
print("\nClassification Report:")
print(classification_report(
    y_true, 
    y_pred,
    target_names = ['ASSISTED_REPRODUCTION', 'CLINICAL_RESEARCH', 'REPRODUCTIVE_CONDITIONS', 'SURGICAL_PROCEDURES'],
    zero_division=0  # Explicitly handle zero division
))


Classification Report:
                         precision    recall  f1-score   support

  ASSISTED_REPRODUCTION       1.00      1.00      1.00         2
      CLINICAL_RESEARCH       0.67      1.00      0.80         4
REPRODUCTIVE_CONDITIONS       1.00      0.25      0.40         4
    SURGICAL_PROCEDURES       0.00      0.00      0.00         0

               accuracy                           0.70        10
              macro avg       0.67      0.56      0.55        10
           weighted avg       0.87      0.70      0.68        10

