# **Classification Notebook (NovaSBE X GregoryAI)**

![Classification pipeline diagram](../images/classify_pipeline_diagram.png)

## 1. Import libraries

In [1]:
import os
import sys
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
import gdown

# Add the parent directory of code_utils to the Python path
sys.path.append(os.path.abspath(os.path.join('..')))

# Suppress warnings
warnings.filterwarnings("ignore")

from code_utils.text_utils import *  # Import everything from text_utils.py
from code_utils.model_utils.LSTM_algorithm_utils import *  
from code_utils.model_utils.BERT_algorithm_utils import *  
from code_utils.model_utils.LGBM_algorithm_utils import *  
from code_utils.model_utils.classify_model_choose import *
from code_utils.download_utils import * 

## 2. Download articles

In [3]:
# Load the previous data.
# The path is built with os.path.join so the notebook runs on any OS
# (the former hard-coded '..\\data\\...' form only resolved on Windows).
old_articles_path = os.path.join('..', 'data', 'articles_08-06-2024_14h13m04s.csv')
url = 'https://gregory-ms.com/developers/articles.zip'
# Returns the full training set and the subset of articles that are new
# compared with `old_articles_path`.
# NOTE(review): the meaning of the 'max' argument is defined in download_utils - confirm there.
train_df, inference_df = download_and_extract_zip(url, old_articles_path, 'max')

DataFrame saved to data\2024-06-10\train_articles.csv
DataFrame saved to data\2024-06-10\inference_articles.csv


The code above calls the download function, which checks for new article_ids (by comparing against an older version of the dataset) and then returns two datasets, also saving them as CSV files in the data folder:

- train_df - the complete dataset, to be used in training
- inference_df - the new articles, to be used in classification

In [4]:
# (rows, columns) of the new-articles frame; zero rows means there is nothing new to classify
inference_df.shape

(0, 15)

In this case, inference_df has 0 rows because it was compared against the latest version of the dataset, so there were no new articles at the time of this test. In the cells below, to test and show how the pipeline works, we instead classify a previously downloaded file, articles_08-06-2024_14h13m04s.csv.

## 3. Load, clean and pre-process articles

**TEST CELL**

This cell is used only to test the code; the user can later delete it and use the cell below instead.

In [8]:
# Previously downloaded snapshot, used here only to exercise the pipeline
test_snapshot_path = '../data/articles_08-06-2024_14h13m04s.csv'
articles_clean_df = load_and_format_dataset(test_snapshot_path, text_cleaning_pd_series)
articles_clean_df.head()

Unnamed: 0_level_0,text_processed,relevant
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,covid19 hhv6 mog antibody perfect storm j neur...,0
2,migraine associated brain anatomical alteratio...,0
3,patient satisfaction quality counseling provid...,0
4,rare case spinal neurosarcoidosis concomitant ...,unlabeled
5,evaluation urinary tract infection following c...,0


In [None]:
# Day folder (under ../data) whose inference set should be classified
day_folder = '../data/2024-06-10'
dataset_path = os.path.join(day_folder, 'inference_articles.csv')

# Load the CSV and apply the text-cleaning function to it
articles_clean_df = load_and_format_dataset(dataset_path, text_cleaning_pd_series)

articles_clean_df.head()

In [9]:
# Number of articles per value of the `relevant` column
articles_clean_df['relevant'].value_counts()

relevant
unlabeled    22050
0             1532
1             1008
Name: count, dtype: int64

**Filter to keep Unlabelled Data**

In [10]:
# Keep only the rows whose `relevant` value is the string 'unlabeled'
is_unlabeled = articles_clean_df['relevant'].eq('unlabeled')
unlabeled_articles = articles_clean_df.loc[is_unlabeled]

# Model input (cleaned text) and the matching identifiers taken from the index
unlabeled_texts = unlabeled_articles['text_processed']
article_ids = unlabeled_articles.index

## 4. Classify Articles (predict Labels for Unlabeled Articles)

In the cells below you will find the three alternative models available to run the classification. Choose the model you want by indexing its name in the list models_available.

Note that the weights we trained for the BERT model have to be downloaded from Google Drive, since their size (nearly 400 MB) exceeds the GitHub repository capacity.

In [15]:
# Download the BERT model weights from Google Drive, unless a copy is already on disk

# URL of the file on Google Drive
google_drive_url = 'https://drive.google.com/uc?id=1c1DTtuWKIJnlXaca_w8NPN3Qa5TnbyqX'

# Destination directory and file name
model_path_bert_classifier = '../models/Pubmed_BERT128_model_weights.h5'

# Create the models directory if it is missing (exist_ok makes this a no-op otherwise)
os.makedirs(os.path.dirname(model_path_bert_classifier), exist_ok=True)

# The weights file is large (~438 MB), so only download it when it is not already present.
# Delete the local file to force a fresh download.
if not os.path.exists(model_path_bert_classifier):
    gdown.download(google_drive_url, model_path_bert_classifier, quiet=False)

# Show where the weights are stored
model_path_bert_classifier

Downloading...
From (original): https://drive.google.com/uc?id=1c1DTtuWKIJnlXaca_w8NPN3Qa5TnbyqX
From (redirected): https://drive.google.com/uc?id=1c1DTtuWKIJnlXaca_w8NPN3Qa5TnbyqX&confirm=t&uuid=03e07ca2-8253-4166-ba79-1dbbff15152c
To: c:\Users\lnpg1\Desktop\NOVASBE\GREGORY_AI\Pi\GregoryAIxNovaSBE\models\Pubmed_BERT128_model_weights.h5
100%|██████████| 438M/438M [00:09<00:00, 46.9MB/s] 


'../models/Pubmed_BERT128_model_weights.h5'

In [16]:
# Folder that holds the trained weights; edit the file names below to switch checkpoints
models_dir = '../models'

# One weight file per classifier (see the models folder)
model_path_lgbm_classifier = f'{models_dir}/lgbm_classifier.joblib'
model_path_lstm_classifier = f'{models_dir}/model_bidir_1_no_reg.keras'
model_path_bert_classifier = f'{models_dir}/Pubmed_BERT128_model_weights.h5'

# Vectorizer file; change it according to the model you want to use
vectorizer_path = f'{models_dir}/tfidf_vectorizer.joblib'

In [17]:
# Available models and, at the same position, the weight file of each one
models_available = ['LSTM_Classifier', 'BERT_Classifier', 'LGBM_TFIDF_Classifier']
model_paths = [model_path_lstm_classifier, model_path_bert_classifier, model_path_lgbm_classifier]

# Choose here the model you want to use: a single index selects both the
# model name and its weight file, so the two can never get out of sync.
model_index = 2
model_name = models_available[model_index]
model_path = model_paths[model_index]

# Predict labels for the unlabeled articles with the selected model.
# The variable keeps the name `predicted_labels_lgbm` because later cells read it,
# whichever model is selected.
predicted_labels_lgbm = predict_with_model(model_name, model_path, vectorizer_path, unlabeled_texts)

Model loaded: vectorizer from ../models/tfidf_vectorizer.joblib, classifier from ../models/lgbm_classifier.joblib


  File "c:\Users\lnpg1\anaconda3\envs\gregoryai\lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


## 5. Store the results

In [None]:
# Create a new DataFrame with article IDs and predicted labels
# Pair every article ID with its predicted label (column named after the chosen model),
# then replace the index with a fresh 0..n-1 range
results_df = pd.DataFrame(
    {
        'article_id': article_ids,
        f'{model_name}_pred': predicted_labels_lgbm,
    }
).reset_index(drop=True)

In [None]:
# Attach the predictions to the full article table.
# To use the day-folder inference file instead, read `dataset_path` here.
source_csv_path = '../data/articles_08-06-2024_14h13m04s.csv'
df = pd.read_csv(source_csv_path)

# Left join keeps every article; rows without a prediction get NaN in the prediction column
new_articles_df = df.merge(results_df, how='left', on='article_id')

new_articles_df.head()

Unnamed: 0.1,Unnamed: 0,article_id,title,summary,link,published_date,discovery_date,source,publisher,container_title,authors,relevant,doi,access,takeaways,categories,LGBM_TFIDF_Classifier_pred
0,0,1,"COVID-19, HHV6 and MOG antibody: A perfect storm",J Neuroimmunol. 2021 Feb 12;353:577521. doi: 1...,https://pubmed.ncbi.nlm.nih.gov/33607505/?fc=2...,2021-04-14,2021-02-23,8.0,Elsevier BV,Journal of Neuroimmunology,"Ali Fadhil, Ankita Prasad, Anthony Zampino, Fa...",False,10.1016/j.jneuroim.2021.577521,open,First case of HHV6 reactivation in central ne...,,
1,1,2,Is Migraine Associated to Brain Anatomical Alt...,Brain Topogr. 2021 Feb 19. doi: 10.1007/s10548...,https://pubmed.ncbi.nlm.nih.gov/33606142/?fc=2...,2021-01-05,2021-02-23,8.0,Springer Science and Business Media LLC,Brain Topography,"Anne Caclin, Aurélie Bidet-Caulet, David Meuni...",False,10.1007/s10548-021-00824-6,open,Growing number of studies investigate brain a...,,
2,2,3,Patient Satisfaction With the Quality of Couns...,J Neurosci Nurs. 2021 Feb 17. doi: 10.1097/JNN...,https://pubmed.ncbi.nlm.nih.gov/33605649/?fc=2...,2021-03-31,2021-02-23,8.0,Ovid Technologies (Wolters Kluwer Health),Journal of Neuroscience Nursing,"Daniela Händler-Schuster, Diana Zanolari, Gabr...",False,10.1097/JNN.0000000000000578,restricted,The challenges in dealing with multiple scler...,,
3,3,4,Rare Case of Spinal Neurosarcoidosis with Conc...,Case Rep Neurol Med. 2021 Jan 28;2021:5952724....,https://pubmed.ncbi.nlm.nih.gov/33604089/?fc=2...,2021-01-28,2021-02-23,8.0,Hindawi Limited,Case Reports in Neurological Medicine,"Achraf Makki, Maria Khoueiry, Nesreen Jaafar, ...",,10.1155/2021/5952724,open,Spinal neurosarcoidosis is a rare disease tha...,,0.0
4,4,5,Evaluation of Urinary Tract Infection followin...,Can J Infect Dis Med Microbiol. 2021 Jan 31;20...,https://pubmed.ncbi.nlm.nih.gov/33603936/?fc=2...,2021-01-31,2021-02-23,8.0,Hindawi Limited,Canadian Journal of Infectious Diseases and Me...,"Aliyeh Bazi, Monireh Ghazaeian, Narjes Hendoie...",False,10.1155/2021/6616763,open,Double-blind randomized clinical trial was co...,,


In [None]:
# Export the new DataFrame to a CSV file into the folder articles_classification,
# with name formating as follows: articles_predictions_{model_name}_{current_date}_{current_time}.csv

def get_current_date_time():
    """Return the current local date and time as filename-friendly strings.

    Returns:
        tuple[str, str]: ``(date, time)`` formatted as ``YYYY-MM-DD`` and
        ``HH-MM-SS``, both taken from the same ``datetime.now()`` reading.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H-%M-%S')
    date_part, time_part = stamp.split(' ')
    return date_part, time_part

# Read the clock once so the date and time parts describe the same instant
# (two separate calls could straddle midnight and produce a mismatched stamp).
current_date, current_time = get_current_date_time()

output_filename = f'articles_predictions_{model_name}_{current_date}_{current_time}.csv'

# to_csv does not create missing folders, so make sure the target directory exists
output_dir = '../articles_classification'
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, output_filename)

# Write the articles together with their predictions, without the DataFrame index
new_articles_df.to_csv(output_path, index=False)