# Training Notebook

## Import libraries

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
from sklearn.svm import LinearSVC

## Load processed dataset

In [2]:
med_articles_proc_df = pd.read_csv('../data/processed/medical_articles_data_processed.csv', sep=';')
med_articles_proc_df.head()

Unnamed: 0,title,abstract,cardiovascular,hepatorenal,neurological,oncological
0,adrenoleukodystrophy: survey of 303 cases: bio...,adrenoleukodystrophy ( ald ) is a genetically ...,0,1,1,0
1,endoscopy reveals ventricular tachycardia secrets,research question: how does metformin affect c...,0,0,1,0
2,dementia and cholecystitis: organ interplay,purpose: this randomized controlled study exam...,0,1,0,0
3,the interpeduncular nucleus regulates nicotine...,partial lesions were made with kainic acid in ...,0,0,1,0
4,guillain-barre syndrome pathways in leukemia,hypothesis: statins improves stroke outcomes v...,0,0,1,0


## Prepare data for training

In [3]:
med_articles_proc_df['text'] = med_articles_proc_df['title'] + ' ' + med_articles_proc_df['abstract']
X = med_articles_proc_df['text']
y = med_articles_proc_df.drop(columns=['title', 'abstract', 'text'])

## Split data into training and testing sets

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Vectorize text data

In [5]:
vectorizer = TfidfVectorizer(max_features=5000,
                             ngram_range=(1,2),
                             stop_words='english'
                             )

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [8]:
X_train_vec.shape, X_test_vec.shape

((2852, 5000), (713, 5000))

## Train model

In [19]:
model_lr = OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight='balanced'))
model_lr.fit(X_train_vec, y_train)

0,1,2
,estimator,LogisticRegre...max_iter=1000)
,n_jobs,
,verbose,0

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [29]:
model_svm = OneVsRestClassifier(LinearSVC(max_iter=1000, class_weight='balanced'))
model_svm.fit(X_train_vec, y_train)

0,1,2
,estimator,LinearSVC(cla...ht='balanced')
,n_jobs,
,verbose,0

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,verbose,0


## Evaluate model

In [24]:
y_pred_lr = model_lr.predict(X_test_vec)
print(classification_report(y_test, y_pred_lr, zero_division=0))

              precision    recall  f1-score   support

           0       0.95      0.88      0.91       260
           1       0.95      0.78      0.86       228
           2       0.87      0.88      0.87       338
           3       0.82      0.86      0.84       130

   micro avg       0.90      0.85      0.87       956
   macro avg       0.89      0.85      0.87       956
weighted avg       0.90      0.85      0.87       956
 samples avg       0.93      0.90      0.89       956



In [30]:
y_pred_svm = model_svm.predict(X_test_vec)
print(classification_report(y_test, y_pred_svm, zero_division=0))

              precision    recall  f1-score   support

           0       0.95      0.88      0.91       260
           1       0.96      0.82      0.88       228
           2       0.89      0.88      0.89       338
           3       0.81      0.81      0.81       130

   micro avg       0.91      0.86      0.88       956
   macro avg       0.90      0.85      0.87       956
weighted avg       0.91      0.86      0.88       956
 samples avg       0.93      0.90      0.90       956




## Save model and vectorizer

In [31]:
joblib.dump(model_lr, '../models/multilabel_logistic_regression_model.joblib')

['../models/multilabel_logistic_regression_model.joblib']

In [32]:
joblib.dump(model_svm, '../models/multilabel_svm_model.joblib')

['../models/multilabel_svm_model.joblib']

In [33]:
joblib.dump(vectorizer, '../models/tfidf_vectorizer.joblib')

['../models/tfidf_vectorizer.joblib']