# Preprocessing Notebook

## Import libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

## Load dataset

In [6]:
# Load the raw challenge export; fields in this CSV are separated by semicolons.
raw_csv_path = '../data/raw/challenge_data-18-ago.csv'
med_articles_raw_df = pd.read_csv(raw_csv_path, delimiter=';')

# Preview the first rows to confirm the file parsed into the expected columns.
med_articles_raw_df.head(n=5)

Unnamed: 0,title,abstract,group
0,Adrenoleukodystrophy: survey of 303 cases: bio...,Adrenoleukodystrophy ( ALD ) is a genetically ...,neurological|hepatorenal
1,endoscopy reveals ventricular tachycardia secrets,Research question: How does metformin affect c...,neurological
2,dementia and cholecystitis: organ interplay,Purpose: This randomized controlled study exam...,hepatorenal
3,The interpeduncular nucleus regulates nicotine...,Partial lesions were made with kainic acid in ...,neurological
4,guillain-barre syndrome pathways in leukemia,Hypothesis: statins improves stroke outcomes v...,neurological


## Exploration

In [7]:
# Summarize the raw frame: row count, column names, non-null counts and dtypes.
med_articles_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3565 entries, 0 to 3564
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     3565 non-null   object
 1   abstract  3565 non-null   object
 2   group     3565 non-null   object
dtypes: object(3)
memory usage: 83.7+ KB


In [8]:
# Descriptive statistics for the columns (all object dtype here, so pandas
# reports count, number of unique values, most frequent value and its frequency).
med_articles_raw_df.describe()

Unnamed: 0,title,abstract,group
count,3565,3565,3565
unique,3563,3565,15
top,State-of-the-art thiazide diuretics for prosta...,Aim: To investigate aspirin effects on heart d...,neurological
freq,2,1,1058


In [9]:
# Count missing values per column (isna is the canonical name for isnull).
med_articles_raw_df.isna().sum()

title       0
abstract    0
group       0
dtype: int64

In [10]:
# Count rows that are exact duplicates of an earlier row across ALL columns.
# NOTE(review): this does not detect rows that repeat only the title; the
# describe() output reports 3563 unique titles for 3565 rows, so a repeated
# title exists even though this whole-row check returns 0 - confirm whether
# title-level duplicates need handling.
med_articles_raw_df.duplicated().sum()

np.int64(0)

## Data Cleaning

In [26]:
def _to_lower_if_str(value):
    """Return `value` lowercased when it is a string; otherwise return it unchanged."""
    if isinstance(value, str):
        return value.lower()
    return value


# Normalize casing element-wise across every column of the raw frame.
med_articles_proc_df = med_articles_raw_df.map(_to_lower_if_str)

In [27]:
# Preview the first rows after lowercasing.
med_articles_proc_df.head(n=5)

Unnamed: 0,title,abstract,group
0,adrenoleukodystrophy: survey of 303 cases: bio...,adrenoleukodystrophy ( ald ) is a genetically ...,neurological|hepatorenal
1,endoscopy reveals ventricular tachycardia secrets,research question: how does metformin affect c...,neurological
2,dementia and cholecystitis: organ interplay,purpose: this randomized controlled study exam...,hepatorenal
3,the interpeduncular nucleus regulates nicotine...,partial lesions were made with kainic acid in ...,neurological
4,guillain-barre syndrome pathways in leukemia,hypothesis: statins improves stroke outcomes v...,neurological


In [28]:
# Multi-hot encode the pipe-separated `group` labels: one 0/1 column per label.
#
# The label lists are derived from the raw frame (lowercased the same way as the
# processed frame) instead of overwriting `med_articles_proc_df['group']` in
# place, and any indicator columns left over from a previous run are dropped
# before concatenating. This keeps the cell idempotent: re-running it no longer
# raises KeyError: 'group' and never appends duplicate indicator columns.
# MultiLabelBinarizer is imported in the notebook's import cell.
group_labels = med_articles_raw_df['group'].str.lower().str.split('|')

mlb = MultiLabelBinarizer()

group_dummies = pd.DataFrame(
    mlb.fit_transform(group_labels),
    columns=mlb.classes_,
    index=med_articles_proc_df.index,
)

# Keep only the non-label columns; errors='ignore' lets this work both on the
# first run ('group' present) and on re-runs (indicator columns present).
text_columns_df = med_articles_proc_df.drop(
    columns=['group', *mlb.classes_],
    errors='ignore',
)

med_articles_proc_df = pd.concat([text_columns_df, group_dummies], axis=1)

In [29]:
# Preview the first rows after the group labels were expanded into indicator columns.
med_articles_proc_df.head(n=5)

Unnamed: 0,title,abstract,cardiovascular,hepatorenal,neurological,oncological
0,adrenoleukodystrophy: survey of 303 cases: bio...,adrenoleukodystrophy ( ald ) is a genetically ...,0,1,1,0
1,endoscopy reveals ventricular tachycardia secrets,research question: how does metformin affect c...,0,0,1,0
2,dementia and cholecystitis: organ interplay,purpose: this randomized controlled study exam...,0,1,0,0
3,the interpeduncular nucleus regulates nicotine...,partial lesions were made with kainic acid in ...,0,0,1,0
4,guillain-barre syndrome pathways in leukemia,hypothesis: statins improves stroke outcomes v...,0,0,1,0


In [31]:
# Persist the processed frame as a semicolon-delimited UTF-8 CSV, omitting the row index.
med_articles_proc_df.to_csv(
    path_or_buf='../data/processed/medical_articles_data_processed.csv',
    sep=';',
    encoding='utf-8',
    index=False,
)