## Importing Libraries

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline


## Getting Data for repo (Note: downloading 20k rct data to find the best preforming model)



In [3]:
##Getting Train data for rct 20k
! wget https://raw.githubusercontent.com/average-joe25/scientific-abstract-classifier/main/data/PubMed_20k_RCT_numbers_replaced_with_at_sign/train.txt
##Getting validation data for rct 20k
! wget https://raw.githubusercontent.com/average-joe25/scientific-abstract-classifier/main/data/PubMed_20k_RCT_numbers_replaced_with_at_sign/dev.txt
## Getting test data for rct 20k
! wget https://raw.githubusercontent.com/average-joe25/scientific-abstract-classifier/main/data/PubMed_20k_RCT_numbers_replaced_with_at_sign/test.txt

--2022-05-02 05:44:44--  https://raw.githubusercontent.com/average-joe25/scientific-abstract-classifier/main/data/PubMed_20k_RCT_numbers_replaced_with_at_sign/train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28908792 (28M) [text/plain]
Saving to: ‘train.txt’


2022-05-02 05:44:45 (170 MB/s) - ‘train.txt’ saved [28908792/28908792]

--2022-05-02 05:44:45--  https://raw.githubusercontent.com/average-joe25/scientific-abstract-classifier/main/data/PubMed_20k_RCT_numbers_replaced_with_at_sign/dev.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting respon

In [4]:
## moving them to new folder
! mkdir rct20k
! mv dev.txt rct20k/
! mv train.txt rct20k/
! mv test.txt rct20k/

## EDA and visualization of Data

In [5]:
##Getting lines from the files
def get_data(dataset_type,rct_type):
  parent_folder='/content/'
  with open(str(parent_folder+rct_type+"/"+dataset_type),'r') as f:
    list_lines= f.readlines()
    return list_lines

In [6]:
train_lines_20k=get_data("train.txt","rct20k")
test_lines_20k=get_data("test.txt","rct20k")
val_lines_20k=get_data("dev.txt","rct20k")

In [7]:
len(train_lines_20k),len(test_lines_20k),len(val_lines_20k)

(210040, 35135, 35212)

In [8]:
## function to create dataframe from lines
def convert_to_df(lines):
  abstract_size=0
  dict_list=[]
  abstract=[]
  list_abstract=[]
  for line in lines:
    if(line.startswith("###")):
      if(len(abstract)!=0):
        list_abstract.append(abstract)
      abstract=[]
    else:
      abstract.append(line)
  list_abstract.append(abstract)
  for new_abstract in list_abstract:
    count=0
    for line in new_abstract:
      count+=1
      splitted=line.split('\t')
      if(splitted[0]=='\n'):
        continue
      category=splitted[0]
      text=splitted[1]
      dict_list.append({'target':category,'text':text,'location':float(count/len(new_abstract))})
  list_df=pd.DataFrame(dict_list)
  return list_df


In [9]:
##convert lines to dataframe
train_df=convert_to_df(train_lines_20k)
val_df=convert_to_df(val_lines_20k)
test_df=convert_to_df(test_lines_20k)

In [10]:
test_df.head(15)

Unnamed: 0,target,text,location
0,BACKGROUND,This study analyzed liver function abnormaliti...,0.1
1,RESULTS,A post hoc analysis was conducted with the use...,0.2
2,RESULTS,Liver function tests ( LFTs ) were measured at...,0.3
3,RESULTS,Survival analyses were used to assess the asso...,0.4
4,RESULTS,The percentage of patients with abnormal LFTs ...,0.5
5,RESULTS,When mean hemodynamic profiles were compared i...,0.6
6,RESULTS,Multivariable analyses revealed that patients ...,0.7
7,CONCLUSIONS,Abnormal LFTs are common in the ADHF populatio...,0.8
8,CONCLUSIONS,Elevated MELD-XI scores are associated with po...,0.9
9,BACKGROUND,Minimally invasive endovascular aneurysm repai...,0.071429


## Modelling(Baseline)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create a pipeline
model_0 = Pipeline([
  ("tf-idf", TfidfVectorizer()),
  ("clf", MultinomialNB())
])

# Fit the pipeline to the training data
model_0.fit(X=train_df['text'], y=train_df['target']);


In [12]:
# Evaluate baseline on validation dataset
model_0.score(X=val_df['text'],y=val_df['target'])


0.7218323844829869

## Model Evaluation

In [30]:
from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score
def measure(pred,value):
  eva_dict={'accuracy':accuracy_score(value,pred),'precision':precision_score(value,pred,average='weighted'),'recall':recall_score(value,pred,average='weighted'),'f1_score':f1_score(value,pred,average='weighted')}
  print(eva_dict)


In [14]:
preds=model_0.predict(test_df['text'])

array(['METHODS', 'METHODS', 'METHODS', ..., 'RESULTS', 'RESULTS',
       'RESULTS'], dtype='<U11')

In [31]:
measure(preds,test_df['target'].to_numpy())

{'accuracy': 0.716674962667994, 'precision': 0.7124322482375202, 'recall': 0.716674962667994, 'f1_score': 0.6925101703856846}
