In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.5.1-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 6.5 MB/s eta 0:00:01
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.2-cp36-cp36m-manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.4 MB/s eta 0:00:01
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 51.2 MB/s eta 0:00:01
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [6]:
import os
import sys
# science
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
# process
import joblib
from tqdm import tqdm
from functools import partial
# ml
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump, load
import gc 
import time
import pickle
from sklearn.cluster import KMeans

In [7]:
# Boolean switch; no cell visible in this notebook reads it.
# NOTE(review): presumably gated a KMeans clustering step (KMeans is imported above) — confirm before removing.
KMEANS = False

In [8]:
# Handle for the first CUDA device (index 0); the bare name echoes it as the cell output.
device = torch.device("cuda", 0)
device

device(type='cuda', index=0)

In [9]:
# Read the dataset from the current working directory and preview the first three rows.
# NOTE(review): the preview shows sequences such as "â¢", which looks like UTF-8 text
# decoded with a single-byte codec — confirm the encoding of final_data.csv.
df = pd.read_csv('final_data.csv')
df.head(3)

Unnamed: 0,ICD Block Names,Title,Research Summary,Inclusion Criteria
0,Abnormal findings on diagnostic imaging and in...,Checkpoint inhibitor-induced liver injury (ChI...,â¢ Immune checkpoint inhibitors are proven ca...,Both patient groups and control group: Aged 1...
1,"Abnormal findings on examination of blood, wit...",Exploring the patient experience of a diagnosi...,Research Question: Exploring the patient exper...,â Coded diagnosis of pre-diabetes and have b...
2,"Abnormal findings on examination of blood, wit...",Assessment of the Impact of a Personalised Nut...,This study will determine if DNA-based dietary...,In order to be eligible to participate in this...


In [10]:
# Per-row class label: the "ICD Block Names" column of the loaded frame.
true_labels = df["ICD Block Names"] 

In [11]:
# Minimum number of rows a label needs in order to be kept.
# A value of 0 keeps every label; raise it to drop rare classes.
MIN_SAMPLES_PER_LABEL = 0

# Count each label once up front instead of recomputing value_counts() on every
# loop iteration. value_counts() has one entry per distinct label, so the
# resulting list contains no duplicates.
label_counts = true_labels.value_counts()
labels_to_keep = label_counts[label_counts >= MIN_SAMPLES_PER_LABEL].index.tolist()

In [12]:
# Keep only rows whose label is in labels_to_keep. Rows with a missing label are
# dropped too, because value_counts() (used to build labels_to_keep) skips NaN.
# .copy() makes the result an independent frame, so the prediction columns that
# are added to `df` further down do not raise SettingWithCopyWarning.
df = df[df["ICD Block Names"].isin(labels_to_keep)].copy()

In [13]:
true_labels = df["ICD Block Names"]
# Sort the distinct labels so the label -> integer id mapping is the same on
# every run. Iterating a set of strings yields a different order in each
# interpreter session (string hash randomisation), which would renumber the
# classes, change the seeded train/validation split, and make the ids predicted
# by a reloaded classifier impossible to map back to label names.
unique_labels = sorted(set(true_labels.values))
# One dictionary lookup per row instead of a linear list.index() scan.
label_to_id = {label: label_id for label_id, label in enumerate(unique_labels)}
useful_labels = [label_to_id[label] for label in true_labels]
print("Number of unique classes:", len(unique_labels))

Number of unique classes: 224


In [14]:
# Pull the three free-text columns out as strings; after the transpose each row
# of `content` holds one column of the frame.
text_columns = ["Title", "Research Summary", "Inclusion Criteria"]
content = df[text_columns].values.T.astype(str)
title_content, abstract_content, inclusion_content = content

In [15]:
# Sanity check: (rows, columns) of the frame at this point.
df.shape

(12263, 4)

In [16]:
# TF-IDF features of the inclusion-criteria text, reduced to 128 PCA components.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(inclusion_content)
# PCA needs dense input. Densify straight to one NumPy array instead of going
# through np.matrix -> nested Python lists -> DataFrame, which built several
# full-size copies and left two of them alive after the `del` below.
inclusion_tfidf = vectors.toarray()

# Fixed random_state: for a matrix of this size PCA picks the randomized SVD
# solver, so an unseeded fit gives a different projection on every run.
pca = PCA(n_components=128, random_state=42)
inclusion_tfidf_low_dims  = pca.fit_transform(inclusion_tfidf)
# Release the dense matrix; only the 128-d projection is needed from here on.
del inclusion_tfidf
gc.collect()

0

In [17]:
# Persist the currently bound `vectorizer` and `pca`. Both names are rebound by
# later cells, so this must run right after the cell that fits the
# inclusion-criteria models. `with` closes each file handle as soon as the
# pickle is written instead of leaving it to the garbage collector.
with open('tfidf_inclusion.sav', 'wb') as fh:
    pickle.dump(vectorizer, fh)
with open('pca_inclusion.sav', 'wb') as fh:
    pickle.dump(pca, fh)

In [18]:
# TF-IDF features of the title text, reduced to 128 PCA components.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(title_content)
# PCA needs dense input. Densify straight to one NumPy array instead of going
# through np.matrix -> nested Python lists -> DataFrame, which built several
# full-size copies and left two of them alive after the `del` below.
title_tfidf = vectors.toarray()

# Fixed random_state: for a matrix of this size PCA picks the randomized SVD
# solver, so an unseeded fit gives a different projection on every run.
pca = PCA(n_components=128, random_state=42)
title_tfidf_low_dims  = pca.fit_transform(title_tfidf)
# Release the dense matrix; only the 128-d projection is needed from here on.
del title_tfidf
gc.collect()

0

In [19]:
# Persist the currently bound `vectorizer` and `pca`. Both names are rebound by
# later cells, so this must run right after the cell that fits the title
# models. `with` closes each file handle as soon as the pickle is written
# instead of leaving it to the garbage collector.
with open('tfidf_title.sav', 'wb') as fh:
    pickle.dump(vectorizer, fh)
with open('pca_title.sav', 'wb') as fh:
    pickle.dump(pca, fh)

In [20]:
# TF-IDF features of the research-summary text, reduced to 128 PCA components.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(abstract_content)
# PCA needs dense input. Densify straight to one NumPy array instead of going
# through np.matrix -> nested Python lists -> DataFrame, which built several
# full-size copies and left two of them alive after the `del` below.
abstract_tfidf = vectors.toarray()

# Fixed random_state: for a matrix of this size PCA picks the randomized SVD
# solver, so an unseeded fit gives a different projection on every run.
pca = PCA(n_components=128, random_state=42)
abstract_low_dims  = pca.fit_transform(abstract_tfidf)
# Release the dense matrix; only the 128-d projection is needed from here on.
del abstract_tfidf
gc.collect()

0

In [21]:
# Persist the currently bound `vectorizer` and `pca`. Both names are rebound by
# other cells, so this must run right after the cell that fits the
# research-summary models. `with` closes each file handle as soon as the pickle
# is written instead of leaving it to the garbage collector.
with open('tfidf_abstract.sav', 'wb') as fh:
    pickle.dump(vectorizer, fh)
with open('pca_abstract.sav', 'wb') as fh:
    pickle.dump(pca, fh)

In [26]:
# Stack the three 128-d per-field projections side by side and reduce the
# combined representation back down to 128 dimensions.
final_rep = np.concatenate([title_tfidf_low_dims,inclusion_tfidf_low_dims,abstract_low_dims], axis=-1)
# Fixed random_state: with 128 components requested from an input this large,
# PCA uses the randomized SVD solver, so an unseeded fit is not reproducible.
pca       = PCA(n_components=128, random_state=42)
low_dims  = pca.fit_transform(final_rep)

In [36]:
# Persist the currently bound `pca` (the one fitted on the concatenated
# features, provided no other PCA cell was re-run in between). `with` closes the
# file handle as soon as the pickle is written.
with open('pca_tfidf.sav', 'wb') as fh:
    pickle.dump(pca, fh)

In [27]:
# build sample classifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.multiclass import OneVsRestClassifier

# Zero-argument factory for the classifier: an RBF-kernel SVC with probability
# estimates enabled and class weights balanced.
svc_settings = dict(C=100, kernel="rbf", probability=True, class_weight="balanced")
clf_algo = partial(SVC, **svc_settings)
# Other factories that were considered:
#   partial(KNeighborsClassifier, metric="cosine")
#   BernoulliNB

In [32]:
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

from sklearn.ensemble import AdaBoostClassifier

import pickle


import warnings 
# Suppress every warning category from this point on; this also hides
# deprecation and convergence warnings raised by the libraries used below.
warnings.simplefilter('ignore')

In [33]:
# Fraction of each class's rows that is held out for validation.
val_proportion = 0.15
# Seed NumPy's global RNG so the np.random.choice draws in the split are repeatable.
np.random.seed(42)

In [34]:
# Per-class (stratified) train/validation split over row positions.
np_labels = np.array(useful_labels)
classes = np.unique(np_labels)
# Row positions of every sample, grouped by class id.
idxs = {c: np.argwhere(np_labels == c).ravel() for c in classes}
idxs_set = {c: {"train": [], "val": []} for c in classes}

# Draws come from NumPy's global RNG, which is seeded beforehand, so the split
# is the same on every run as long as the class ids are stable.
for c in classes:
    # int() truncates, so very small classes contribute no validation rows.
    amount = int(len(idxs[c]) * val_proportion)
    choices = np.random.choice(idxs[c], size=amount, replace=False)
    if len(choices) < len(idxs[c]):
        # assign the choices to validation and the rest to training
        idxs_set[c]["val"] = choices.tolist()
        # Set membership keeps this linear in the class size (a list here made
        # it quadratic); iteration order of the training rows is unchanged.
        held_out = set(idxs_set[c]["val"])
        idxs_set[c]["train"] = [i for i in idxs[c] if i not in held_out]
    else:
        # Never send a whole class to validation: keep it for training instead
        # of silently dropping its rows from both sets.
        idxs_set[c]["train"] = idxs[c].tolist()

# merge indexes for all classes (kept as lists, in class-id order)
train_idxs, val_idxs = [], []
for c in classes:
    val_idxs.extend(idxs_set[c]["val"])
    train_idxs.extend(idxs_set[c]["train"])

# slice features and labels according to the split
features = np.array(low_dims)
x_train, y_train = features[train_idxs], np_labels[train_idxs]
x_val, y_val     = features[val_idxs], np_labels[val_idxs]

# finally, check sizes:
print("Size of training data is: ", len(x_train))
print("Size of validation data is: ", len(x_val))

Size of training data is:  10530
Size of validation data is:  1733


In [35]:
def _top_k_hits(model, features, targets, n_classes, k):
    """Rank the classes of every row by predicted probability.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` and ``classes_``.
    features : rows to score.
    targets : true class id of each row, aligned with ``features``.
    n_classes : size of the full class-id space.
    k : number of best-ranked classes to keep per row.

    Returns
    -------
    (probabilities, top_k, hits) where ``probabilities`` is the raw
    ``predict_proba`` output, ``top_k`` is a list with the ``k`` best class ids
    per row (best first) and ``hits`` is a list of booleans telling whether the
    true class is among them.
    """
    probabilities = model.predict_proba(features)
    top_k, hits = [], []
    for target, row in zip(targets, probabilities):
        # predict_proba columns follow model.classes_; scatter them into the
        # full id space so ids the model never saw keep a score of 0.
        scores = np.zeros(n_classes)
        scores[model.classes_] = row
        best = np.argsort(scores)[::-1][:k]
        top_k.append(best)
        hits.append(target in best)
    return probabilities, top_k, hits


def _first_position(indices):
    """Map each value of ``indices`` to the position of its first occurrence.

    Gives the same answer as ``indices.index(value)`` with a constant-time
    lookup instead of a linear scan.
    """
    positions = {}
    for position, value in enumerate(indices):
        positions.setdefault(int(value), position)
    return positions


# get an estimate of how well it'd do on unseen data
clf = clf_algo()
begin_train = time.time()
clf.fit(x_train, y_train)
end_train = time.time()
print('Training_time :',end_train-begin_train)

begin = time.time()
TOP_K = 3
# top-k ranking for training and validation data
pred_probs_train, top_k_preds_train, in_or_out_train = _top_k_hits(
    clf, x_train, y_train, len(unique_labels), TOP_K
)
pred_probs_val, top_k_preds_val, in_or_out_val = _top_k_hits(
    clf, x_val, y_val, len(unique_labels), TOP_K
)

# print results
print("A more realistic estimate on training data can be:", np.mean(clf.predict(x_train) == y_train), "in top 1")
print("A more realistic estimate on training data can be:", np.mean(in_or_out_train), "in top", TOP_K, "\n")
print("A more realistic estimate on validation data can be:", np.mean(clf.predict(x_val) == y_val), "in top 1")
print("A more realistic estimate on validation data can be:", np.mean(in_or_out_val), "in top", TOP_K)

# write to file
# Look-up tables replace the `i in val_idxs` / `list.index(i)` scans, which
# made this loop quadratic in the number of rows.
val_position = _first_position(val_idxs)
train_position = _first_position(train_idxs)
is_train = []
top_1, top_2, top_3 = [], [], []
for i in range(len(useful_labels)):
    if i in val_position:
        is_train.append("val")
        guesses = top_k_preds_val[val_position[i]]
    else:
        is_train.append("train")
        guesses = top_k_preds_train[train_position[i]]
    # translate class ids back to label names (three guesses are exported)
    top_1.append(unique_labels[guesses[0]])
    top_2.append(unique_labels[guesses[1]])
    top_3.append(unique_labels[guesses[2]])

df["first_guess"]  = top_1
df["second_guess"] = top_2
df["third_guess"]  = top_3
df["is_training"] = is_train
df.to_csv("with_predictions.csv")
end = time.time()
print(end - begin)

Training_time : 112.02108383178711
A more realistic estimate on training data can be: 0.798005698005698 in top 1
A more realistic estimate on training data can be: 0.9696106362773029 in top 3 

A more realistic estimate on validation data can be: 0.6664743219849971 in top 1
A more realistic estimate on validation data can be: 0.9226774379688402 in top 3
177.20954990386963


In [38]:
# Persist the fitted classifier; `with` closes the file handle as soon as the
# pickle is written. The model predicts integer class ids, so the id -> label
# name list (`unique_labels`) is needed alongside it to interpret predictions.
with open('clf_tfidf.sav', 'wb') as fh:
    pickle.dump(clf, fh)