In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import annoy
from gensim.models import FastText
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
import string



In [None]:
class TrialsModel:
    """Keyword-based nearest-neighbour search over a table of clinical trials.

    ``train`` reads a pickled DataFrame, embeds each row as the mean FastText
    vector of its keywords and stores those vectors in an Annoy index.
    ``get_trials`` embeds a free-text query the same way and formats the
    closest rows as HTML links.

    Attributes set by ``train`` (and required by ``get_trials``):
        morpher, sw, exclude: tools used by ``preprocess_txt``.
        vec_size: dimensionality of the FastText vectors.
        modelFT: the trained FastText model.
        ft_index: the built Annoy index; item ids are row *positions*.
        trials: the DataFrame the index was built from.
    """

    def preprocess_txt(self, line):
        """Turn a raw string into a list of normalized, non-stop-word tokens.

        Punctuation characters (``self.exclude``) are removed, the text is
        split on whitespace, each token is lower-cased and replaced by the
        normal form of its first morphological parse, and stop words
        (``self.sw``) are dropped.

        Args:
            line: text to process. Non-string values (e.g. NaN/None coming
                from a DataFrame column) yield an empty list.

        Returns:
            list[str]: the remaining tokens, in their original order.
        """
        if not isinstance(line, str):
            return []
        no_punct = "".join(ch for ch in line.strip() if ch not in self.exclude)
        lemmas = [
            self.morpher.parse(token.lower())[0].normal_form
            for token in no_punct.split()
        ]
        return [word for word in lemmas if word and word not in self.sw]

    def _mean_vector(self, words):
        """Return the mean FastText vector of *words*, or None if none are known."""
        vectors = self.modelFT.wv
        total = np.zeros(self.vec_size)
        known = 0
        for word in words:
            if word in vectors:
                total += vectors[word]
                known += 1
        if known == 0:
            return None
        return total / known

    def train(self, filename, *, vec_size=500, n_trees=10):
        """Build the FastText model and the Annoy index from a pickled table.

        The DataFrame must provide the columns ``brief_title`` and
        ``all_keywords`` (``nct_id`` is additionally read by ``get_trials``).
        ``all_keywords`` is concatenated with the title tokens using ``+``,
        so it presumably holds a list of strings per row -- verify against
        the dataset.

        Args:
            filename: path of the pickle file, e.g.
                'studies_with_keywords.pickle'.
            vec_size: FastText vector dimensionality.
            n_trees: number of trees passed to ``AnnoyIndex.build``.
        """
        print("Reading dataset")
        # SECURITY NOTE: unpickling executes arbitrary code; only load files
        # that come from a trusted source.
        trials = pd.read_pickle(filename)

        self.morpher = MorphAnalyzer()
        self.sw = set(get_stop_words("en"))
        self.exclude = set(string.punctuation)

        print("Preprocessing brief titles")
        trials['words'] = trials['brief_title'].apply(self.preprocess_txt)

        print("Updating keywords")
        # Element-wise ``+`` on the object arrays concatenates the per-row
        # lists; the set round-trip then removes duplicate keywords.
        merged = trials['all_keywords'].values + trials['words'].values
        trials['all_keywords'] = merged
        trials['all_keywords'] = trials['all_keywords'].apply(set).apply(list)

        self.vec_size = vec_size

        print("Creating FT model")
        self.modelFT = FastText(
            sentences=trials['all_keywords'].tolist(),
            vector_size=self.vec_size,
            min_count=1,
            window=5,
        )

        print("Creating index")
        ft_index = annoy.AnnoyIndex(self.vec_size, 'angular')

        # Item ids are row positions (not index labels) because
        # ``get_trials`` resolves the hits with ``.iloc``.
        for position, keywords in enumerate(trials['all_keywords']):
            vector_ft = self._mean_vector(keywords)
            if vector_ft is not None:
                ft_index.add_item(position, vector_ft)

        print("Building index")
        ft_index.build(n_trees)

        self.ft_index = ft_index
        self.trials = trials

    def get_trials(self, text, *, top_n=5):
        """Return an HTML snippet linking to the trials closest to *text*.

        Args:
            text: free-text query.
            top_n: number of neighbours requested from the index.

        Returns:
            str | None: a header line followed by one ``<a>`` link per hit,
            or None when no token of *text* has a vector in the model.
        """
        vector_ft = self._mean_vector(self.preprocess_txt(text))
        if vector_ft is None:
            return None

        positions = self.ft_index.get_nns_by_vector(vector_ft, top_n)

        nct_ids = self.trials['nct_id']
        titles = self.trials['brief_title']
        # NOTE(review): titles are interpolated into HTML unescaped, as in
        # the original implementation -- confirm whether the consumer needs
        # them escaped.
        links = [
            f'<a href="https://clinicaltrials.gov/ct2/show/{nct_ids.iloc[pos]}">'
            f'{titles.iloc[pos]}</a>\n'
            for pos in positions
        ]
        header = 'I have found some clinical trials that might be related to this:\n\n'
        return header + "".join(links)