In [20]:
import numpy as np
import pandas as pd
from numpy import dot
from numpy.linalg import norm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
import gensim
from gensim.models import Word2Vec
from gensim.models import FastText

In [109]:
def cos_sim(a,b):
    return round(dot(a,b)/(norm(a)*norm(b)),3)

In [6]:
# Text Preprocessing
# function to remove all urls 
def remove_urls(text):
    new_text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^A-Z0-9a-z \t])|(\w+:\/\/\S+)"," ",text).split())
    return new_text

# make text lower case
def lower_case(text):
    text = text.lower()
    return text

# remove numbers
def remove_numbers(text):
    result = re.sub(r'\d+','',text)
    return result

# remove punctuation
def remove_punctuation(text):
    translator = str.maketrans('','',string.punctuation)
    text = text.translate(translator)
    return text

# tokenize
def tokenize(text):
    text = word_tokenize(text)
    return text

# remove stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    text = [i for i in text if i not in stop_words]
    return text

# lemmatize text
lemmatizer = WordNetLemmatizer()
def lemmatize(text):
    text = [lemmatizer.lemmatize(token) for token in text]
    return text

# preprocessing the text
def preprocessing(text):

    text = lower_case(text)
    text = remove_urls(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = tokenize(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    text = ' '.join(text)
    
    return text

In [18]:
vector_size = 100

In [21]:
skipgram_100 = Word2Vec.load("skipgram_100.bin")
fast_n_100 = Word2Vec.load("fast_n_100.bin")

In [34]:
def get_mean_vector(model,words):
    words = [word for word in word_tokenize(words) if word in list(model.wv.index_to_key)] #Add the word to list of words if found in vocab
    if len(words)>=1:
        return np.mean(model.wv[words], axis=0)
    else:
        return np.zeros([vector_size])

In [129]:
# input query processing
def preprocessing_input(query,model):
    query = preprocessing(query)
    query=query.replace('\n',' ')
    k = get_mean_vector(model,query)
    return k



In [37]:
pd.set_option("display.max_colwidth", -1)

  pd.set_option("display.max_colwidth", -1)


In [38]:
with open("dimensions-covid19-export-2021-09-01-h15-01-02_clinical_trials.csv","r",encoding="utf-8",errors="ignore") as f:
    content = f.read()

In [39]:
from io import StringIO
df = pd.read_csv(StringIO(content))

  df = pd.read_csv(StringIO(content))


In [44]:
df1 = df[["Date added","Trial ID","Title","Brief title","Acronym","Abstract"]]

In [45]:
df1.head()

Unnamed: 0,Date added,Trial ID,Title,Brief title,Acronym,Abstract
0,9/1/2021,NCT05029245,"The 8-week, Prospective, Randomized Controlled of IntraDermal Administration of Comirnaty® 6 Microgram Versus Intramuscular Comirnaty® 30 Microgram by 28 Days Interval Efficacy Study in Healthy Volunteer",IntraDermal Versus Intramuscular Comirnaty® Efficacy Study,PRIDE,"The 8-week, Prospective, Randomized controlled of IntraDermal administration of Comirnaty® 6 microgram compare to Intramuscular Comirnaty® 30 microgram by 28 days interval Efficacy Study in 4 groups of healthy volunteer ( 1 people who complete sinovac vaccination 2 people who received 1 dosage of AstraZeneca vaccine 3 naive vaccination 4 any other vaccination not in 1-3 with anti Spike antibody less than 650 AU/ ml) . Comparison of antibody level and T cell response to SAR-CoV-2 antigen in vitro after 28 day post vaccination is primary outcome and the side effect as well as infection rate in 8 weeks is secondary outcomes.\n\nDetailed Description\nThe 8-week, Prospective, Randomized controlled of IntraDermal administration of Comirnaty® 6 microgram compare to Intramuscular Comirnaty® 30 microgram by 28 days interval Efficacy Study in healthy volunteer.To compare the AntiSpike antibody, ( Anti RBD ) neutralized antibody ( if possible) of SAR-CoV-2 and T-cell response after injection with Intradermal Comirnaty® 6 microgram versus Intramuscular Comirnaty® 30 microgram by 28 days interval in healthy volunteer in various immunological background groups.1000 patients with or with out vaccinated and with our without history of previous COVID-19 infection (in various immunological background ) will be recruited and received Comirnaty® 6 microgram versus Intramuscular Comirnaty® 30 microgram by 28 days intervalInclusion Criteria:Signed informed consent by any patient capable of giving consent, or, when the patient is not capable of giving consent, by his or her legal/authorized representatives prior to initiation of any study procedures.Men and women, ≥18 years of age at time of enrollment.Able to follow up the vaccination schedule.Exclusion Criteria:Patient with known hypersensitivity or intolerance to Comirnaty® or Polyethylene glycol (PEG).Patient with previous receiveing mRNA vaccine ( Pfizer, Moderna or other).Pregnancy with gestational age less than 12 weeks.Patient with History of immunosuppessive drug ( oral , IV, IM ) of which discontinue less than 6 month or any immunological abnormality which impact to Antibody production and T cell function ( eg hypergammaglobulinemia, active immne deficiency).Patient with previous used of Intravenous immunoglobulin in previous 6 monthPatient with history of abnormal coagulation or contraindication for intramuscular injection or intradermal injection.Patient with end stage disease or disease with life expectancy less than 2 yearsPatient with previous use of medication interfere with serum interferon other cytokine system or disease with cytokine abnormalities.Patient with history of abnormal platelet or platelet dysfunction, blood coagulopathy abnormality.Patient with active pulmonary tuberculosis or systemic tuberculosis, atypical non mycobacterium tuberculosis.Primary efficacy: To compare the AntiSpike antibody, ( Anti RBD ) neutralized antibody ( if possible) of SAR-CoV-2 and T-cell response after injection with Intradermal Comirnaty® 6 microgram versus Intramuscular Comirnaty® 30 microgram by 28 days interval in healthy volunteer in various immunological background groups.Secondary efficacy: Comparesion of infection rate in each arm."
1,9/1/2021,NCT05029037,High-dose Intravenous Vitamin C (HDIVC) as Adjuvant Therapy in Critical Patients With Positive COVID-19. A Pilot Randomized Controlled Dose-comparison Trial.,High-dose Intravenous Vitamin C (HDIVC) as Adjuvant Therapy in Critical Patients With Positive COVID-19. A Pilot Randomized Controlled Dose-comparison Trial.,HDIVC,"The objective of this study is to evaluate the impact of this HDIVC therapy in the first treatment of symptomatic Covid-19 patients in a time period of one week.\n\nDetailed Description\nWe consider that treatment with high doses of injectable vitamin C HDIVC could have a positive impact as an adjunct on the immunity of patients with Covid while reducing the possibility of worsening their clinical picture. Similarly, we wish to evaluate the efficacy of this treatment, for one week, in patients hospitalized in the ICU, in order to reduce the inflammatory burden and reduce hospital stay."
2,9/1/2021,NCT05029011,Low-cost Sensor System for COVID-19 Patient Monitoring: Validation of MediByte VTS,Low-cost Sensor System for COVID-19 Patient Monitoring: Validation of MediByte VTS,,"The MediByte VTS is a low-cost, portable device that is being developed to take continuous and real-time vital sign measurements of COVID-19 patients, both in the hospital and home setting. This study is being undertaken to test the MediByte VTS and validate against industry standards.\n\nDetailed Description\nIn response to the COVID-19 pandemic, the MediByte Vital Signs Monitor (VTS) has been developed with funding from the National Research Council of Canada Industrial Research Assistance Program through the federal government's Innovative Solutions Canada (ISC) program. The VTS will measure peripheral capillary oxygen saturation (SpO2) and pulse via photoplethysmography (PPG), temperature, blood pressure (BP), heart and respiration rates, electrocardiogram (ECG), head position and movement over a continuous period. This monitor can be used remotely, and importantly will be low-cost so it can be accessed widely. Data from the VTS is transmitted wirelessly to a portable basestation which is a smartphone or tablet device via a customized application. This study will test the VTS both within the clinical environment and in the remote home environment to see how well it monitors patients effected by COVID-19 needing care, as well as satisfying an unmet need for remote monitoring across the healthcare system.This study will compare the newly developed VTS with an industry gold standard of polysomnography (PSG) in the sleep laboratory, or in the home with the MediByte Jr (home sleep apnea test).Approximately 150 patients will be recruited to the study across two different settings which includes a hospital setting and a remote setting, within the home environment."
3,9/1/2021,NCT05028998,The Impact of COVID-19-related Medication Assisted Treatment Policy Changes on Care and Outcomes for Patients With Opioid Use Disorder,Covid-related Opioid Treatment Policy Evaluation,COPE,"Our nation is facing the COVID-19 pandemic during an ongoing opioid epidemic. Effective treatment for patients with opioid use problems involves a treatment method called Medication-Assisted Treatment, or MAT. In MAT, patients receive a medication that reduces cravings and withdrawal symptoms and can prevent overdose. Patients also receive counseling. Because the medications that are used in MAT are controlled substances, this treatment is subject to a number of federal regulations. The need for social-distancing during the pandemic would have made following these regulations very difficult for patients and their providers. Because of these difficulties, the federal government eased regulations in March 2020, making it easier for patients to receive MAT with fewer (if any) in-person visits for medication and counseling. Our team is studying the effects of these policy changes on the treatment that patients with opioid use disorder receive and on their outcomes. We are using both quantitative analyses of large, existing databases and qualitative analyses of interviews with patients, providers, and policy-makers to study these effects.\n\nDetailed Description\nThe COVID-19 disease outbreak has occurred in the midst of a national opioid crisis, and poses significant risk for individuals with opioid use disorder (OUD). If existing in-person care delivery systems continued, patients would need to choose between risking exposure to the virus, or foregoing OUD treatment. Medication-assisted treatment (MAT), the gold-standard for treating OUD, involves daily medication (i.e., methadone or buprenorphine), close medication monitoring, and counseling sessions, all typically occurring in person. The medications used are schedule II and III controlled substances and are subject to greater federal regulations than medications for other substance use disorders (SUD), such as alcohol use disorder (AUD).3 To temper the impact of COVID-19 on OUD patients, in March 2020 the federal government temporarily, but dramatically, loosened MAT restrictions to expand treatment options, require fewer in-person visits, and prevent disruption to life-saving treatment.This rapid shift in policy created a natural experiment, allowing for the evaluation of this MAT policy intervention on OUD patient care and outcomes. To examine the unknown effects of this intervention, we propose a mixed-methods, naturalistic experimental design involving quantitative analysis of large administrative and healthcare utilization datasets to evaluate the impact of MAT policy changes on patient care and outcomes. We will also compare OUD patient outcomes to those of AUD patients (analogue comparison group), for whom treatment was unaffected by MAT policy changes. We will then conduct interviews with patients, providers, and key MAT policy stakeholders, to understand perspectives on the impact of these COVID-19 related MAT policy changes on the lives and well-being of OUD patients, and guide policy decisions regarding whether or not to make these changes permanent. Given the general impact of the COVID-19 pandemic on patients and systems, we will compare outcomes for patients with OUD to analogue AUD patients, for whom there were no comparable medication policy changes in response to COVID-19.Patients with OUD across three healthcare systems will be engaged in all steps of the research, including influencing the research design, assisting in determining key variables for Aims 1 and 2, collaborating in drafting our interview scripts for Aim 3, and assisting in interpreting our results and disseminating findings to patient stakeholders.Aim 1. Using existing datasets, examine the effect of federal regulation changes on trends in delivery of MAT for OUD before and after pandemic onset, with a particular focus on prescription access, refills, and dosing schedules, as well as rates of in-person vs. telehealth medical and counseling visits.Aim 2. Estimate the impact of the changes in OUD healthcare delivery on crucial patient outcomes (e.g., emergency department visits, detoxification, treatment retention, relapse, overdose, and mortality) by comparing patients with OUD vs. AUD (clinical analogue comparison group), across time (pre- and post-MAT policy changes).Aim 3. Through in-depth, qualitative interviews and analyses, characterize patient, provider, and decision-maker perspectives on the impact of MAT policy changes in response to COVID-19 on patient access to MAT, health, functioning, and well-being.Built-in reporting milestones will expedite data sharing to guide policy, provider, and patient decision-making as health care systems determine how to prepare for future pandemics, and post-COVID-19 pandemic care for OUD patients."
4,9/1/2021,NCT05028881,Serological Responses to SARS-CoV-2 and Their Temporal Pattern in HIV Infected Persons,Hong Kong HIV SARS-CoV-2 Serology,,"Immunodeficiency associated with human immunodeficiency virus (HIV) infection could predispose people living with HIV/AIDS (PLHA) to defective serological responses following infection or vaccination. To evaluate the health outcomes of SARS-CoV-2/HIV co-infection, PLHA and HIV-uninfected persons in Hong Kong are invited to join a study for understanding their clinical characteristics and for tracking their levels of antibodies against SARS-CoV-2 over a one-year observation period after infection or vaccination. The results could inform the development of prevention and control strategy for PLHA in response to the emerging coronavirus threats.\n\nDetailed Description\nThe aim of the study is to evaluate the health outcomes of SARS-CoV-2 co-infection in people living with HIV/AIDS (PLHA) in Hong Kong, with the specific objectives of (a) describing the clinical and immunological characteristics of COVID-19 in PLHA; (b) tracking the CD4/CD8 changes following SARS-CoV-1 infection; (c) assessing the temporal changes of SARS-CoV-2 serology profile of PLHA following SARS-CoV-2 transmission and vaccination.This is a descriptive study involving the analyses of data derived from the testing of PLHA and non-infected controls at different time-points, following SARS-CoV-2 infection or vaccination, in conjunction with routinely collected clinical data in the setting of Hong Kong.The total number of subjects to be recruited is 800, of which 50 would be HIV/SARS-CoV-2 co-infected persons. In order that their serological responses to SARS-CoV-2 could be interpreted in perspective, 400 HIV uninfected adults would be recruited for comparison. Separately, 400 PLHA and 50 healthy adults who have received SARS-CoV-2 vaccination would be recruited to form another control group.Blood sampling would be performed upon diagnosis of COVID-19 disease when a SARS-CoV-2 infected person is hospitalised for treatment, or after vaccination. This would be repeated after discharge for hospitalised patients and on follow-up at the following time-points: 3, 6, 12, 18 and 24 months. Plasma would be separated from the collected blood samples and stored at -20°C before testing. The levels of antibody to SARS-CoV-2 nucleocapsid and spike protein would be measured using ELISA method, while surrogate virus neutralisation test (sVNT) would be performed to track the changes of seroprotection. ."


In [50]:
df1.shape

(12126, 6)

In [48]:
# Load the average vectors of the abstract
vector_skipgram_100 = pd.read_csv("method2_k_skipgram_100.csv")

In [76]:
# preprocessing the average vectors of the abstract
vector_skipgram_100 = vector_skipgram_100.T

In [80]:
skipgram_100_vectors = []
for i in range(df1.shape[0]):
    skipgram_100_vectors.append(vector_skipgram_100[i].values)

In [130]:
# Function to retrieve top n similar results
def top_n(query,p,df1,model):

    query = preprocessing_input(query,model) #preprocessing the input

    x=[]
    # calculate the cosine similarities of input query with all the vector abstracts
    for i in range(len(p)):
        x.append(cos_sim(query,p[i]))

    temp = list(x)

    # sort the list to find the top n similar results
    res = sorted(range(len(x)),key=lambda sub: x[sub])[-5:] #This code gets the position of the highest similarity values

    simi = [temp[i] for i in reversed(res)] #This line is to get the similarity values based on the position
    print(simi)

    L = []
    for i in reversed(res):
        L.append(i)

    return df1.iloc[L,[0,1,2,3,5]],simi
    

In [134]:
# Skipgram model results
query = "patient"
p = skipgram_100_vectors
model = skipgram_100
Results,sim = top_n(query,p,df1,model)
Results

  return round(dot(a,b)/(norm(a)*norm(b)),3)


[0.805, 0.804, 0.792, 0.782, 0.782]


Unnamed: 0,Date added,Trial ID,Title,Brief title,Abstract
6142,9/19/2020,NCT04555096,"A Pilot, Randomized, Placebo-Controlled Trial of GC4419 (Avasopasem Manganese) in Patients With Critical Illness Due to SARS-CoV-2 Infection (COVID-19)",A Trial of GC4419 in Patients With Critical Illness Due to COVID-19,A Trial of GC4419 in Patients with Critical Illness due to COVID-19
5713,10/20/2020,NCT04591704,The Effect of Diabetes Mellitus on the Morbidity and Mortality Rates in Patients With COVID-19,The Effect of Diabetes Mellitus on the Prognosis of Patients With COVID-19,"All hospitalised patients with COVID-19 who have positive RT-PCR for SARS-COV-2 will be included in the study.The patients will be divided into two groups, as diabetics and non-diabetics. The COVID-19 patients' medical records will be evaluated and compared in terms of the duration of hospitalization, the presence of lung involvement in Computerised Tomography, the need for intensive care unit and mortality rates in patients with and without diabetes.\n\nDetailed Description\nAll hospitalised patients with COVID-19 who have positive RT-PCR for SARS-COV-2 will be included in the study.The patients will be divided into two groups, as diabetics and non-diabetics. The diagnosis of diabetes mellitus will be extracted from the medical records and medical history of the patients hospitalised with COVID-19.The COVID-19 patients medical records will be evaluated and compared in terms of the duration of hospitalization, the presence of lung involvement in Computerised Tomography, the need for intensive care unit and mortality rates in patients with and without diabetes."
10621,4/16/2020,NCT04346797,"CORIMUNO19-ECU: Trial Evaluating Efficacy and Safety of Eculizumab (Soliris) in Patients With COVID-19 Infection, Nested in the CORIMUNO-19 Cohort","CORIMUNO19-ECU: Trial Evaluating Efficacy and Safety of Eculizumab (Soliris) in Patients With COVID-19 Infection, Nested in the CORIMUNO-19 Cohort","The overall objective of the study is to determine the therapeutic effect and tolerance of Eculizumab in patients with moderate, severe pneumonia or critical pneumonia associated with Coronavirus disease 2019 (COVID-19). Eculizumab is a terminal complement inhibitor that has been investigated for more than 10 years in numerous complement-mediated diseases. The study has a cohort multiple Randomized Controlled Trials (cmRCT) design. Randomization will occur prior to offering Eculizumab administration to patients enrolled in the CORIMUNO-19 cohort. Eculizumab will be administered to consenting adult patients hospitalized with COVID-19 either diagnosed with moderate or severe pneumonia requiring no mechanical ventilation or critical pneumonia requiring mechanical ventilation. Patients who will chose not to receive Eculizumab will receive standard of care. Outcomes of Eculizumab-treated patients will be compared with outcomes of standard of care-treated patients as well as with outcomes of patients treated with other immune modulators."
11554,3/14/2020,CHICTR2000030744,Clinical Application of ECMO(or Ultra-Protective Lung Mechanical Ventilation) in the Treatment of Patients with ARDS due to novel Coronavirus Pneumonia (COVID-19),Clinical Application of ECMO(or Ultra-Protective Lung Mechanical Ventilation) in the Treatment of Patients with ARDS due to novel Coronavirus Pneumonia (COVID-19),"This study intends to divide patients with ARDS due to novel Coronavirus Pneumonia (COVID-19) into two groups: the ECMO group and the conventional treatment group,observe whether ECMO can improve the outcome of such patients."
4432,12/17/2020,NCT04668911,"Oral Health, Microbial Burden and COVID-19","Oral Health, Microbial Burden and COVID-19","An observational study of patients with COVID-19 confirmed cases (with various degrees of severity) and controls.Oral and nasal swabs will be taken from 150 patients (50 with mild form and 50 with severe form of COVID-19 with or without mechanical ventilation, 50 healthy controls)."


In [113]:
# Now lets try with fasttext vector abstracts
vector_fasttext_100 = pd.read_csv("k_fasttext_abstract.csv")
vector_fasttext_100 = vector_fasttext_100.T
fasttext_100_vectors = []
for i in range(df1.shape[0]):
    fasttext_100_vectors.append(vector_fasttext_100[i].values)

In [122]:
fasttext_100_vectors = []
for i in range(df1.shape[0]):
    fasttext_100_vectors.append(vector_fasttext_100[i].values)

In [133]:
query = "patient"
p = fasttext_100_vectors
model = fast_n_100
Results,sim = top_n(query,p,df1,model)
Results

  return round(dot(a,b)/(norm(a)*norm(b)),3)


[0.918, 0.912, 0.909, 0.901, 0.9]


Unnamed: 0,Date added,Trial ID,Title,Brief title,Abstract
11087,4/4/2020,CHICTR2000031587,"A paired clinical study of novel coronavirus pneumonia (COVID-19) patients with ordinary, severe, critical, and deceased patients in Tongji Hospital","A paired clinical study for novel coronavirus pneumonia (COVID-19) patients with ordinary, severe, critical, and deceased patients in Tongji Hospital","A paired clinical study of COVID - 19 patients with ordinary, severe, critical, and deceased patients in Wuhan"
2220,4/21/2021,2020-005214-18,An Investigation of the Efficacy and Safety of Favipiravir in COVID-19 Patients without Pneumonia An open-label randomized controlled study,An Investigation of the Efficacy and Safety of Favipiravir in COVID-19 Patients without Pneumonia,COVID-19 Patients without Pneumonia
5713,10/20/2020,NCT04591704,The Effect of Diabetes Mellitus on the Morbidity and Mortality Rates in Patients With COVID-19,The Effect of Diabetes Mellitus on the Prognosis of Patients With COVID-19,"All hospitalised patients with COVID-19 who have positive RT-PCR for SARS-COV-2 will be included in the study.The patients will be divided into two groups, as diabetics and non-diabetics. The COVID-19 patients' medical records will be evaluated and compared in terms of the duration of hospitalization, the presence of lung involvement in Computerised Tomography, the need for intensive care unit and mortality rates in patients with and without diabetes.\n\nDetailed Description\nAll hospitalised patients with COVID-19 who have positive RT-PCR for SARS-COV-2 will be included in the study.The patients will be divided into two groups, as diabetics and non-diabetics. The diagnosis of diabetes mellitus will be extracted from the medical records and medical history of the patients hospitalised with COVID-19.The COVID-19 patients medical records will be evaluated and compared in terms of the duration of hospitalization, the presence of lung involvement in Computerised Tomography, the need for intensive care unit and mortality rates in patients with and without diabetes."
11523,3/15/2020,CHICTR2000030803,Clinical characteristics of severe or critically ill patients infected with 2019-nCoV,Collection and analysis of clinical data in severe and critically ill patients with novel coronavirus pneumonia (COVID-19),"we performed an observational study to comprehensively evaluate the clinical characteristics of severe or critically ill patients with COVID-19 based on clinical data, and aimed to provide the scientific evidence for optimal treatment and management of severe or critically ill patients with COVID-19."
11943,2/26/2020,CHICTR2000029757,Convalescent plasma for the treatment of severe and critical novel coronavirus pneumonia (COVID-19): a prospective randomized controlled trial,Convalescent plasma for the treatment of severe and critical novel coronavirus pneumonia (COVID-19): a prospective randomized controlled trial,"In order to actively prevent and control COVID-19, the convalescent plasma of patients with COVID-19 is collected, and the clinical treatment plan of using convalescent plasma to treat patients with COVID-19 is explored."


# Both the models perform well with good similarity results.