In [1]:
from pathlib import Path

import ir_datasets

In [2]:
# Load the "clinicaltrials/2017/trec-pm-2017" dataset through ir_datasets;
# the export cell below reads its queries, documents and qrels from this object.
dataset = ir_datasets.load("clinicaltrials/2017/trec-pm-2017")

In [3]:
import pandas as pd

In [6]:
# Export the raw collection (queries, documents, relevance judgments) to CSV so
# the preprocessing cells can work from files instead of the ir_datasets object.
output_dir = Path("clinical_dataset")
# to_csv does not create missing directories; create the target up front so a
# fresh "Restart & Run All" does not fail on the first write.
output_dir.mkdir(parents=True, exist_ok=True)

queries = pd.DataFrame(dataset.queries_iter())
# Use the same *_iter() accessor as for queries and qrels.
docs = pd.DataFrame(dataset.docs_iter())
print(docs)
qrels = pd.DataFrame(dataset.qrels_iter())

queries.to_csv(output_dir / "original_queries.csv", index=False)
docs.to_csv(output_dir / "original_docs.csv", index=False)
qrels.to_csv(output_dir / "qrels.csv", index=False)

             doc_id                                              title  \
0       NCT00530868  Comparing Letrozole Given Alone to Letrozole G...   
1       NCT00530127  A Study Investigating the Safety and Tolerabil...   
2       NCT00530517  A Study on the Usability of the Needle-Free In...   
3       NCT00530972  Pilot Study of Patients Chronic Hepatitis C in...   
4       NCT00530322  Adhesion Formation Following Laparoscopic and ...   
...             ...                                                ...   
241001  NCT00074802  Adding Cognitive Behavioral Therapy to Drug Tr...   
241002  NCT00074139  Docetaxel, Doxorubicin, and Cyclophosphamide i...   
241003  NCT00074178  Methotrexate, Cyclophosphamide, and Etoposide ...   
241004  NCT00074035  Pentostatin in Treating Patients With Refracto...   
241005  NCT00074646  Phase I Trial of CC-8490 for the Treatment of ...   

       condition                                            summary  \
0                 \n    \n      This pur

In [4]:
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import string
import re
from nltk.tokenize import word_tokenize

In [5]:
def dataProcessing(dfText):
    """Clean and lemmatize every entry of a pandas Series of text.

    Each entry goes through the following steps, in order:

    1. lower-casing;
    2. replacing every ``string.punctuation`` character with a space;
    3. dropping NLTK's English stop words;
    4. WordNet lemmatization of each remaining token, as a verb when
       ``pos_tag`` (run on the token alone) returns a tag starting with "V",
       otherwise as a noun;
    5. removing every character that is not an ASCII letter, a digit or
       whitespace.

    Parameters
    ----------
    dfText : pandas.Series
        Column of raw text. Missing values (NaN/None) are allowed.

    Returns
    -------
    list of str
        One cleaned string per entry, in the same order as ``dfText``.
        Missing entries yield an empty string (they used to be turned into
        the literal token "nan").

    Notes
    -----
    Relies on the NLTK stop-word list, WordNet and the POS tagger used by
    ``pos_tag`` being available to NLTK.
    """
    # To lower case. Missing values become "" instead of going through
    # str(NaN) == "nan"; this also works for columns that are entirely NaN,
    # where the .str accessor would raise.
    missing = dfText.isna()
    texts = ['' if is_missing else str(value).lower()
             for value, is_missing in zip(dfText, missing)]

    # Replace punctuation with spaces so that "a-b" splits into two tokens
    trans_table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    texts = [text.translate(trans_table) for text in texts]

    # Remove stop words (this also tokenizes on whitespace)
    stop_words = set(stopwords.words('english'))
    token_lists = [[word for word in text.split() if word not in stop_words]
                   for text in texts]

    # Lemmatize text. Every token is tagged on its own, so its lemma depends
    # only on the token itself and can be cached across the whole column.
    lemmatizer = WordNetLemmatizer()
    lemma_cache = {}

    def lemmatize(word):
        lemma = lemma_cache.get(word)
        if lemma is None:
            tag = pos_tag([word])[0][1]
            pos = wordnet.VERB if tag[:1].lower() == 'v' else wordnet.NOUN
            lemma = lemma_cache[word] = lemmatizer.lemmatize(word, pos=pos)
        return lemma

    lemmatized = [' '.join(lemmatize(word) for word in tokens)
                  for tokens in token_lists]

    # Remove non-alphanumeric characters (raw string: "\s" is a regex class,
    # not a Python escape); compile once instead of once per entry.
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9\s]')
    return [non_alphanumeric.sub('', text) for text in lemmatized]
    

In [6]:
# Reload the exported documents and append a 'text' column holding the
# cleaned/lemmatized version of each title, then persist the result with the
# row index written out under the name 'id'.
docs = pd.read_csv("clinical_dataset/original_docs.csv").assign(
    text=lambda frame: dataProcessing(frame['title'])
)
docs.to_csv('clinical_dataset/processed_docs.csv', index=True, index_label='id')

In [8]:
# Reload the exported queries and replace each free-text field with its
# cleaned/lemmatized version, then persist the result with the row index
# written out under the name 'id'.
queries = pd.read_csv("clinical_dataset/original_queries.csv")
queries = queries.assign(**{
    column: dataProcessing(queries[column])
    for column in ("disease", "gene", "demographic", "other")
})

queries.to_csv('clinical_dataset/clean_queries.csv', index=True, index_label='id')