## Preparing Loinc data for listwise search approach

In [135]:
#Imports
try: 
    import pandas as pd 
    import nltk, os, re, math, spacy
    import numpy as np
    from spacy.cli import download
    from sklearn.model_selection import train_test_split
except: 
    print("Install the requirements.txt")

# Requirements: Active and running version of spacy on the computer
# !Uncomment the following part only if model isn't loaded. The download can take several minutes!
"""
download('en_core_web_lg')

nlp = spacy.load('en_core_web_lg')
"""


"\ndownload('en_core_web_lg')\n\nnlp = spacy.load('en_core_web_lg')\n"

### Read Data from .xlsx (excel) format

Since the loinc datasets are stored in excel format it is necessary to load them into dataframes for further processing tasks. We have decided to keep the given layout in order to provide future possible compatibility.

In [139]:
#This functions enables the automated reading of xlsx files and the corrosponding data sheets
#Output is a merged dataframe aswell as the queries in each datasheets and the relevancelevels of the data. 
def read_data_sheets(dataset, sheet_names):
    queries = []
    cols = ['loinc_num', 'long_common_name', 'component', 'system', 'property','qid', 'label']
    df = pd.DataFrame(columns = cols)
    for name in sheet_names:
        query = pd.read_excel(f"./{dataset}",nrows = 1, sheet_name=name)
        query = query.columns[0].split(" ")
        current_df = pd.read_excel(f"./{dataset}",skiprows=2, sheet_name=name)
        current_df["query"] = pd.Series([query[1:len(query)] for x in range(len(current_df.index))])
        queries.append(query)
        df = pd.concat([df, current_df],ignore_index=True, axis=0)
    return df, queries

#Function to transform Array of Format Q1 ["This", "is","an", "example"] into a str
def query_to_str(query) -> str: 
    q  = str()
    for i,word in enumerate(query[1:len(query)]):
        if(i < len(query)):
            q = q + word+" "
        else:
            q = q + word
    query = q
    return query

In [140]:
#Specify the datasheets that are relevant 
#Possible:  ["glucose in blood","bilirubin in plasma","white blood cells count"]
ds_names = ["white blood cells count"]

df, queries = read_data_sheets('loinc_dataset_extended.xlsx',ds_names)
print(df.shape)

(200, 15)


### Building Parameters for Listwise Approach

In order to build parameters we took a close look at an already well established ranking benchmark set.
<br/>
Url: https://arxiv.org/ftp/arxiv/papers/1306/1306.2597.pdf (also included in the our resources)

<br/>
While the Letor 4 Ranking dataset has more than 40 parameters we reduced the amounts of parameters significantly. Main goal of this code is to show a proof of concept of the listwise approach.

#### Calculating term frequencies

In [141]:
#Constructing Parameter => TF Score
#The TF Score counts the overall occourcences of a word in the query in regard to the occurences per row

# https://t4tutorials.com/cosine-similarity-in-data-mining/
def termFrequency(term, document) -> float:
  return document.count(term.lower()) / float(len(document))
    

#Idea: We compute the tf for each Loinc Entry in regard to the query. 
#Since we use an addition, there is the possibility of value exceeding 1. Therefore the data will be scaled afterwards.
# This might be not the mathematiclly cleaneast approach but we assume that an overall high overlap in words in the query with
# words in the row description of an loinc entry is an indicator for significance. 
def add_termFrequencies(col,name) -> pd.DataFrame:
    tf_arr = []
    for idx,txt in enumerate(df[col]): 
        qd_tf = 0
        for idx, trm in enumerate(df["query"][idx]): 
            #print(txt)
            qd_tf = qd_tf + termFrequency(trm.lower(),txt.lower())
        tf_arr.append(qd_tf)
    return pd.DataFrame(tf_arr, columns=[name])

#After the calculation of the tf scores we append them to the df as new column. 
df["lcmon_tf"] =  add_termFrequencies("long_common_name","lcmon_tf")
df["cmp_tf"] = add_termFrequencies("component","cmp_tf")

print(df["lcmon_tf"].shape, df["cmp_tf"].shape)

(200,) (200,)


#### Calculating Similarity Scores

The package spacy allows an already builded model to download in order to build similarity scores between text corpuses. 

In [142]:
#With the use of NLP Package of spacy and the downloaded model we calculate values in the range of [0,1]. 
#As Higher the score as more similar the query and the compared text is. 
def compute_similarity_score(dataframe,column,name) -> pd.DataFrame:
    similarity_score=[]
    for idx,article in enumerate(dataframe[column]):
        similarity_score.append(nlp(query_to_str(dataframe["query"][idx])).similarity(nlp(article)))
    return pd.DataFrame(similarity_score, columns=[name])

df["lcmon_ss"] = compute_similarity_score(df, "long_common_name", "lcmon_ss")
df["cmp_ss"] = compute_similarity_score(df, "component", "cmp_ss")


  similarity_score.append(nlp(query_to_str(dataframe["query"][idx])).similarity(nlp(article)))


#### Description for identifying ranking results

In [143]:
#In order to identify the results of our predictions we need an identifier. Since Loinc already uses its own identifiers,
#a special loinc number, we use the same number for identification. 
description = []
for idx in range(len(df)):
    description.append(f' #docid = {df.loinc_num[idx]} {df.long_common_name[idx]}')
print(len(description))

200


In [144]:
#All columns that we use for the listwise ranking algorithm adaRank. 
prep_df = df[["label","qid","lcmon_ss","lcmon_tf","cmp_tf","cmp_ss"]] 
print(prep_df)

    label qid  lcmon_ss  lcmon_tf  cmp_tf    cmp_ss
0       0   3  0.488962       0.0     0.0  0.368049
1       0   3  0.687383       0.0     0.0  0.523740
2       0   3  0.555352       0.0     0.0  0.368049
3       0   3  0.478199       0.0     0.0  0.362320
4       0   3  0.335934       0.0     0.0  0.160475
..    ...  ..       ...       ...     ...       ...
195     0   3  0.547386       0.0     0.0  0.472740
196     0   3  0.405370       0.0     0.0  0.472740
197     0   3  0.512250       0.0     0.0  0.472740
198     0   3  0.439458       0.0     0.0  0.472740
199     0   3  0.547004       0.0     0.0  0.472740

[200 rows x 6 columns]


### Scaling Data

Since we used an unorthodoxed approach by just adding the tf scores together, we need Scale the values. Otherwise the difference between the similartiy scores and the aggregated tf scores could lead to a biased result. Furthermore we scale all numerical values with MinMaxScaler. MinMaxScaler dosent treat outliers very well. But our assesment is, that outliers represent a very high similarity and therefore dont need to be avoided because they likely indicate a very good match between query and searched Loinc entry

In [145]:
from sklearn.preprocessing import MinMaxScaler

cols = list(prep_df[["lcmon_ss","lcmon_tf","cmp_tf","cmp_ss"]])
prep_df.values[:] = MinMaxScaler().fit_transform(prep_df)
prep_df = pd.DataFrame(prep_df)
print(prep_df.columns)
print(len(prep_df))

prep_df.shape

Index(['label', 'qid', 'lcmon_ss', 'lcmon_tf', 'cmp_tf', 'cmp_ss'], dtype='object')
200


(200, 6)

### Train / Test / Validation Split and converting data into sparse matrix

Since we are using the implementation by the github user: https://github.com/rueycheng/AdaRank the format of an sparse matrix is required.

#### Split the dataset

In [146]:
#Train Test Split. After taking the train test split we split the test set another time in order to get a test and 
#a validation set
X = prep_df[["qid","lcmon_ss","lcmon_tf","cmp_tf","cmp_ss"]]
y = prep_df[["label"]].values.ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.25, random_state=42)

#### Dump to svmlight file 

In [147]:
from sklearn.datasets import dump_svmlight_file

#This function adds the identifier als already described to the svmlight outputed textfile in order to identify the ranked
#documents
def add_comments(description, indexes, file_name):
    with open(file_name, 'r') as f:
        file_lines = [''.join([x.strip(), description[indexes[idx]], '\n']) for idx,x in enumerate(f.readlines())]
    with open(file_name, 'w') as f:
        f.writelines(file_lines) 

#Function to save the preprocessed dataframe into svmlight format
def dump_to_svmlf(df,y,name,descriptio): 
    X = df[["lcmon_ss","lcmon_tf","cmp_tf","cmp_ss"]]
    qid = df[["qid"]].values.ravel()
    y = y.values.ravel()
    f = f"./{name}.txt"
    #print(y.shape)
    #print(len(X),len(y),len(qid))
    dump_svmlight_file(X, y,f, zero_based=False, comment=None, query_id=qid, multilabel=False)
    add_comments(description, X.index.values,f)

    
def save_to_folder(folder_path):
    dump_to_svmlf(X_train,pd.DataFrame(y_train),f"data/{folder_path}/train",description)
    dump_to_svmlf(X_test,pd.DataFrame(y_test),f"data/{folder_path}/test",description)
    dump_to_svmlf(X_val,pd.DataFrame(y_val),f"data/{folder_path}/vali",description)


#Change regarding the  used query and used dataset
save_to_folder("extend_ds/q3")
