### Prerequisites

* Topics from 2017: http://trec-cds.appspot.com/topics2017.xml
* Topics from 2018: http://trec-cds.appspot.com/topics2018.xml
* Pre-processed Gold-Standard or Run Files, which can be found on figshare:
    * 2017 GS: @todo
    * 2018 GS: @todo
    * Example of Run File:

In [None]:
from os import listdir
from os.path import isfile, isdir, join
from lxml import etree
import pandas as pd
import tarfile
import json
import gzip
import time
import csv
import re
import sys
import math
import nltk
import string
import numpy as np
import warnings
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
import pyltr

In [None]:
# Suppress every Python warning raised from here on.
warnings.simplefilter('ignore')

# TESTING pyltr with LETOR

In [None]:
# Directory from which the LETOR files train.txt, vali.txt and test.txt are read.
folder="/path/MQ2007/Fold1"

In [None]:
# Locations of the three LETOR splits inside `folder`.
letorTrainFile = join(folder, 'train.txt')
letorValiFile = join(folder, 'vali.txt')
letorTestFile = join(folder, 'test.txt')

# All three files are opened up front; the fourth value returned by
# read_dataset is not needed and is discarded.
with open(letorTrainFile) as trainfile, \
        open(letorValiFile) as valifile, \
        open(letorTestFile) as evalfile:
    TX, Ty, Tqids, _ = pyltr.data.letor.read_dataset(trainfile)
    VX, Vy, Vqids, _ = pyltr.data.letor.read_dataset(valifile)
    EX, Ey, Eqids, _ = pyltr.data.letor.read_dataset(evalfile)

In [None]:
# The same NDCG@10 metric drives training and validation monitoring.
metric = pyltr.metrics.NDCG(k=10)

# LambdaMART hyper-parameters, kept together so they are easy to tweak.
lambdaMartSettings = dict(
    n_estimators=1000,
    learning_rate=0.02,
    max_features=0.5,
    query_subsample=0.5,
    max_leaf_nodes=10,
    min_samples_leaf=64,
    verbose=1,
)
model = pyltr.models.LambdaMART(metric=metric, **lambdaMartSettings)

# Only needed if you want to perform validation (early stopping & trimming)
monitor = pyltr.models.monitors.ValidationMonitor(
    VX, Vy, Vqids, metric=metric, stop_after=250
)

model.fit(TX, Ty, Tqids, monitor=monitor)

In [None]:
# Score the held-out split and compare against a random ordering baseline.
Epred = model.predict(EX)
randomBaseline = metric.calc_mean_random(Eqids, Ey)
print('Random ranking:', randomBaseline)
modelScore = metric.calc_mean(Eqids, Ey, Epred)
print('Our model:', modelScore)

# Reading Files

In [None]:
# Directories that hold the pre-processed training and test files.
trainPath = "../train-path"
testPath = "../test-path"

# Years of the topic sets used for training and for testing.
trainYear = "2017"
testYear = "2018"

In [None]:
# Tab-separated training file and the topics XML it is merged with.
gsTrainFile = join(trainPath ,"train-file.tsv")
trecEvalTrain = "../topics2017.xml"

# Tab-separated test file and the topics XML it is merged with.
gsTestFile = join(testPath,"test-file.tsv")
trecEvalTest = "../topics2018.xml"

In [None]:
# Reading Topics for Training Set (get gene info)
topicsColumns = ['trec_topic_number', 'trec_topic_gene']
topicsXML = etree.parse(trecEvalTrain)
# Collect one [number, gene] row per child of the XML root and build the frame
# once: DataFrame.append copied the whole frame on every call and is no longer
# available in pandas 2.x.
topicRows = [
    [topic.get('number'), topic.find('gene').text]
    for topic in topicsXML.getroot()
]
topics = pd.DataFrame(topicRows, columns=topicsColumns)
# The XML attribute is a string; cast it before it is used as the merge key.
topics['trec_topic_number'] = topics['trec_topic_number'].astype('int')

# Merging
train = pd.read_csv(gsTrainFile, sep = '\t', encoding='utf8')
# Missing cells become empty strings so the text preprocessing sees only text.
train.fillna("", inplace=True)
# Left join: every row of the training file is kept and gains its topic's gene.
trainData = train.merge(topics, left_on=['trec_topic_number'], right_on=['trec_topic_number'], how='left')
trainData.head(1)

In [None]:
# Reading Topics for Test Set (get gene info)
topicsColumns = ['trec_topic_number', 'trec_topic_gene']
topicsXML = etree.parse(trecEvalTest)
# Collect one [number, gene] row per child of the XML root and build the frame
# once: DataFrame.append copied the whole frame on every call and is no longer
# available in pandas 2.x.
topicRows = [
    [topic.get('number'), topic.find('gene').text]
    for topic in topicsXML.getroot()
]
topics = pd.DataFrame(topicRows, columns=topicsColumns)
# The XML attribute is a string; cast it before it is used as the merge key.
topics['trec_topic_number'] = topics['trec_topic_number'].astype('int')

# Merging
# trec_doc_id is read as object so pandas does not convert the ids to numbers.
testVal = pd.read_csv(gsTestFile, sep = '\t', encoding='utf8', dtype={'trec_doc_id':object})
# Missing cells become empty strings so the text preprocessing sees only text.
testVal.fillna("", inplace=True)
# Left join: every row of the test file is kept and gains its topic's gene.
testValData = testVal.merge(topics, left_on=['trec_topic_number'], right_on=['trec_topic_number'], how='left')
testValData.head(1)

# Preprocessing the data

## Functions to tokenize, remove stop words, and get stems

In [None]:
# Get Stopwords
# Fetch the NLTK resources requested below ('stopwords' and 'punkt').
nltk.download('stopwords')
nltk.download('punkt')
# English stop-word list consulted by the tokenize* functions.
stopWords = stopwords.words('english')

def tokenizePorter(text):
    """Return the Porter stems of the non-stop-word tokens of *text*.

    The text is split with ``word_tokenize``; tokens found in ``stopWords``
    are skipped and the remaining stems are joined with single spaces.
    """
    tokens = word_tokenize(text)
    porter = PorterStemmer()
    return ' '.join(
        porter.stem(token) for token in tokens if token not in stopWords
    )

def tokenizeSnowball(text):
    """Return the English Snowball stems of the non-stop-word tokens of *text*.

    The text is split with ``word_tokenize``; tokens found in ``stopWords``
    are skipped and the remaining stems are joined with single spaces.
    """
    tokens = word_tokenize(text)
    snowball = SnowballStemmer("english")
    return ' '.join(
        snowball.stem(token) for token in tokens if token not in stopWords
    )

## Load TrainData

In [None]:
pd.options.display.max_rows
# NOTE(review): newer pandas releases want None instead of a negative value
# here - confirm against the installed version before changing it.
pd.set_option('display.max_colwidth', -1)

# Translation table: newlines become spaces, punctuation characters are removed.
removePunctuation = str.maketrans('\n', ' ', string.punctuation)


def _flattenRowText(row):
    """Render one row as lower-case text with ';', '/' and newlines as spaces and no punctuation."""
    rowText = row.to_string(index=False).lower()
    return re.sub(r';|\/', ' ', rowText).translate(removePunctuation)


# Combined text of all document fields, raw and stemmed.
trainData['title_abstract_mesh'] = trainData[['title', 'abstract', "major_mesh", "minor_mesh"]].apply(_flattenRowText, axis=1)
trainData['title_abstract_mesh_stemmed'] = trainData['title_abstract_mesh'].apply(tokenizeSnowball)

# Stemmed text per field: (new column, source columns), added in this order.
stemmedSources = (
    ('title_stemmed', ['title']),
    ('abstract_stemmed', ['abstract']),
    ('mesh_stemmed', ['major_mesh', 'minor_mesh']),
    ('disease_stemmed', ['trec_topic_disease']),
    ('gene_stemmed', ['trec_topic_gene']),
)
for stemmedColumn, sourceColumns in stemmedSources:
    trainData[stemmedColumn] = trainData[sourceColumns].apply(_flattenRowText, axis=1).apply(tokenizeSnowball)

# The topic number serves as the query id.
# (Alternative kept from the original: trainData["trec_topic_number"].astype(str)+str(trainYear))
trainData["qid"] = trainData["trec_topic_number"]

# Columns needed for feature extraction and for writing the ranking file.
keptColumns = ['relevance_score','qid', 'title_stemmed', 'abstract_stemmed', 'mesh_stemmed', 'title_abstract_mesh_stemmed', 'disease_stemmed', 'gene_stemmed', 'trec_doc_id']
trainDataSliced = trainData[keptColumns]
trainDataSliced.head(1)

## Load Test and Validation Data

In [None]:
# Lists that will hold the test and the validation frames (one entry per set).
testDataSetSliced = []
valDataSetSliced = []

In [None]:
pd.options.display.max_rows
# NOTE(review): newer pandas releases want None instead of a negative value
# here - confirm against the installed version before changing it.
pd.set_option('display.max_colwidth', -1)

# Translation table: newlines become spaces, punctuation characters are removed.
removePunctuation = str.maketrans('\n', ' ', string.punctuation)


def _flattenRowText(row):
    """Render one row as lower-case text with ';', '/' and newlines as spaces and no punctuation."""
    rowText = row.to_string(index=False).lower()
    return re.sub(r';|\/', ' ', rowText).translate(removePunctuation)


# Combined text of all document fields, raw and stemmed.
testValData['title_abstract_mesh'] = testValData[['title', 'abstract', "major_mesh", "minor_mesh"]].apply(_flattenRowText, axis=1)
testValData['title_abstract_mesh_stemmed'] = testValData['title_abstract_mesh'].apply(tokenizeSnowball)

# Stemmed text per field: (new column, source columns), added in this order.
stemmedSources = (
    ('title_stemmed', ['title']),
    ('abstract_stemmed', ['abstract']),
    ('mesh_stemmed', ['major_mesh', 'minor_mesh']),
    ('disease_stemmed', ['trec_topic_disease']),
    ('gene_stemmed', ['trec_topic_gene']),
)
for stemmedColumn, sourceColumns in stemmedSources:
    testValData[stemmedColumn] = testValData[sourceColumns].apply(_flattenRowText, axis=1).apply(tokenizeSnowball)

# The topic number serves as the query id.
testValData["qid"] = testValData["trec_topic_number"]

# Columns needed for feature extraction and for writing the ranking files.
keptColumns = ['relevance_score','qid', 'title_stemmed', 'abstract_stemmed', 'mesh_stemmed', 'title_abstract_mesh_stemmed', 'disease_stemmed', 'gene_stemmed', 'trec_doc_id']
testValDataSliced = testValData[keptColumns]
testValDataSliced.head(1)

## Features for Disease and Gene in Title, Abstract, Mesh and Combined

In [None]:
def countTerms(terms, target):
    """Count how often each vocabulary term occurs in every document.

    Args:
        terms: vocabulary passed to ``CountVectorizer(vocabulary=...)``.
        target: iterable of document strings.

    Returns:
        A list with one dict per document (in input order) mapping each
        vocabulary term to its count in that document.
    """
    vectorizer = CountVectorizer(vocabulary = terms)
    transformed_data = vectorizer.fit_transform(target)

    # get_feature_names() is gone from recent scikit-learn releases; use its
    # replacement when the installed version offers it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        featureNames = vectorizer.get_feature_names_out()
    else:
        featureNames = vectorizer.get_feature_names()

    score = pd.DataFrame(transformed_data.toarray(), columns=featureNames)
    scoreDict = score.to_dict('records')
    return scoreDict
    
def specificTermsCount(allTermsCount, index, terms):
    """Sum the counts stored for document *index* over every word of *terms*.

    *allTermsCount* is indexed first by document position and then by term;
    *terms* is a whitespace-separated string. A word that is repeated in
    *terms* is counted once per repetition.
    """
    return sum(allTermsCount[index][word] for word in terms.split())

def percentageOfTermsMatched(allTermsCount, index, terms):
    """Return the fraction of the words in *terms* that occur in document *index*.

    Args:
        allTermsCount: per-document term counts, indexed by document position
            and then by term.
        index: position of the document in *allTermsCount*.
        terms: whitespace-separated query words.

    Returns:
        A float between 0 and 1; 0.0 when *terms* contains no words (the
        unguarded division used to raise ZeroDivisionError in that case).
    """
    termList = terms.split()
    if not termList:
        # Nothing to match; mirrors the zero guard in termsFrequency.
        return 0.0
    matched = sum(1 for term in termList if allTermsCount[index][term] > 0)
    return(matched/len(termList))

def termsFrequency(corpus, termCount):
    """Return *termCount* divided by the number of whitespace-separated tokens in *corpus*.

    Yields 0 when *corpus* contains no tokens.
    """
    tokenTotal = len(corpus.split())
    return float(termCount/tokenTotal) if tokenTotal != 0 else 0

def tfidfWeights(terms, target):
    """Compute the TF-IDF weight of each vocabulary term in every document.

    Args:
        terms: vocabulary passed to ``TfidfVectorizer(vocabulary=...)``.
        target: iterable of document strings.

    Returns:
        A list with one dict per document (in input order) mapping each
        vocabulary term to its TF-IDF weight in that document.
    """
    tvec = TfidfVectorizer(vocabulary = terms)
    weights = tvec.fit_transform(target)

    # get_feature_names() is gone from recent scikit-learn releases; use its
    # replacement when the installed version offers it.
    if hasattr(tvec, 'get_feature_names_out'):
        featureNames = tvec.get_feature_names_out()
    else:
        featureNames = tvec.get_feature_names()

    score = pd.DataFrame(weights.toarray(), columns=featureNames)
    scoreDict = score.to_dict('records')
    return scoreDict

def specificTermsTfIdf(tfidfWeights, index, terms):
    """Add up the weights stored for document *index* over every word of *terms*.

    *tfidfWeights* is indexed first by document position and then by term;
    *terms* is a whitespace-separated string.
    """
    total = 0
    # Plain left-to-right accumulation, one addition per word.
    for word in terms.split():
        total = total + tfidfWeights[index][word]
    return total

## Extract Features

In [None]:
# Feature name -> feature id (1..40) written to the ranking files.
# Ids run field by field (title, abstract, mesh, combined); within a field the
# ten features follow the order of the templates below.
allSelectedFeatures = {
    featureName: featureId
    for featureId, featureName in enumerate(
        (
            template.format(field)
            for field in ('title', 'abstract', 'mesh', 'combined')
            for template in (
                'disease_{}_count',
                'disease_{}_tf',
                'disease_{}_percent',
                'disease_{}_tfidf',
                'gene_{}_count',
                'gene_{}_tf',
                'gene_{}_percent',
                'gene_{}_tfidf',
                'disease_gene_{}_tf',
                'disease_gene_{}_tfidf',
            )
        ),
        start=1,
    )
}

In [None]:
def _addEntityFieldFeatures(frame, entity, field, textColumn, vocabulary):
    """Add the count, tf, percent and tfidf columns of one entity for one text field.

    Rows are addressed by position, so the result does not depend on the
    frame's index labels.
    """
    prefix = entity + '_' + field
    queries = frame[entity + '_stemmed'].tolist()
    texts = frame[textColumn].tolist()

    termCounts = countTerms(vocabulary, frame[textColumn])
    matchCounts = [specificTermsCount(termCounts, position, query)
                   for position, query in enumerate(queries)]
    frame[prefix + '_count'] = matchCounts
    frame[prefix + '_tf'] = [termsFrequency(text, matchCount)
                             for text, matchCount in zip(texts, matchCounts)]
    frame[prefix + '_percent'] = [percentageOfTermsMatched(termCounts, position, query)
                                  for position, query in enumerate(queries)]

    termWeights = tfidfWeights(vocabulary, frame[textColumn])
    frame[prefix + '_tfidf'] = [specificTermsTfIdf(termWeights, position, query)
                                for position, query in enumerate(queries)]


def extractFeatures(diseaseTerms, geneTerms, trainDataSliced):
    """Add the 40 disease/gene matching features to *trainDataSliced*.

    For each text field (title, abstract, mesh, combined) and each entity
    (disease, gene) four columns are added: ``<entity>_<field>_count``,
    ``_tf``, ``_percent`` and ``_tfidf``. Two further columns per field,
    ``disease_gene_<field>_tf`` and ``disease_gene_<field>_tfidf``, hold the
    sum of the disease and gene values.

    Args:
        diseaseTerms: vocabulary of stemmed disease words.
        geneTerms: vocabulary of stemmed gene words.
        trainDataSliced: DataFrame providing the columns ``title_stemmed``,
            ``abstract_stemmed``, ``mesh_stemmed``,
            ``title_abstract_mesh_stemmed``, ``disease_stemmed`` and
            ``gene_stemmed``.

    Returns:
        The same DataFrame object, modified in place.
    """
    # (field label used in the column names, column holding that field's text)
    fieldColumns = (
        ('title', 'title_stemmed'),
        ('abstract', 'abstract_stemmed'),
        ('mesh', 'mesh_stemmed'),
        ('combined', 'title_abstract_mesh_stemmed'),
    )

    for field, textColumn in fieldColumns:
        _addEntityFieldFeatures(trainDataSliced, 'disease', field, textColumn, diseaseTerms)
        _addEntityFieldFeatures(trainDataSliced, 'gene', field, textColumn, geneTerms)

        # Disease + Gene
        for measure in ('tf', 'tfidf'):
            diseaseColumn = 'disease_' + field + '_' + measure
            geneColumn = 'gene_' + field + '_' + measure
            trainDataSliced['disease_gene_' + field + '_' + measure] = (
                trainDataSliced[diseaseColumn] + trainDataSliced[geneColumn]
            )

    return trainDataSliced

## Features for Train Dataset

In [None]:
# Vocabulary of distinct stemmed disease words, in order of first appearance.
diseases = trainDataSliced['disease_stemmed'].unique()
diseaseTerms = []
seenDiseaseWords = set()
for disease in diseases:
    for word in disease.split():
        if word in seenDiseaseWords:
            continue
        seenDiseaseWords.add(word)
        diseaseTerms.append(word)
print(diseaseTerms)

In [None]:
# Vocabulary of distinct stemmed gene words, in order of first appearance.
allGenes = trainDataSliced['gene_stemmed'].unique()
geneTerms = []
seenGeneWords = set()
for genes in allGenes:
    for gene in genes.split():
        if gene in seenGeneWords:
            continue
        seenGeneWords.add(gene)
        geneTerms.append(gene)
print(geneTerms)

#### TODO: expansions

In [None]:
# Read the JSON document with the disease expansions into memory.
with open('../lexigram-output.json') as expansionFile:
    expansionText = expansionFile.read()
exPandedDisease = json.loads(expansionText)

In [None]:
# Compute the feature columns for the training rows; trainData is re-bound to
# the frame returned by extractFeatures.
trainData = extractFeatures(diseaseTerms, geneTerms, trainDataSliced)
# Do not limit the number of columns shown when a frame is displayed.
pd.options.display.max_columns = None
display(trainData.tail(1))

## Train File

In [None]:
# Keep the document ids aside, then drop every text column so that only
# relevance_score, qid and the numeric features remain.
trainDocId = trainData['trec_doc_id']
croppedTrain = trainData.drop(['title_abstract_mesh_stemmed', 'title_stemmed', 'abstract_stemmed', 
                              'mesh_stemmed', 'disease_stemmed', 'gene_stemmed', 'trec_doc_id'], axis=1)

# Sort by query id, then re-attach the document id as the LAST column
# (the assignment aligns on the index, so the sort does not mix up ids).
finalTrain = croppedTrain.sort_values('qid')
finalTrain['trec_doc_id'] = trainDocId

rankTrain = finalTrain.to_dict('records')

# One line per row: "<relevance> qid:<qid> <id>:<value> ... # <doc id>".
# The with-block closes the file even if a write or a feature lookup fails.
with open("train.txt", "w") as f:
    for item in rankTrain:
        for i,val in item.items():
            if(i == "relevance_score"):
                f.write(str(val)+" ")
            elif(i == "trec_doc_id"):
                f.write('# '+str(val))
            elif(i == "qid"):
                f.write(str(i)+":"+str(val)+" ")
            else:
                # Every other column is a feature: write its numeric id.
                j = allSelectedFeatures[i]
                f.write(str(j)+":"+str(val)+" ")
        f.write("\n")

# Features for Test and Validation Datasets

In [None]:
# Vocabulary of distinct stemmed disease words, in order of first appearance.
diseases = testValDataSliced['disease_stemmed'].unique()
diseaseTermsTest = []
seenDiseaseWords = set()
for disease in diseases:
    for word in disease.split():
        if word in seenDiseaseWords:
            continue
        seenDiseaseWords.add(word)
        diseaseTermsTest.append(word)
print(diseaseTermsTest)

In [None]:
# Vocabulary of distinct stemmed gene words, in order of first appearance.
allGenes = testValDataSliced['gene_stemmed'].unique()
geneTermsTest = []
seenGeneWords = set()
for genes in allGenes:
    for gene in genes.split():
        if gene in seenGeneWords:
            continue
        seenGeneWords.add(gene)
        geneTermsTest.append(gene)
print(geneTermsTest)

### Only Testing and *no* Validation Dataset

In [None]:
# Show the (rows, columns) size of the complete test/validation frame.
testValDataSliced.shape

In [None]:
# Use the complete frame as a test set; nothing is added to the validation list.
testDataSetSliced.append(testValDataSliced)

### One Testing and One Validation Dataset

In [None]:
# split into validation and testing
# Split the rows in half into a test part and a validation part; the qid column
# is split alongside into yT / yV.
# NOTE(review): no random_state is passed, so the split is presumably different
# on every run - confirm whether reproducibility is needed.
testDataSliced, valDataSliced, yT, yV = train_test_split(testValDataSliced, testValDataSliced['qid'], test_size=0.5)

In [None]:
# Register the two halves as one test set and one validation set.
testDataSetSliced.append(testDataSliced)
valDataSetSliced.append(valDataSliced)

In [None]:
# Size of the first test set.
testDataSetSliced[0].shape

In [None]:
# Size of the first validation set.
valDataSetSliced[0].shape

### K-Fold Test and Validation Datasets

In [None]:
from sklearn.model_selection import KFold

# Start from empty lists: this cell replaces any sets registered earlier.
testDataSetSliced = []
valDataSetSliced = []

# Five shuffled folds with a fixed seed. In each fold the larger part of the
# rows becomes a test set and the held-out part becomes its validation set.
kf = KFold(n_splits=5, random_state=123, shuffle=True)
for largerPartRows, heldOutRows in kf.split(testValDataSliced):
    testFold = testValDataSliced.iloc[largerPartRows]
    valFold = testValDataSliced.iloc[heldOutRows]
    testDataSetSliced.append(testFold)
    valDataSetSliced.append(valFold)

In [None]:
# Size of the first test fold.
testDataSetSliced[0].shape

In [None]:
# Size of the first validation fold.
valDataSetSliced[0].shape

## Test Files

In [None]:
# `sets` numbers the output files from 1 and ends at (number of sets + 1).
sets = 1

for testDataSliced in testDataSetSliced:
    # Renumber the rows 0..n-1 before the features are extracted.
    testDataSliced.reset_index(drop=True, inplace=True)
    testData = extractFeatures(diseaseTermsTest, geneTermsTest, testDataSliced)
    
    # Keep the document ids aside, then drop every text column so that only
    # relevance_score, qid and the numeric features remain.
    testDocId = testData['trec_doc_id']
    croppedTest = testData.drop(['title_abstract_mesh_stemmed', 'title_stemmed', 'abstract_stemmed', 
                              'mesh_stemmed', 'disease_stemmed', 'gene_stemmed', 'trec_doc_id'], axis=1)

    # Sort by query id, then re-attach the document id as the LAST column
    # (the assignment aligns on the index).
    finalTest = croppedTest.sort_values('qid')
    finalTest['trec_doc_id'] = testDocId

    rankTest = finalTest.to_dict('records')

    # One line per row: "<relevance> qid:<qid> <id>:<value> ... # <doc id>".
    # The with-block closes the file even if a write or a feature lookup fails.
    with open("test"+str(sets)+".txt", "w") as f:
        for item in rankTest:
            for i,val in item.items():
                if(i == "relevance_score"):
                    f.write(str(val)+" ")
                elif(i == "trec_doc_id"):
                    f.write('# '+str(val))
                elif(i == "qid"):
                    f.write(str(i)+":"+str(val)+" ")
                else:
                    # Every other column is a feature: write its numeric id.
                    j = allSelectedFeatures[i]
                    f.write(str(j)+":"+str(val)+" ")
            f.write("\n")
    sets = sets + 1

## Validation Files

In [None]:
# `sets` numbers the output files from 1 and ends at (number of sets + 1).
sets = 1

for valDataSliced in valDataSetSliced:
    # Renumber the rows 0..n-1 before the features are extracted.
    valDataSliced.reset_index(drop=True, inplace=True)
    valData = extractFeatures(diseaseTermsTest, geneTermsTest, valDataSliced)
    
    # Keep the document ids aside, then drop every text column so that only
    # relevance_score, qid and the numeric features remain.
    valDocId = valData['trec_doc_id']
    croppedVal = valData.drop(['title_abstract_mesh_stemmed', 'title_stemmed', 'abstract_stemmed', 
                              'mesh_stemmed', 'disease_stemmed', 'gene_stemmed', 'trec_doc_id'], axis=1)

    # Sort by query id, then re-attach the document id as the LAST column
    # (the assignment aligns on the index).
    finalVal = croppedVal.sort_values('qid')
    finalVal['trec_doc_id'] = valDocId

    rankVal = finalVal.to_dict('records')

    # One line per row: "<relevance> qid:<qid> <id>:<value> ... # <doc id>".
    # The with-block closes the file even if a write or a feature lookup fails.
    with open("val"+str(sets)+".txt", "w") as f:
        for item in rankVal:
            for i,val in item.items():
                if(i == "relevance_score"):
                    f.write(str(val)+" ")
                elif(i == "trec_doc_id"):
                    f.write('# '+str(val))
                elif(i == "qid"):
                    f.write(str(i)+":"+str(val)+" ")
                else:
                    # Every other column is a feature: write its numeric id.
                    j = allSelectedFeatures[i]
                    f.write(str(j)+":"+str(val)+" ")
            f.write("\n")
    sets = sets + 1

# L2R

In [None]:
# IPython magic (not plain Python): lists the current working directory.
%ls

# Running the Model

In [None]:
# Per-fold scores and the union of features that received non-zero importance.
count = 1
randonRanking = []
ourRanking = []
allFeatures = []

# Folds are numbered 1 .. sets-1, matching the val<N>.txt / test<N>.txt files.
while count < sets:
    # A fresh metric and model for every fold.
    metric = pyltr.metrics.NDCG(k=10)
    model = pyltr.models.LambdaMART(
        metric=metric,
        n_estimators=1000,
        learning_rate=0.02,
        max_features=0.5,
        query_subsample=0.5,
        max_leaf_nodes=10,
        min_samples_leaf=64,
        verbose=1,
    )

    print("Fold: "+str(count))

    # The training file is shared; validation and test files belong to the fold.
    valFileName = 'val'+str(count)+'.txt'
    testFileName = 'test'+str(count)+'.txt'
    with open('train.txt') as trainfile, open(valFileName) as valifile, open(testFileName) as evalfile:
        TrainX, Trainy, TrainQids, _ = pyltr.data.letor.read_dataset(trainfile)
        ValX, Valy, ValQids, _ = pyltr.data.letor.read_dataset(valifile)
        EvalX, Evaly, EvalQids, _ = pyltr.data.letor.read_dataset(evalfile)

    # Fit with validation monitoring, then score the fold's test file.
    monitor = pyltr.models.monitors.ValidationMonitor(ValX, Valy, ValQids, metric=metric, stop_after=250)
    model.fit(TrainX, Trainy, TrainQids, monitor=monitor)
    Epred = model.predict(EvalX)
    randonRanking.append(metric.calc_mean_random(EvalQids, Evaly))
    ourRanking.append(metric.calc_mean(EvalQids, Evaly, Epred))

    # features: positions whose importance is non-zero, visited in ascending
    # order of importance and recorded once across all folds.
    importances = model.feature_importances_
    nonZeros = np.nonzero(importances)[0].tolist()
    listFeatures = np.argsort(importances)
    for feature in listFeatures:
        if feature not in nonZeros:
            continue
        if feature in allFeatures:
            continue
        allFeatures.append(feature)

    count += 1

In [None]:
# Predict once more with the model and data left over from the last fold and
# show that fold's random-ranking score.
Epred = model.predict(EvalX)
metric.calc_mean_random(EvalQids, Evaly)

In [None]:
# Mean of the per-fold random-ranking scores.
numpR = np.asarray(randonRanking)
np.mean(numpR)

In [None]:
# Per-fold scores of the trained models.
print(ourRanking)

In [None]:
# Mean of the per-fold model scores (numpR is re-used for this second average).
numpR = np.asarray(ourRanking)
np.mean(numpR)

In [None]:
# Number of distinct features collected in allFeatures.
len(allFeatures)

In [None]:
# Reverse lookup: feature id -> feature name.
featureNameById = {number: name for name, number in allSelectedFeatures.items()}

# Collected entries are 0-based positions while the ids start at 1, hence the +1.
for feature in allFeatures:
    f = feature + 1
    if f in featureNameById:
        print(featureNameById[f])