In [8]:
import numpy as np
import random
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers,initializers,constraints
import time
import math
import subprocess
import string
from nltk.corpus import words

In [9]:
stopwords_file = open('Features/stopwords.txt','r')
stopwords = stopwords_file.read().split('\n')

max_binary_len = 20

In [10]:
def ret_query_doc(index,column_names):
    quer_doc_loc = 'Features/feature/{}.txt'.format(index)
    query_doc=pd.read_csv(quer_doc_loc, sep=" ", skiprows=1,names=column_names) # only place where num_of_docs is required
    query_doc[column_names[1:]]=query_doc[column_names[1:]].fillna(query_doc.mode().iloc[0])
    query_doc=query_doc.replace('-∞',0)
    query_doc[column_names[1:]]=query_doc[column_names[1:]].astype(float)
    query_doc = query_doc[~query_doc['word'].isin(stopwords)]
    query_doc = query_doc[query_doc['word'].isin(words.words())]
    return query_doc

In [11]:
def fixed_len_dectobin(tensor):
    global max_binary_len
    final_tensor = []
    for num in tensor:
        c=0
        x=[]
        while num>0:
            x=[num%2]+x
            num=tf.constant(int(num/2))
            c+=1
        x=np.array(x)-0.5
        x = np.pad(x,(max_binary_len-c,0),'constant')
        final_tensor = final_tensor+[tf.convert_to_tensor(x,dtype=tf.float32)]
    return tf.convert_to_tensor(final_tensor,dtype=tf.float32)



In [12]:
# INPUT: decimal_to_binary(f) -- shape=(num of bits,1)  
class Phi_Block(layers.Layer):
    def __init__(self):
        super(Phi_Block,self).__init__()
    
    def build(self,inputs_shape):
        self.w_phi=self.add_weight(name='w_phi',
                                   shape=(inputs_shape[-1],),
                                   initializer=initializers.RandomUniform(minval=0.1, maxval=1.), # random uniform initialised between 0 and 2
                                   #constraint=constraints.non_neg,
                                   trainable=True,
                                   )
        self.b_phi=self.add_weight(name='b_phi',
                                   shape=(1,),
                                   initializer=initializers.RandomUniform(minval=0.1, maxval=1.),
                                   trainable=True,
                                   )
        
        
    def call(self,input_tensor): # shape of input_tensor : ( number_of_words , max_binary_len )
        w_temp=tf.math.cumsum(self.w_phi,axis=0,reverse=True)
        return keras.activations.tanh(tf.math.reduce_sum(tf.multiply(input_tensor,w_temp),axis=-1)+self.b_phi)

In [13]:
class Policy_Net_Block(layers.Layer):
    def __init__(self,num_of_features,agg_dim):
        super(Policy_Net_Block,self).__init__()
        self.num_of_features=num_of_features
        self.agg_dim=agg_dim
        self.phi_block=[]
        for i in range(num_of_features):
            self.phi_block=self.phi_block+[Phi_Block()]

    def call(self,input_tensor): # input tensor is the word embedding [f1,f2,....,fl] --> shape: (number_of_words,l=num_of_features)
        temp=[]
        for i in range(self.num_of_features):
#            temp=temp+[self.phi_block[i](
#                tf.cast(tf.reverse(tf.math.floormod(tf.bitwise.right_shift(
#                    tf.expand_dims(tf.cast(input_tensor[:,i],dtype=tf.int32),1), tf.range(8)), 2),[-1]),
#                        dtype=tf.float32)-0.5
#            )]
            
            temp = temp + [self.phi_block[i](fixed_len_dectobin(tf.cast(input_tensor[:,i],dtype=tf.int32)))]
        temp_tensor=tf.transpose(tf.convert_to_tensor(temp,dtype=tf.float32))
        agg_out=layers.Dense(self.agg_dim)(temp_tensor)
        return agg_out # output of shape : (number_of_words,num_of_units)

In [14]:
class Policy_Network_Model(keras.Model):
    def __init__(self,agg_dim=8): # default agg_dim is set here
        super(Policy_Network_Model,self).__init__()
        self.agg_dim=agg_dim
        
    def build(self,input_shape): # input_shape = (number_of_words,num_of_docs,num_of_features)
        self.num_feat=input_shape[-1]
        self.num_doc=input_shape[-2]
        self.conc=[]
        for i in range(self.num_doc):
            self.conc=self.conc+[Policy_Net_Block(self.num_feat,self.agg_dim)]
        self.comp=layers.Dense(1)
        
    
    def call(self,input_tensor,training=False):
        comp_inp=[] # contains list of d dimensional vectors of size |F|
        temp=[]
        for i in range(self.num_doc):
            comp_inp=comp_inp+[self.conc[i](input_tensor[:,i,:])]
        temp=tf.concat(comp_inp,axis = -1)
        out=self.comp(temp)
        return out # output shape : (number_of_words,1)

In [15]:
# List of Hyper parameters
# 1) learning rate of Adam optimiser - {1e-3, 5e-4, 1e-4}
# 2) number of feedback terms in the reformulised query - {5, 10, 15, 20, 25}
# 3) feedback coefficient - {0.0, 0.2, ...., 1.0}

# 5-fold cross validation is performed
# top 1000 documents are retrieved

In [16]:
# develop the above functions as class functions
class RML_model:
    
    def __init__(self,K):
        self.model=Policy_Network_Model()
        self.prev_eval=0.0
        self.K=K # number of samples extracted
    
    def data_preprocessing(self,index):
        #extract the docs for the given query index 'ind'
        F = []
        q_df=self.query_doc_dict[index] # q_df stores the dataframe for docs retrieved by each query
        num_of_docs=int((len(q_df.columns)-3)/2)
        for i in range(q_df.shape[0]):
            # F should be 2D matrix with shape (num_of_docs,num_of_features)
            F_w=[]
            s=q_df.iloc[i]
            for j in range(1,num_of_docs+1):
                # The number and type of features is hard coded in this section
                F_w.append([s['tf{}'.format(j)],s['idf'],s['dl{}'.format(j)],s['df']])
            # F_w has been obtaied for a given word
            F = F + [F_w]
        F=tf.convert_to_tensor(F)
        return F
    
    def prob_dist_query(self,F):  # ---> R(Q,F;C,Omega)   F has shape : (num_of_words,num_of_docs,num_of_features)
        R=self.model(F,training=True)
        R = tf.keras.activations.softmax(R,axis=0)
        return R # returns array of softmax p(w/Q) -- shape(num_of_words,1)
    
    def sampling_function(self,R): # shape of R : (number_of_words,1)
        
        samples=tf.convert_to_tensor(np.random.choice(np.arange(len(self.vocabulary)),
                                                      size=(self.K,),replace=False,p=R[:,0].numpy()))
        #samples = tf.random.categorical(tf.math.log(tf.transpose(R).numpy()),self.K)[0]
        R_dash = tf.gather(params = R[:,0], indices = samples)
        R_dash=R_dash/tf.math.reduce_sum(R_dash)
        return samples,R_dash # returns (tensor,tensor)
    
    def sampling_test_function(self,R):
        samples = tf.math.top_k(R[:,0],k=self.K)
        R_dash = samples.values
        R_dash = R_dash/tf.math.reduce_sum(R_dash)
        return samples.indices,R_dash
        
    def prob_MLE_fn(self,q,samples):
        R_MLE=tf.convert_to_tensor([q.count(self.vocabulary[i]) for i in samples.numpy()],dtype=tf.float32)
        return R_MLE/len(q.split()) # returns tensor

    def interpolation_function(self,R_MLE,R_dash,alpha=0.5): # alpha is Hyperparamter (importance of word in the original query)
        R_cap=alpha*R_MLE+(1-alpha)*R_dash
        return R_cap # returns tensor   
    
    def retrieval(self, index, query, rf_query):
        filename = "query/query_ti"
        xml_file = open(filename, "w", encoding='utf8')
        n = xml_file.write(rf_query)
        xml_file.close()
    
        #running indri query
        indrirunquery_command =  '/home/ir-group/indri-5.12/runquery/IndriRunQuery'
        result = subprocess.Popen([indrirunquery_command, filename],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT)
        stdout, stderr = result.communicate()
        
        res_file = "query/query_rq"
        file1 = open(res_file, "wb")
        n1 = file1.write(stdout)
        file1.close()
        
        #performing TREC_eval
        indritrec_command =  '/home/ir-group/indri-5.12/trec_eval.9.0/trec_eval'
        trec_qrel = '/home/ir-group/indri-5.12/trec_eval.9.0/678_qrel'
        result1 = subprocess.Popen([indritrec_command, trec_qrel, res_file],
                                  stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        stdout, stderr = result1.communicate()
        
        #saving MAP results
        map_file = "query/query_MAP"
        file2 = open(map_file, "wb")
        n2 = file2.write(stdout)
        file2.close()

        #returning the MAP result for a query
        file = open('query/query_MAP')
        content = file.readlines()
        #print(content)
        split_l = content[5].split("\t")
        print("MAP value: ", split_l[2])
        MAP = split_l[2]
        return MAP

    def query_reformulation(self, word_sample_dict, index, query):
        index_filepath = "/home/ir-group/indri-5.12/trec678.index/"
        preamble_string = "<parameters>\n<index>"+index_filepath+"</index>\n<runID>RML</runID>\n<trecFormat>true</trecFormat>\n<rule>method:dirichlet;mu:1500</rule>\n<count>10</count>\n<query>\n<number>" + str(index) + "</number>\n<text>"
        end_string = "</text>\n</query>\n</parameters>"
        expansion = ""
        ifac = 0.75
        table = str.maketrans(dict.fromkeys(string.punctuation))
        for key, value in word_sample_dict.items():
#            print(key, "and", value)
            expansion = expansion + str(value) + " " + key + " "
            # Removing punctuations as Indri doesnt support these characters
            term = expansion.translate(table) #removing punctuations
            
        reformulation = "#weight( " + str(ifac) + " #combine(" + query + ") " + str(1-ifac) + " #combine( " + expansion + "))"
        reformulated_query = preamble_string + " " + reformulation + " " + end_string
        #print(reformulated_query)
        return reformulated_query
    
    def eval_fn(self, word_sample_dict, index, query): # takes input the dictionary of size self.K which maps the word with its weight
        
        rf_query = self.query_reformulation(word_sample_dict, index, query)
        MAP = self.retrieval(index, query, rf_query)
        #print("MAP form eval", MAP, " ", type(MAP))
        return tf.cast(float(MAP),dtype=tf.float32)
            
    def reward_fn(self,word_sample_dict, index, query):  # doubt regarding the eval(Q,t) - eval(Q,t-1), the use of prev_eval ??
        t=self.eval_fn(word_sample_dict, index, query)
        reward=t-self.prev_eval
        self.prev_eval=t
        return reward
    
    def cross_entropy_loss(self,R_dash,R,word_sample):
        #R_temp=tf.convert_to_tensor([np.log(R[:,0].numpy()[i]) for i in word_sample.numpy()],dtype=tf.float32)
        R_temp = tf.gather(params=tf.math.log(R[:,0]),indices = word_sample.numpy())
        return tf.math.reduce_sum([i[0]*i[1] for i in zip(R_dash,R_temp)])
    
    def compute_loss_fn(self,X_batch): # X_batch is the dataframe containing 'index' and 'query' column for a given batch_size
        L=tf.constant(0.,dtype=tf.float32)
        for i in range(X_batch.shape[0]):
            index=int(X_batch.iloc[i]['index'])
            query = str(X_batch.iloc[i]['query'])
            print('----------------- Running query {} and {} -----------------'.format(index, query))
            self.vocabulary=self.query_doc_dict[index]['word'].values # stores words used in the top 'num_of_docs' retrieved by a single query
            F = self.data_preprocessing(index)
            R=self.prob_dist_query(F)
            word_sample,R_dash=self.sampling_function(R)
            R_MLE=self.prob_MLE_fn(query,word_sample)
            R_cap=self.interpolation_function(R_MLE,R_dash,alpha=0.6) 
            word_sample_dict = dict(zip([self.vocabulary[i] for i in word_sample],R_cap.numpy()))
            print("-> {}".format(word_sample_dict))
            L=L-self.reward_fn(word_sample_dict, index, query)*self.cross_entropy_loss(R_dash,R,word_sample)
        L = tf.convert_to_tensor(L,dtype=tf.float32)
        return L
    def train(self,query_df,query_doc_dict,epochs=10,batch_size=16):  # takes input as the queries and the documents retrieved by each query as a dict
        # query_df is the dataframe containing 'index' and 'query' column
        num_of_queries=query_df.shape[0] 
        self.query_doc_dict=query_doc_dict
        num_epochs=epochs
        optimizer=keras.optimizers.Adam(learning_rate = 0.01)
        for epoch in range(num_epochs):
            print('Start of training epoch : {}'.format(epoch))
            for i in range(int(num_of_queries/batch_size)):     
                X_batch=query_df.iloc[i*batch_size:(i+1)*batch_size] # X_batch is a dataframe with 'index' and 'query' column
                with tf.GradientTape() as tape:
                    loss = self.compute_loss_fn(X_batch)
                    print("Loss for epoch={} and batch={} : {}".format(epoch,i+1,loss))   
                gradients=tape.gradient(loss,self.model.trainable_variables)
                optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

                
            if num_of_queries%batch_size!=0:
                X_batch=query_df.iloc[int(num_of_queries/batch_size)*batch_size:]
                with tf.GradientTape() as tape:
                    loss=self.compute_loss_fn(X_batch)
                    print("Loss for epoch={} and batch=rem : {}".format(epoch,loss))   
                gradients=tape.gradient(loss,self.model.trainable_weights)
                optimizer.apply_gradients(zip(gradients,self.model.trainable_weights))
        
    def test(self,query_df,query_doc_dict):
        print('Testing the model : ')
        num_of_queries = query_df.shape[0]
        self.query_doc_dict = query_doc_dict
        map_arr = []
        for i in range(query_df.shape[0]):
            query = query_df.iloc[i]['query']
            index = query_df.iloc[i]['index']
            self.vocabulary=self.query_doc_dict[index]['word'].values
            F = self.data_preprocessing(index)
            R = self.prob_dist_query(F)
            word_sample, R_dash = self.sampling_test_function(R)
            R_MLE=self.prob_MLE_fn(query,word_sample)
            R_cap=self.interpolation_function(R_MLE,R_dash,alpha=0.6)
            word_sample_dict = dict(zip([self.vocabulary[i] for i in word_sample],R_cap.numpy()))
            print('query : {}'.format(query))
            print("Reformalised query terms -> {}".format(word_sample_dict))
            MAP = (self.eval_fn(word_sample_dict, index, query)).numpy()
            map_arr = map_arr + [MAP]
        return map_arr     

In [17]:
num_of_docs=10 # change this if number of documents retrieved by a query changes
num_of_features=4
agg_dim=int(np.sqrt(num_of_features)) # Hyperparameter

# data preprocessing for query file
quer_df_loc = 'Features/678-topics.txt' 
query_df=pd.read_csv(quer_df_loc,sep='=>',names=['index','query']) 
column_names=['word']
column_names=column_names+['tf{}'.format(i) for i in range(1,num_of_docs+1)]
column_names=column_names+['idf']
column_names=column_names+['dl{}'.format(i) for i in range(1,num_of_docs+1)]
column_names=column_names+['df']

# temporary
query_df = query_df[(query_df['index']!=312) & (query_df['index']!=348) & (query_df['index']!=424)]
# temporary

# list of dataframes for the vocab and feature of each word in a given query
ret_docs=[ret_query_doc(index,column_names) for index in query_df['index'].values ]
query_doc_dict={query_df.iloc[i]['index']:ret_docs[i] for i in range(len(ret_docs))}
#print(max([query_doc_dict[i].max().iloc[1:].max() for i in query_df['index'].values]))

# train-test split
train_test_split_ratio = 0.9
train_query_df = query_df.iloc[:int(train_test_split_ratio*query_df.shape[0])]
test_query_df = query_df.iloc[int(train_test_split_ratio*query_df.shape[0]):]
train_doc_dict = {query_df.iloc[i]['index']:ret_docs[i] for i in range(int(len(ret_docs)*train_test_split_ratio))}
test_doc_dict = {query_df.iloc[i]['index']:ret_docs[i] for i in range(int(len(ret_docs)*train_test_split_ratio),len(ret_docs))}

  import sys


In [18]:
# execute this cell to train the model.
# Takes input the list of queries as query_df and the documents retrieved by each query as query_doc_dict
start=time.time()
rml=RML_model(4) # pass K : the number of samples to extract for query reformulation
x = train_doc_dict[train_query_df.iloc[0]['index']]
temp = rml.model(tf.random.uniform(shape=[x.shape[0],num_of_docs,num_of_features]),training=False)
rml.model.load_weights('saved_weights.h5') # comment this statement to train the model from scratch
rml.train(train_query_df,train_doc_dict,epochs = 2, batch_size = 64)
end=time.time()
print('time elapsed in training the model : {} hrs'.format((end-start)/3600))


Start of training epoch : 0
----------------- Running query 301 and international organized crime -----------------
-> {'modern': 0.11157203, 'inasmuch': 0.09781163, 'inflation': 0.091674484, 'type': 0.09894185}
MAP value:  0.0046

----------------- Running query 302 and poliomyelitis post polio -----------------
-> {'unabashedly': 0.09747263, 'basic': 0.10538318, 'saying': 0.100855365, 'poetry': 0.09628884}
MAP value:  0.0817

----------------- Running query 303 and hubble telescope achievements -----------------
-> {'isolated': 0.095652215, 'simply': 0.11091007, 'shelter': 0.09195216, 'ability': 0.10148557}
MAP value:  0.1000

----------------- Running query 304 and endangered species mammals -----------------
-> {'avenue': 0.105154745, 'urial': 0.10382145, 'infrequency': 0.101156905, 'spring': 0.08986693}
MAP value:  0.0083

----------------- Running query 305 and most dangerous vehicles -----------------
-> {'shorter': 0.099341944, 'remember': 0.099328585, 'contact': 0.0977655, 'gu

-> {'township': 0.10149536, 'heavy': 0.10798068, 'brother': 0.08826521, 'rifle': 0.10225874}
MAP value:  0.0026

----------------- Running query 344 and abuses of e mail -----------------
-> {'control': 0.104587235, 'governance': 0.09956759, 'instance': 0.09884963, 'principal': 0.096995495}
MAP value:  0.0400

----------------- Running query 345 and overseas tobacco sales -----------------
-> {'logic': 0.10428723, 'p': 0.072766446, 'dump': 0.1281281, 'downturn': 0.09481823}
MAP value:  0.0220

----------------- Running query 346 and educational standards -----------------
-> {'careful': 0.094363205, 'education': 0.4188356, 'variety': 0.09292014, 'preclude': 0.09388106}
MAP value:  0.0000

----------------- Running query 347 and wildlife extinction -----------------
-> {'autonomous': 0.09501869, 'impact': 0.10517191, 'elimination': 0.10182738, 'plant': 0.09798203}
MAP value:  0.0007

----------------- Running query 349 and metabolism -----------------
-> {'microbiological': 0.10251216, 

MAP value:  0.0697

----------------- Running query 386 and teaching disabled children -----------------
-> {'cerebral': 0.097916014, 'sad': 0.10254767, 'trash': 0.10528945, 'interpret': 0.09424688}
MAP value:  0.0219

----------------- Running query 387 and radioactive waste -----------------
-> {'major': 0.092888825, 'laboratory': 0.09313186, 'select': 0.10347169, 'cesium': 0.11050763}
MAP value:  0.0229

----------------- Running query 388 and organic soil enhancement -----------------
-> {'innovative': 0.12762044, 'retention': 0.10287448, 'poor': 0.098961174, 'equivalent': 0.070543885}
MAP value:  0.0121

----------------- Running query 389 and illegal technology transfer -----------------
-> {'unavoidable': 0.10026007, 'legal': 0.3104449, 'charge': 0.09681601, 'usually': 0.09247905}
MAP value:  0.0010

----------------- Running query 390 and orphan drugs -----------------
-> {'handful': 0.096889704, 'suffer': 0.09430707, 'efficiency': 0.100425616, 'manufacture': 0.10837761}
MAP va

MAP value:  0.0264

----------------- Running query 429 and legionnaires disease -----------------
-> {'dioxide': 0.08968937, 'aspergillus': 0.08977741, 'filtration': 0.08977741, 'lung': 0.13075581}
MAP value:  0.0485

----------------- Running query 430 and killer bee attacks -----------------
-> {'cantaloupe': 0.10148268, 'pacific': 0.09960806, 'finding': 0.106100954, 'port': 0.09280831}
MAP value:  0.4907

----------------- Running query 431 and robotic technology -----------------
-> {'available': 0.104579665, 'prompt': 0.09368877, 'determine': 0.098625325, 'erosion': 0.10310625}
MAP value:  0.0218

Loss for epoch=0 and batch=2 : -1.269749402999878
----------------- Running query 432 and profiling motorists police -----------------
-> {'colin': 0.09965232, 'relate': 0.102752104, 'highlight': 0.09965232, 'chemical': 0.09794326}
MAP value:  0.0000

----------------- Running query 433 and greek philosophy stoicism -----------------
-> {'technocratic': 0.09437702, 'landing': 0.10046092

KeyboardInterrupt: 

In [13]:
# save the weights of the model
rml.model.save_weights('saved_weights.h5')

In [13]:
rml.model.trainable_weights[0]

<tf.Variable 'policy__network__model/policy__net__block/phi__block/w_phi:0' shape=(20,) dtype=float32, numpy=
array([0.30985805, 0.88752383, 0.269065  , 0.55500835, 0.7633393 ,
       0.62908787, 0.92710507, 0.2243061 , 0.54521054, 0.30970347,
       0.65288955, 0.64080924, 0.573331  , 0.2765277 , 0.9137944 ,
       0.22049949, 0.06107275, 0.41391474, 0.7974824 , 0.98787355],
      dtype=float32)>

In [15]:
# validaton 
MAP_score = rml.test(test_query_df,test_doc_dict)

Testing the model : 
query : railway accidents
Reformalised query terms -> {'labor': 0.13254432, 'reform': 0.10496382, 'drug': 0.08551115, 'yuan': 0.07698074}
MAP value:  0.0000

query : deregulation gas electric
Reformalised query terms -> {'company': 0.121422306, 'electric': 0.29850376, 'increase': 0.09016055, 'people': 0.08991339}
MAP value:  0.0000

query : tourism increase
Reformalised query terms -> {'percent': 0.11094677, 'office': 0.096856326, 'current': 0.09631947, 'change': 0.09587746}
MAP value:  0.0024

query : inventions scientific discoveries
Reformalised query terms -> {'world': 0.109131746, 'share': 0.09847072, 'task': 0.096212566, 'program': 0.096185}
MAP value:  0.0000

query : child labor
Reformalised query terms -> {'day': 0.10649319, 'outside': 0.10297185, 'school': 0.09563116, 'employment': 0.09490382}
MAP value:  0.0060

query : lyme disease
Reformalised query terms -> {'tick': 0.14465557, 'p': 0.09061281, 'di': 0.38284105, 'fever': 0.08189063}
MAP value:  0.3113

In [16]:
np.array(MAP_score).mean()

0.08041334

In [17]:
!nvidia-smi

Fri Aug 26 14:26:19 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K40c          On   | 00000000:04:00.0 Off |                    0 |
| 23%   33C    P8    23W / 235W |     11MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    