In [1]:
import re
import numpy as np
import pandas as pd

In [317]:
#main class that will process logical queries
class Query:
    
    def __init__(self,query)->None:
        '''
        Constructor function for query class. Takes in a single query
        Input type:str.
        For naive implementation, this function assumes that the query is of the type: a or b and c .......
        Any brackets or illegal characters will cause the query to fail.
        Also,it is assumed that the query characters are space separated.
        
        '''
        self.query=query.lower()
    
    #this function will process query and return a result
    def process_query(self,doc):
        '''
        Function to perform query operation on the input document.
        Inputs: a pandas dataframe object of the term document index form.
        
        '''
        #split the query into a list
        querycpy=self.query.split(" ")
        
        querycpy.reverse()
        
        #list of all possible query joiners
        list_bools=['and','not','or']
        
        prevEle=None
        nextEle=None
        prevQuery=None
        err=False
        
        while(len(querycpy)>0):
            element=querycpy.pop(0)
            
            #check if element is a boolean operator //O(3)
            if element in list_bools:
                
                #if boolean operator doesnt have any operation to perform
                if prevEle is None:
                    err=True
                    break
                    
                elif element=="not":
                    
                    prevQuery=self.solve_query(prevEle,nextEle,element,doc,prevQuery)
                    prevEle=prevQuery
                    nextEle=None
                    
                elif prevEle is not None and nextEle is None:
                    
                    nextEle=querycpy.pop()
                    prevQuery=self.solve_query(prevEle,nextEle,element,doc,prevQuery)
                    prevEle=prevQuery
                    nextEle=None
                    
            elif prevEle is not None and nextEle is None:
                nextEle=element
                
            elif prevEle is None and nextEle is None:
                
                prevEle=element
            
            elif prevEle is None and nextEle is not None:
                
                err=True
                break
        if err:
            return None
        return prevQuery
    

                
    #this function will solve the query being generated from process query function
    def solve_query(self,prevEle,nextEle,operator,df,prevQuery):
        '''
        This function will solve the query being generated from process query function.
        returns a binary string representing the boolean operation of the query on tdi.
        Input: term1,term2,boolean query type,document.
        Returns: a dataframe of found documents
        
        '''
        result=None
        print('Prev :{}'.format(prevEle))
        print('Next :{}'.format(nextEle))
        print('Operator :{}'.format(operator))
        print('prevQuery :{}'.format(prevQuery))


        if prevQuery is None:
            
            if operator=="not":
                
                result=df.loc[df["tokens"]==prevEle]
                result=self.convert(result)
                result=self.not_operation(result)
                
            
            elif operator=="or":
                
                result=df.loc[df['tokens']==prevEle]
                result=self.convert(result)
          
                result2=df.loc[df['tokens']==nextEle]
                result2=self.convert(result2)   
                
                result=self.or_operation(result,result2)
            
            elif operator=="and":
                
                result=df.loc[df['tokens']==prevEle]
                result=self.convert(result)   
                result2=df.loc[df['tokens']==nextEle]
                result2=self.convert(result2)   
                
                result=self.and_operation(result,result2)
        
        else:
            
            if operator=="not":
                
                result=self.not_operation(prevQuery)
                
            elif operator=="and":
                
                result2=df.loc[df['tokens']==nextEle]
                result2=self.convert(result2)   
                
                result=self.and_operation(prevQuery,result2)
                
            elif operator=="or":
                
                result2=df.loc[df['tokens']==nextEle]
                result2=self.convert(result2)   
              
                result=self.or_operation(prevQuery,result2)
                                
        return result    
                
                
                
    #not operation function      
    def not_operation(self,data):
        '''
        Perform not operation on a list.
        Input data->list (strings)
        returns list.
        '''
     
        dt=data[:]
       
        for (idx,ele) in enumerate(dt):
     
            if ele=="0":
                dt[idx]="1"
                
            elif ele=="1":
                dt[idx]="0"
        return dt
    
    
    #and operation function
    def and_operation(self,data,data2):
        '''
        Perform and operation on a list.
        Input data->list (strings),number=2
        returns list.
        '''
        dt=data[:]
        dt2=data2[:]
        
        idx=0
        for (ele1,ele2) in zip(dt,dt2):
            if bool(int(ele1)) and bool(int(ele2)):
                dt[idx]="1"
                
            else:    
                dt[idx]="0"
            idx+=1
            
        return dt
    


    
    #OR operation function
    def or_operation(self,data,data2):
        '''
        Perform OR operation on 2 list.
        Input data->list (strings),number =2
        returns list.
        '''
        dt=data[:]
        dt2=data2[:]
        
        idx=0
        
        for (ele1,ele2) in zip(dt,dt2):
            if bool(int(ele1)) or bool(int(ele2)):
                dt[idx]="1"
                
            else:    
                dt[idx]="0"
            idx+=1
            
        return dt
                        
    def convert(self,data):
        '''
        Converts empty dataframe to 0 list and series to list.
        '''
        print(data)
        if isinstance(data,pd.DataFrame):
            
            if data.empty:
                lis=["0" for ele in list(data.columns)]
                lis.pop(0)
                return lis
            
            else:
                lis=data.drop(labels=["tokens"],axis=1)
                lis=np.array(lis.values.tolist()).ravel()
                return list(map(str,lis.tolist()))
        
        elif isinstance(data,pd.Series):
            return list(map(str,data.tolist()))
        
            
        

                
            
        
        
        
    
    

In [318]:
help(Query.__init__)

Help on function __init__ in module __main__:

__init__(self, query) -> None
    Constructor function for query class. Takes in a single query
    Input type:str.
    For naive implementation, this function assumes that the query is of the type: a or b and c .......
    Any brackets or illegal characters will cause the query to fail.
    Also,it is assumed that the query characters are space separated.



In [319]:
bool(int("1"))

True

<h2>Testing Query</h2>

In [322]:
query="maroon and adam"
q=Query(query)
df=pd.read_csv("../data/term_matrix.csv")

In [323]:
q.process_query(df)

Prev :adam
Next :maroon
Operator :and
prevQuery :None
    tokens  dbz  doomEternal  twice  maroon5  coldplay  bioshock
523   adam    0            0      0        1         0         0
     tokens  dbz  doomEternal  twice  maroon5  coldplay  bioshock
481  maroon    0            0      0        1         0         0


['0', '0', '0', '1', '0', '0']

In [324]:
df.loc[df['tokens']=='dragon'].value_counts()

tokens  dbz  doomEternal  twice  maroon5  coldplay  bioshock
dragon  1    0            0      0        0         0           1
dtype: int64

In [325]:
df.empty

False