In [1]:
import json
import nltk
import enum
import pandas as pd
from nltk.tokenize import word_tokenize
from collections import defaultdict
import numpy as np
from typing import List,Tuple
import spacy

### Make sure you have run ./DATA/pre_process.ipynb first: Boolean_IR below reads JSON files from DATA/Module_data/ and DATA/crawler/data/.

In [2]:
class Query_type(enum.Enum):
    """Search modes accepted by ``Boolean_IR.query``."""

    # Match the query against author names.
    AUTHOR = "Author Based"

    # Match the query against paper titles.
    TITLE = "Title Based"
def is_int(s) -> bool:
    """Return True when ``int(s)`` succeeds, False otherwise.

    Only the errors ``int()`` raises for unsuitable input are treated as
    "not an integer": ``ValueError`` (e.g. ``"abc"``, NaN), ``TypeError``
    (e.g. ``None``) and ``OverflowError`` (e.g. infinity). Anything else,
    notably ``KeyboardInterrupt``, is allowed to propagate instead of being
    silently swallowed.
    """
    try:
        int(s)
    except (ValueError, TypeError, OverflowError):
        return False
    return True
class Boolean_IR:
    """Keyword-overlap retrieval over paper titles and author names.

    Titles and author names are represented as sequences of integer word
    ids. A query is scored against a candidate by counting the number of
    equal (query id, candidate id) pairs; the ``k`` highest-scoring
    candidates are returned.

    ``pre_process_authors`` and ``pre_process_title`` must be called once
    after construction and before the first ``query``.
    """

    @staticmethod
    def _load_json(path: str):
        """Parse the JSON file at ``path`` and close the file handle afterwards."""
        with open(path, "r") as handle:
            return json.load(handle)

    @staticmethod
    def _match_count(query_ids, candidate_ids) -> int:
        """Count the (query id, candidate id) pairs that are equal.

        Duplicated ids on either side are counted once per pair. Broadcasting
        a column of query ids against a row of candidate ids evaluates every
        pair in a single vectorised comparison.
        """
        query_col = np.asarray(query_ids).reshape(-1, 1)
        candidate_row = np.asarray(candidate_ids).reshape(1, -1)
        return int(np.count_nonzero(query_col == candidate_row))

    def __init__(self):
        """Load the JSON data files and the spaCy pipeline used for lemmatisation."""
        # Author name -> author id (non-conforming entries are dropped by
        # pre_process_authors).
        self.author_to_id = self._load_json("DATA/Module_data/author_to_id.json")
        # Author id -> keys into ``self.documents``.
        self.author_to_doc = self._load_json("DATA/Module_data/author_to_doc.json")
        # Loaded for completeness; not read by any method of this class.
        self.authorid_to_num_id = self._load_json("DATA/Module_data/authorid_to_num_id.json")
        # Document key -> document record (a mapping with at least a "title" entry).
        self.documents = self._load_json("DATA/crawler/data/NLP.json")
        # Title lemma -> integer word id.
        self.lemma_title = self._load_json("DATA/Module_data/title_lemma.json")
        # Document key -> word ids of its title (turned into arrays by pre_process_title).
        self.bool_dic_title = self._load_json("DATA/Module_data/bool_dic_title.json")
        self.nlp = spacy.load("en_core_web_sm")

    def title_tokenizer(self, s: str) -> List[str]:
        """Return the lemmas of ``s``, skipping lemmas that are spaCy stop words."""
        stop_words = self.nlp.Defaults.stop_words
        return [token.lemma_ for token in self.nlp(s) if token.lemma_ not in stop_words]

    def flatten(self, l: List[List]) -> List:
        """Concatenate the sub-lists of ``l`` into a single flat list."""
        return [item for sublist in l for item in sublist]

    def word_tokenize_authoe(self, t: str) -> List:
        """Tokenise an author name with NLTK's ``word_tokenize``.

        When the tokenizer emits a final "." as a token of its own, it is
        glued back onto the preceding token so that a trailing initial stays
        in one piece. Empty input and a lone "." are returned as tokenised
        instead of raising ``IndexError``.
        """
        tokens = word_tokenize(t)
        if len(tokens) >= 2 and tokens[-1] == ".":
            tokens[-2] += tokens[-1]
            return tokens[:-1]
        return tokens

    def pre_process_authors(self) -> None:
        """Build the author vocabulary and the per-author word-id vectors.

        Sets ``all_names`` (distinct name tokens), ``w_mapping`` (token ->
        id, starting at 1 so that 0 can stand for "unknown token" in
        ``author_ir``) and ``bool_dic_author`` (author id -> array of token
        ids). Entries of ``author_to_id`` whose name is empty or integer-like,
        or whose id is not integer-like, are removed.
        """
        # Tokenise every non-numeric name once and reuse the result below.
        tokens_by_name = {
            name: self.word_tokenize_authoe(name)
            for name in self.author_to_id
            if not is_int(name)
        }
        # Sorted so that the id assignment does not depend on set ordering.
        self.all_names = sorted(set(self.flatten(list(tokens_by_name.values()))))
        # A plain dict: the vocabulary is fixed once built, so an unknown
        # token is a KeyError here rather than a half-working default factory.
        self.w_mapping = {word: idx for idx, word in enumerate(self.all_names, start=1)}
        self.bool_dic_author = defaultdict(list)

        rejected = []
        for name, author_id in self.author_to_id.items():
            if name and name in tokens_by_name and is_int(author_id):
                self.bool_dic_author[author_id] = np.array(
                    [self.w_mapping[word] for word in tokens_by_name[name]]
                )
            else:
                rejected.append(name)
        for name in rejected:
            del self.author_to_id[name]

    def pre_process_title(self) -> None:
        """Convert every title's word-id list in ``bool_dic_title`` to a NumPy array."""
        for key, ids in self.bool_dic_title.items():
            self.bool_dic_title[key] = np.asarray(ids)

    def title_ir(self, wk: List[str], k: int = 10) -> List[Tuple[str, int]]:
        """Rank documents by word overlap between their title and the query.

        Args:
            wk: Query lemmas; lemmas missing from ``lemma_title`` get id 0.
            k: Maximum number of results.

        Returns:
            Up to ``k`` ``(document key, score)`` pairs, best score first.
            Documents whose "title" is not a string are skipped.
        """
        query_ids = [self.lemma_title.get(word, 0) for word in wk]
        scored = [
            (key, self._match_count(query_ids, self.bool_dic_title[key]))
            for key, document in self.documents.items()
            if isinstance(document["title"], str)
        ]
        return sorted(scored, key=lambda pair: pair[1], reverse=True)[:k]

    def author_ir(self, input_wk: List[str], k) -> List:
        """Rank authors by word overlap between their name and the query.

        Args:
            input_wk: Query name tokens; tokens missing from ``w_mapping``
                get id 0, which is never assigned to a real token.
            k: Maximum number of results.

        Returns:
            Up to ``k`` ``(author name, score)`` pairs, best score first.
        """
        query_ids = [self.w_mapping.get(word, 0) for word in input_wk]
        scored = [
            (name, self._match_count(query_ids, self.bool_dic_author[author_id]))
            for name, author_id in self.author_to_id.items()
        ]
        return sorted(scored, key=lambda pair: pair[1], reverse=True)[:k]

    def query(self, type: "Query_type", input_string: str, k: int = 10) -> Tuple[List, List]:
        """Run a title or author search.

        Args:
            type: ``Query_type.TITLE`` or ``Query_type.AUTHOR``.
            input_string: Free-text query; lower-cased and stripped first.
            k: Maximum number of ranked matches.

        Returns:
            ``(articles, ranking)``. For a title query ``ranking`` is the
            output of ``title_ir`` and ``articles[i]`` is the document of
            ``ranking[i]``. For an author query ``ranking`` is the output of
            ``author_ir`` and ``articles`` is the concatenation of the matched
            authors' documents truncated to ``k``, so it is NOT aligned
            index-by-index with ``ranking``.

        Raises:
            ValueError: If ``type`` is not a supported query type.
        """
        cleaned = input_string.lower().strip()
        if type == Query_type.TITLE:
            mapping = self.title_ir(self.title_tokenizer(cleaned), k)
            articles = [self.documents[doc_key] for doc_key, _ in mapping]
            return (articles, mapping)
        if type == Query_type.AUTHOR:
            names = self.author_ir(self.word_tokenize_authoe(cleaned), k)
            articles = self.flatten(
                [
                    [self.documents[doc_key] for doc_key in self.author_to_doc[self.author_to_id[name]]]
                    for name, _ in names
                ]
            )
            return (articles[:k], names)
        raise ValueError(f"Unsupported query type: {type!r}")

In [3]:
# Build the retrieval engine (loads the JSON data files and the spaCy model),
# then run both one-off preparation passes before any query is issued.
boolean_ir = Boolean_IR()
for prepare in (boolean_ir.pre_process_authors, boolean_ir.pre_process_title):
    prepare()


In [20]:
# Author search demo.
# For author queries, query() returns `articles` as one flattened list of all
# matched authors' papers truncated to k, so articles[i] is not necessarily a
# paper by names[i]. Look up each author's papers explicitly instead of
# zipping the two lists together.
articles, names = boolean_ir.query(Query_type.AUTHOR, "mir h. ali", k=10)
for author in names:
    # `author` is an (author name, match score) pair.
    print(author)
    for paper_key in boolean_ir.author_to_doc[boolean_ir.author_to_id[author[0]]]:
        print(boolean_ir.documents[paper_key]["paperId"])

('mir h. ali', 3)
7d4d45f164370c368f00f713dc4b1b2f810b2a01
('h. h. muljo', 2)
be716f9ea2d22d2855ba06aa466c35ca23023172
('h. wallach', 1)
d47a682723f710395454687319bb55635e653105
('h. cunningham', 1)
3e65f572322e192fe36ae52a8a7f025b0685dfc6
('ali daud', 1)
835ac3cbb41f2ec47718c5491211dd33b64f382b
('h. schwarz', 1)
2bd2e082913c5366129622fbee1fe24f2dfa696f
('h. wang', 1)
a1f7045dd66b01f46843592035a26ea407b48982
('alexander h. miller', 1)
d19b000d782ca90138a38bc7c882a992a99e38c8
('h. chase', 1)
6e6825c2feada559592e49b093a06fc27214c6c0
('patrick h. duffy', 1)
5b7929b7e1e74865683e3d1dc5bfd062ef1cab6b


In [5]:
# Title search demo: `mapping` holds (document key, score) pairs and `articles`
# holds the matching document records in the same order, so zipping is safe.
articles, mapping = boolean_ir.query(
    Query_type.TITLE,
    "A Survey of Data Augmentation Approaches for NLP",
    k=10,
)
for hit, article in zip(mapping, articles):
    print(hit, article["title"], sep="\n")

('63d8426ba1f51a8525dd19fd8ec92934ec71aea5', 5)
A Survey of Data Augmentation Approaches for NLP
('013eb12ce5468f79d58bf859653f4929c5a2bd14', 5)
An Empirical Survey of Data Augmentation for Limited Data Learning in NLP
('69870a6f189a8a68d6dc1e25b6024291711f43a0', 4)
Clinical Decision Support Systems: A Survey of NLP-Based Approaches from Unstructured Data
('c9b56cb026a38e39bb0228faac57accd6f65e6f7', 3)
TextAttack: A Framework for Adversarial Attacks, Data Augmentation, and Adversarial Training in NLP
('982aa0ee48a5fd228fb9fb3b3edd319b8af6f76d', 3)
Text Data Augmentation Made Simple By Leveraging NLP Cloud APIs
('2a8a2ab581f2e89c9a66e1b353346e1bb86ee6f6', 3)
Mixup-Transformer: Dynamic Data Augmentation for NLP Tasks
('9dff2ada58cb7a836ec6a23ce8e22d7ce8e0b81f', 3)
NLP-Based Approach to Semantic Classification of Heterogeneous Transportation Asset Data Terminology
('ddbbf8dd6f4cc4cbf6efba22196e03b37bcd349f', 3)
Sentiment Analysis Approaches on Different Data set Domain: Survey
('d47a68272

In [7]:
# Inspect how a title is turned into lemmas and then into word ids
# (a lemma that is missing from lemma_title is shown as 0).
sample_title = "A Survey of Data Augmentation Approaches for NLP"
wk = boolean_ir.title_tokenizer(sample_title.lower())
words = np.array([boolean_ir.lemma_title.get(lemma, 0) for lemma in wk])
print(wk)
print(words)

['survey', 'datum', 'augmentation', 'approach', 'nlp']
[2 4 5 6 8]
