In [6]:
import os

# Project root that the notebook switches into so the local `database` and
# `dataset` packages resolve. Set DATA_VIS_KAKENHI_ROOT to point at your own
# checkout; when unset, the original author's path is used as the default.
MODULE = os.environ.get(
    "DATA_VIS_KAKENHI_ROOT",
    "/Users/alifian/Documents/GitHub/data-vis-kakenhi",
)
os.chdir(MODULE)
os.getcwd()

'/Users/alifian/Documents/GitHub/data-vis-kakenhi'

In [7]:
# utils import
from database.database import load_researchers
from dataset.dataset import *

# Load the English data, the FAISS index, the keyword model and the researchers handle.
# load_en_data / load_en_faiss / load_en_model are not imported by name in this cell;
# presumably they are provided by the wildcard import of dataset.dataset — TODO confirm
# and import them explicitly.
# `researchers` is queried later in the notebook through `.aggregate(...)`.
en_data = load_en_data()
en_faiss = load_en_faiss()
en_model = load_en_model()
researchers = load_researchers()

loading data
loading faiss index
loading en keyword model
Connecting to cluster


In [14]:
import pandas as pd
import numpy as np

def vector_search(query, model, index, num_results=10):
    """Encode the query text(s) with ``model`` and search ``index`` for neighbours.

    Args:
        query (str or iterable of str): A single query string, or several
            query strings. A bare string is treated as ONE query.
        model: Object exposing ``encode(list_of_str)``; its result is converted
            to a float32 NumPy array before searching (presumably a
            sentence-transformers model — confirm against the loader).
        index: Object exposing ``search(vectors, k=...)`` that returns a
            ``(D, I)`` pair (presumably a FAISS index — confirm against the loader).
        num_results (int): Passed to ``index.search`` as ``k``.

    Returns:
        D: First element returned by ``index.search`` (distances, per the
            original documentation).
        I: Second element returned by ``index.search`` (result ids, per the
            original documentation).
    """
    # list("abc") would explode a plain string into ['a', 'b', 'c'] and encode
    # each character separately, so wrap a bare string instead of iterating it.
    queries = [query] if isinstance(query, str) else list(query)
    vector = model.encode(queries)
    D, I = index.search(np.array(vector).astype("float32"), k=num_results)
    return D, I

def get_keywords(model, data, index, kw, num=10):
    ''' Look up the keywords nearest to a query and return them with their member info.

    Parameters
    ----------
        model : model handed to vector_search to encode the query
        data : data frame with at least the columns id, keyword, memberID
               and memberID_type
        index : index handed to vector_search to find the nearest ids
        kw : string query
        num : number of neighbours requested from the index (default is 10)

    Returns
    -------
        three parallel lists: keywords, member IDs (as str) and member ID
        types (as str). Ids returned by the search that do not occur in
        data.id are skipped, so the lists may be shorter than num.

    '''
    # Only the neighbour ids are needed; the distances are discarded.
    _, neighbour_ids = vector_search([kw], model, index, num_results=num)
    known_ids = set(data.id)

    keywords = []
    member_ids = []
    member_id_types = []

    for candidate in neighbour_ids.flatten().tolist():
        # Skip ids the index knows about but the data frame does not.
        if candidate not in known_ids:
            continue
        match = data[data.id == candidate]
        # If several rows share an id, the first one wins.
        keywords.append(match.keyword.values[0])
        member_ids.append(match.memberID.values[0])
        member_id_types.append(match.memberID_type.values[0])

    table = pd.DataFrame({'kw': keywords, 'memid': member_ids, 'idtype': member_id_types})
    # Normalise the id columns to strings before handing them back.
    table.memid = table.memid.astype(str)
    table.idtype = table.idtype.astype(str)

    return list(table['kw']), list(table['memid']), list(table['idtype'])

In [25]:
# Query phrase handed to get_keywords as the search term.
kw = "machine learning"

In [69]:
# Ask for the 50 nearest keywords to `kw`, with their member ids and id types.
keywords_list, id_list, id_type = get_keywords(
    model=en_model,
    data=en_data,
    index=en_faiss,
    kw=kw,
    num=50,
)

In [70]:
def flatten_str_list(memid):
    """Split each comma-separated string in ``memid`` and return all pieces as one flat list.

    Pieces keep their original order; an empty string contributes a single
    empty piece, exactly as ``str.split(',')`` produces it.
    """
    return [piece for joined in memid for piece in joined.split(',')]

In [71]:
# Split the comma-joined id strings and de-duplicate them.
# Going through set() means the resulting order is arbitrary.
ids = list(set(flatten_str_list(id_list)))

In [77]:
# $project stage: keep only the `.en` institution name and researcher name
# (renamed to `institution` / `name`) and suppress `_id`.
select_field = {
    "$project": {
        "institution": "$institutionName.en",
        "name": "$name.en",
        "_id": 0,
    },
}

In [81]:
# Match documents whose researchmapID or kakenhiID is one of the collected ids.
id_query = {
    "$or": [
        {"researchmapID": {"$in": ids}},
        {"kakenhiID": {"$in": ids}},
    ],
}

# Stages: filter by id, project the display fields, then cap at 100 documents.
pipeline = [
    {"$match": {"$and": [id_query]}},
    select_field,
    {"$limit": 100},
]

In [82]:
from pandas import DataFrame

In [94]:
# Materialise the aggregation results into a frame.
# The columns are pinned to the two fields projected by the pipeline so the frame
# keeps a stable schema even when no document matches; without this an empty result
# produces a frame with no columns and later df["institution"] raises KeyError.
# Keep this list in sync with the $project stage if more fields are added there.
df = DataFrame(
    list(researchers.aggregate(pipeline)),
    columns=["institution", "name"],
)

In [96]:
# Display the institution / name table built from the aggregation results.
df

Unnamed: 0,institution,name
0,Ritsumeikan University,YAMADA Mayumi
1,Okinawa Institute of Science and Technology Gr...,VELASQUE Mariana
2,Tohoku University,
3,Ritsumeikan Asia Pacific University,LEE Timothy
4,Okayama University,HAYAKAWA Toru
5,Osaka University,Nakai Hiroshi
6,Meiji University,Adams Andrew A.
7,Tokyo Metropolitan University,GROISARD Jocelyn
8,Nagoya University,mangyo eiji
9,Saitama University,Bez Neal


In [116]:
# Group researcher names under their institution, in order of first appearance.
# Note: the "firstName" key receives the whole value of the `name` column.
return_data = []
for inst in df["institution"].unique():
    print(inst)
    names_at_inst = df.loc[df["institution"] == inst, "name"]
    return_data.append({
        "institution": inst,
        "researcherProfiles": [{"firstName": full_name} for full_name in names_at_inst],
    })

Ritsumeikan University
Okinawa Institute of Science and Technology Graduate University
Tohoku University
Ritsumeikan Asia Pacific University
Okayama University
Osaka University
Meiji University
Tokyo Metropolitan University
Nagoya University
Saitama University
Hitotsubashi University
Kyoto University
Tokyo Medical and Dental University
Tokyo University of Science
Tokyo University of Foreign Studies
National Institute of Health Sciences
Kyushu University
Kanazawa University
The University of Tokyo
Shiga University
Aichi Prefectural University
Tokyo Institute of Technology
The University of Shiga Prefecture
Institute of Physical and Chemical Research


In [117]:
# Display the per-institution list of researcher profiles.
return_data

[{'institution': 'Ritsumeikan University',
  'researcherProfiles': [{'firstName': 'YAMADA Mayumi'}]},
 {'institution': 'Okinawa Institute of Science and Technology Graduate University',
  'researcherProfiles': [{'firstName': 'VELASQUE Mariana'}]},
 {'institution': 'Tohoku University',
  'researcherProfiles': [{'firstName': ''}, {'firstName': 'Nagai Hiroki'}]},
 {'institution': 'Ritsumeikan Asia Pacific University',
  'researcherProfiles': [{'firstName': 'LEE Timothy'},
   {'firstName': 'MANTELLO Peter A.'}]},
 {'institution': 'Okayama University',
  'researcherProfiles': [{'firstName': 'HAYAKAWA Toru'},
   {'firstName': 'RUCYNSKI John'}]},
 {'institution': 'Osaka University',
  'researcherProfiles': [{'firstName': 'Nakai Hiroshi'}]},
 {'institution': 'Meiji University',
  'researcherProfiles': [{'firstName': 'Adams Andrew A.'}]},
 {'institution': 'Tokyo Metropolitan University',
  'researcherProfiles': [{'firstName': 'GROISARD Jocelyn'}]},
 {'institution': 'Nagoya University',
  'resea

In [48]:
# Sample of a nested institution -> researcherProfiles -> papers structure.
# The values look like placeholder text rather than real records.
return_format = [
    {
        "institution": "Turnabout",
        "relatedResearchers": 306,
        "researcherProfiles": [
            {
                "firstName": "Ines",
                "lastName": "Bean",
                "papers": [
                    {"paperName": "Ea exercitation reprehenderit minim magna sit consequat elit occaecat qui duis veniam."},
                    {"paperName": "Voluptate incididunt consequat qui dolor dolore cupidatat duis do quis nulla."},
                    {"paperName": "Excepteur ea laboris aute mollit culpa anim Lorem sit do ipsum ullamco labore sit amet."},
                    {"paperName": "Est adipisicing consectetur aliqua Lorem ex sunt ex sit voluptate."},
                    {"paperName": "Ex anim exercitation ut nostrud et anim nostrud dolore adipisicing."},
                ],
            },
        ],
    },
]