In [1]:
import os

MODULE = "/Users/alifian/Documents/GitHub/data-vis-kakenhi"
os.chdir(MODULE)
os.getcwd()

'/Users/alifian/Documents/GitHub/data-vis-kakenhi'

In [2]:
# utils import
from database.database import load_researchers
from dataset.dataset import *

# load data, models, collections
en_data = load_en_data()
en_faiss = load_en_faiss()
en_model = load_en_model()
researchers = load_researchers()

loading data
loading faiss index
loading en keyword model
Connecting to cluster


In [3]:
import pandas as pd
import numpy as np

def vector_search(query, model, index, num_results=10):
    """Tranforms query to vector using a pretrained, sentence-level 
    DistilBERT model and finds similar vectors using FAISS.
    Args:
        query (str): User query that should be more than a sentence long.
        model (sentence_transformers.SentenceTransformer.SentenceTransformer)
        index (`numpy.ndarray`): FAISS index that needs to be deserialized.
        num_results (int): Number of results to return.
    Returns:
        D (:obj:`numpy.array` of `float`): Distance between results and query.
        I (:obj:`numpy.array` of `int`): Paper ID of the results.
    
    """
    vector = model.encode(list(query))
    D, I = index.search(np.array(vector).astype("float32"), k=num_results)
    return D, I

def get_keywords(model, data, index, kw, num=10):
    ''' get a list of similar keywords

    Parameters
    ----------
        model : the model to be used for extracting keywords from
        data : the data frame that contains full information related to a given index
        kw : string querry
        num: number of resutls to be extracted (default is 20)

    Returns
    -------
        tuple of a list of strig keywords and a list of coresponding memberIDs
    
    '''

    # Load data and models
    D, I = vector_search([kw], model, index, num_results=num)
    set_kw_ids = set(data.id)
    

    kw_tar = list()
    member_id = list()
    id_type = list()

    # print(data)

    for id_ in I.flatten().tolist():
            if id_ in set_kw_ids:
                f = data[(data.id == id_)]
            else:
                continue
            kw_tar.append(f.keyword.values[0])
            member_id.append(f.memberID.values[0])
            id_type.append(f.memberID_type.values[0])

    
    df = pd.DataFrame({'kw': kw_tar, 'memid': member_id, 'idtype': id_type})
    df.memid = df.memid.astype(str)
    df.idtype = df.idtype.astype(str)
    # print(df)
    # df = df.groupby(['kw'])['memid'].apply(','.join ).reset_index()
    # print(df)

    return list(df['kw']), list(df['memid']), list(df['idtype'])

In [4]:
kw = "machine learning"

In [5]:
keywords_list, id_list, id_type = get_keywords(en_model, en_data, en_faiss, kw, 50)

In [6]:
def flatten_str_list(memid):
    return_list = list()
    for ids in memid:
        for idsplit in ids.split(','):
            return_list.append(idsplit)
    return return_list

In [7]:
ids = list(set(flatten_str_list(id_list)))

In [8]:
select_field = {
    "$project" :
       {
         "institution" : "$institutionName.en",
         "name" : "$name.en",
        "kakenhiID": "$kakenhiID",
         "_id" : 0
       }
    }

In [9]:
id_query = {"$or": [{ "researchmapID": { "$in": ids } }, { "kakenhiID": { "$in": ids } }]}
pipeline = [
    {"$match": {
        "$and": [
            id_query,
        ],
    }},
    select_field,
    {"$limit" :  100},
]

In [10]:
from pandas import DataFrame

In [11]:
df = DataFrame(list(researchers.aggregate(pipeline)))

In [12]:
df

Unnamed: 0,institution,name,kakenhiID
0,Tokyo Metropolitan University,CHIN WEIHONG,10876650
1,Osaka University,LI LIANGZHI,10875545
2,Osaka Prefecture University,Roy Parthapratim,10837222
3,Okinawa Institute of Science and Technology Gr...,KATIC Jelena,10834947
4,Kanazawa University,Gary Ross,10708142
...,...,...,...
95,The University of Tokyo,Berman Naomi,60814407
96,International Christian University,PICHL Lukas,10343394
97,National Institute of Information and Communic...,Mano Hiroaki,10571581
98,Osaka University,TABATA Tomoji,10249873


In [13]:
df[df["institution"]==inst][['name', 'kakenhiID']]

NameError: name 'inst' is not defined

In [39]:
return_data = list()

for inst in list(df['institution'].unique()):
    researcher_profiles = list()
    print(inst)
    filtered_df = df[df["institution"]==inst][['name', 'kakenhiID']]
    for idx in range(len(filtered_df)):
        profile_name = filtered_df.iloc[idx][0] # name
        kak_id = filtered_df.iloc[idx][1] # kakenhi ID
        ob = {"firstName": profile_name, "kakenhiID": kak_id}
        researcher_profiles.append(ob)
    print({"institution": inst, "researcherProfiles": researcher_profiles})
    return_data.append({"institution": inst, "researcherProfiles": researcher_profiles})

Tokyo Metropolitan University
{'institution': 'Tokyo Metropolitan University', 'researcherProfiles': [{'firstName': 'CHIN WEIHONG', 'kakenhiID': '10876650'}, {'firstName': 'PETER GUENTERT', 'kakenhiID': '20392110'}]}
Osaka University
{'institution': 'Osaka University', 'researcherProfiles': [{'firstName': 'LI LIANGZHI', 'kakenhiID': '10875545'}, {'firstName': 'ORLOSKY JASON', 'kakenhiID': '10815111'}, {'firstName': 'Yoo Donghoon', 'kakenhiID': '20868541'}, {'firstName': 'Tamura Shinichi', 'kakenhiID': '30029540'}, {'firstName': 'SHIMIZU Masaki', 'kakenhiID': '20550304'}, {'firstName': 'MACPHERSON TOM', 'kakenhiID': '40821898'}, {'firstName': 'Nakagawa Ikuo', 'kakenhiID': '70647437'}, {'firstName': 'TAKAGI TATSUYA', 'kakenhiID': '80144517'}, {'firstName': 'TABATA Tomoji', 'kakenhiID': '10249873'}]}
Osaka Prefecture University
{'institution': 'Osaka Prefecture University', 'researcherProfiles': [{'firstName': 'Roy Parthapratim', 'kakenhiID': '10837222'}, {'firstName': 'Chiang YiHan', 'ka

In [40]:
return_data

[{'institution': 'Tokyo Metropolitan University',
  'researcherProfiles': [{'firstName': 'CHIN WEIHONG',
    'kakenhiID': '10876650'},
   {'firstName': 'PETER GUENTERT', 'kakenhiID': '20392110'}]},
 {'institution': 'Osaka University',
  'researcherProfiles': [{'firstName': 'LI LIANGZHI', 'kakenhiID': '10875545'},
   {'firstName': 'ORLOSKY JASON', 'kakenhiID': '10815111'},
   {'firstName': 'Yoo Donghoon', 'kakenhiID': '20868541'},
   {'firstName': 'Tamura Shinichi', 'kakenhiID': '30029540'},
   {'firstName': 'SHIMIZU Masaki', 'kakenhiID': '20550304'},
   {'firstName': 'MACPHERSON TOM', 'kakenhiID': '40821898'},
   {'firstName': 'Nakagawa Ikuo', 'kakenhiID': '70647437'},
   {'firstName': 'TAKAGI TATSUYA', 'kakenhiID': '80144517'},
   {'firstName': 'TABATA Tomoji', 'kakenhiID': '10249873'}]},
 {'institution': 'Osaka Prefecture University',
  'researcherProfiles': [{'firstName': 'Roy Parthapratim',
    'kakenhiID': '10837222'},
   {'firstName': 'Chiang YiHan', 'kakenhiID': '10824196'}]},
 

In [48]:
return_format = [{
    "institution": "Turnabout",
    "relatedResearchers": 306,
    "researcherProfiles": [
      {
        "firstName": "Ines",
        "lastName": "Bean",
        "papers": [
          {
            "paperName":
              "Ea exercitation reprehenderit minim magna sit consequat elit occaecat qui duis veniam.",
          },
          {
            "paperName":
              "Voluptate incididunt consequat qui dolor dolore cupidatat duis do quis nulla.",
          },
          {
            "paperName":
              "Excepteur ea laboris aute mollit culpa anim Lorem sit do ipsum ullamco labore sit amet.",
          },
          {
            "paperName":
              "Est adipisicing consectetur aliqua Lorem ex sunt ex sit voluptate.",
          },
          {
            "paperName":
              "Ex anim exercitation ut nostrud et anim nostrud dolore adipisicing.",
          },
        ],
      }]
}]