In [1]:
import os

MODULE = "/Users/alifian/Documents/GitHub/data-vis-kakenhi"
os.chdir(MODULE)
os.getcwd()

'/Users/alifian/Documents/GitHub/data-vis-kakenhi'

In [2]:
# utils import
from database.database import load_researchers
from dataset.dataset import *

# load data, models, collections
en_data = load_en_data()
en_faiss = load_en_faiss()
en_model = load_en_model()
researchers = load_researchers()

loading data
loading faiss index
loading en keyword model
Connecting to cluster


In [36]:
import pandas as pd
import numpy as np
import numpy.linalg as LA

def vector_search(query, model, index, num_results=10):
    """Tranforms query to vector using a pretrained, sentence-level 
    DistilBERT model and finds similar vectors using FAISS.
    Args:
        query (str): User query that should be more than a sentence long.
        model (sentence_transformers.SentenceTransformer.SentenceTransformer)
        index (`numpy.ndarray`): FAISS index that needs to be deserialized.
        num_results (int): Number of results to return.
    Returns:
        D (:obj:`numpy.array` of `float`): Distance between results and query.
        I (:obj:`numpy.array` of `int`): Paper ID of the results.
    
    """
    vector = model.encode(list(query))
    D, I = index.search(np.array(vector).astype("float32"), k=num_results)
    return D, I

def get_keywords(model, data, index, kw, num=10):
    ''' get a list of similar keywords

    Parameters
    ----------
        model : the model to be used for extracting keywords from
        data : the data frame that contains full information related to a given index
        kw : string querry
        num: number of resutls to be extracted (default is 20)

    Returns
    -------
        tuple of a list of strig keywords and a list of coresponding memberIDs
    
    '''

    # Load data and models
    D, I = vector_search([kw], model, index, num_results=num)
    set_kw_ids = set(data.id)
    

    kw_tar = list()
    member_id = list()
    id_type = list()

    # print(data)

    for id_ in I.flatten().tolist():
            if id_ in set_kw_ids:
                f = data[(data.id == id_)]
            else:
                continue
            kw_tar.append(f.keyword.values[0])
            member_id.append(f.memberID.values[0])
            id_type.append(f.memberID_type.values[0])

    
    df = pd.DataFrame({'kw': kw_tar, 'memid': member_id, 'idtype': id_type})
    df.memid = df.memid.astype(str)
    df.idtype = df.idtype.astype(str)
    # print(df)
    # df = df.groupby(['kw'])['memid'].apply(','.join ).reset_index()
    # print(df)

    return list(df['kw']), list(df['memid']), list(df['idtype'])

def cos_angle(a, b):
    '''calculate similarity between two given vectors
    Parameter:
    ---------
        a : N dimentional vector (list of N integers)
        b : N dimentional vector
    
    Returns
    -------
        float cosine of angle between the given vectors (a and b)
    '''

    inner = np.inner(a, b)
    norms = LA.norm(a) * LA.norm(b)
    cos = inner / norms
    rad = np.arccos(np.clip(cos, -1.0, 1.0))
    sim = np.cos(rad)
    return sim

def get_key_sim(model, data, index, user_query, threshold, num):
    '''Get graph related data for a given user query
    Parameters
    ----------
        model : the model which takes in a string and returns it's vector embedding
        data : the data that coresponds to the ids passed to the faiss index
        index : the faiss index object
        user_query : string to be looked up for
        num : number of keywords to be looked up for
        threshold : the level of similarity requrired (default is 0.5)
    
    Returns
    -------
        a dictionary containing graph data as the source, target and similarity and the map of keywords to member ids as keywords-memberIDs 
    '''
    resdf, memid, idtype = get_keywords(model, data, index , user_query, num)
    #Create a matrix
    embeddings = model.encode(resdf, show_progress_bar=True)

    src = list()
    tar = list()
    sim = list()
    thresh = threshold
    n = len(embeddings)
    m=0
    for i in range(n):
        m+=1
        for j in range(m):
            _sim = cos_angle(embeddings[i], embeddings[j])
            if i == j or _sim < thresh: continue
            src.append(resdf[i])
            tar.append(resdf[j])
            sim.append(_sim)
    return dict({
        "similarity": sim[:num],
        "memberid": memid[:num]
    })

def flatten_str_list(memid):
    return_list = list()
    for ids in memid:
        for idsplit in ids.split(','):
            return_list.append(idsplit)
    return return_list

In [37]:
kw = "machine learning"

In [38]:
ml_result = get_key_sim(en_model, en_data, en_faiss, kw, 0.1, 10)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [39]:
sim_list = ml_result['similarity']
id_list = ml_result['memberid']

In [60]:
# assign similarity to each ID
tmp_list = list()
tmp_similarity = list()
for idx in range(len(id_list)):
    try:
        for ids in id_list[idx].split(','):
            tmp_list.append(str(ids))
            tmp_similarity.append(float(sim_list[idx]))
    except:
        print("assign failed")
id_sim_list = dict({"kakenhiID": tmp_list, "similarity": tmp_similarity})

In [62]:
ids = list(set(flatten_str_list(id_list)))

In [64]:
len(id_sim_list["kakenhiID"])

107

In [65]:
select_field = {
    "$project" :
       {
           "institution" : "$institutionName.en",
           "name" : "$name.en",
           "kakenhiID": "$kakenhiID",
           "_id" : 0
       }
    }

In [66]:
id_query = {"kakenhiID": { "$in": ids }}
pipeline = [
    {"$match": {
        "$and": [
            id_query,
        ],
    }},
    select_field,
    {"$limit" :  100},
]

In [67]:
from pandas import DataFrame

In [69]:
df_similarity = DataFrame.from_dict(id_sim_list)

In [97]:
df_researcher = DataFrame(list(researchers.aggregate(pipeline)))

In [98]:
df = pd.merge(df_researcher, df_similarity, on='kakenhiID')

In [99]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66 entries, 0 to 65
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   institution  66 non-null     object 
 1   name         66 non-null     object 
 2   kakenhiID    66 non-null     object 
 3   similarity   66 non-null     float64
dtypes: float64(1), object(3)
memory usage: 2.6+ KB


In [100]:
return_data = list()
for inst in list(df['institution'].unique()):
    researcher_profiles = list()
    print(inst)
    filtered_df = df[df["institution"]==inst][['name', 'kakenhiID', 'similarity']]
    for idx in range(len(filtered_df)):
        profile_name = filtered_df.iloc[idx][0] # name
        kak_id = filtered_df.iloc[idx][1] # kakenhi ID
        sim = filtered_df.iloc[idx][2] # similarity score
        ob = {"firstName": str(profile_name), "kakenhiID": str(kak_id), "similarity": float(sim)}
        researcher_profiles.append(ob)
    print({"institution": inst, "researcherProfiles": researcher_profiles})
    return_data.append({"institution": inst, "researcherProfiles": researcher_profiles})

Osaka Prefecture University
{'institution': 'Osaka Prefecture University', 'researcherProfiles': [{'firstName': 'Roy Parthapratim', 'kakenhiID': '10837222', 'similarity': 0.7765234708786011}, {'firstName': 'Chiang YiHan', 'kakenhiID': '10824196', 'similarity': 0.7765234708786011}]}
Kanazawa University
{'institution': 'Kanazawa University', 'researcherProfiles': [{'firstName': 'Gary Ross', 'kakenhiID': '10708142', 'similarity': 0.7765234708786011}]}
The University of Aizu
{'institution': 'The University of Aizu', 'researcherProfiles': [{'firstName': 'HEO Younghyon', 'kakenhiID': '10631476', 'similarity': 0.7765234708786011}, {'firstName': 'MARKOV Konstantin', 'kakenhiID': '80394998', 'similarity': 0.7765234708786011}, {'firstName': 'Liu Yong', 'kakenhiID': '60325967', 'similarity': 0.7765234708786011}, {'firstName': 'Liu Yong', 'kakenhiID': '60325967', 'similarity': 0.7765234708786011}, {'firstName': 'Liu Yong', 'kakenhiID': '60325967', 'similarity': 0.771973729133606}, {'firstName': 'J

In [101]:
return_data

[{'institution': 'Osaka Prefecture University',
  'researcherProfiles': [{'firstName': 'Roy Parthapratim',
    'kakenhiID': '10837222',
    'similarity': 0.7765234708786011},
   {'firstName': 'Chiang YiHan',
    'kakenhiID': '10824196',
    'similarity': 0.7765234708786011}]},
 {'institution': 'Kanazawa University',
  'researcherProfiles': [{'firstName': 'Gary Ross',
    'kakenhiID': '10708142',
    'similarity': 0.7765234708786011}]},
 {'institution': 'The University of Aizu',
  'researcherProfiles': [{'firstName': 'HEO Younghyon',
    'kakenhiID': '10631476',
    'similarity': 0.7765234708786011},
   {'firstName': 'MARKOV Konstantin',
    'kakenhiID': '80394998',
    'similarity': 0.7765234708786011},
   {'firstName': 'Liu Yong',
    'kakenhiID': '60325967',
    'similarity': 0.7765234708786011},
   {'firstName': 'Liu Yong',
    'kakenhiID': '60325967',
    'similarity': 0.7765234708786011},
   {'firstName': 'Liu Yong',
    'kakenhiID': '60325967',
    'similarity': 0.771973729133606

In [48]:
return_format = [{
    "institution": "Turnabout",
    "relatedResearchers": 306,
    "researcherProfiles": [
      {
        "firstName": "Ines",
        "lastName": "Bean",
        "papers": [
          {
            "paperName":
              "Ea exercitation reprehenderit minim magna sit consequat elit occaecat qui duis veniam.",
          },
          {
            "paperName":
              "Voluptate incididunt consequat qui dolor dolore cupidatat duis do quis nulla.",
          },
          {
            "paperName":
              "Excepteur ea laboris aute mollit culpa anim Lorem sit do ipsum ullamco labore sit amet.",
          },
          {
            "paperName":
              "Est adipisicing consectetur aliqua Lorem ex sunt ex sit voluptate.",
          },
          {
            "paperName":
              "Ex anim exercitation ut nostrud et anim nostrud dolore adipisicing.",
          },
        ],
      }]
}]