# Introduction

![Expert-Document graph](https://github.com/Yongbinkang/ExpFinder/blob/main/images/restricted_graph.png?raw=true)
<center>
    Figure 1: <i>The graph showing the relationship between experts, documents and expertise topics</i>
</center>

In this notebook, we demonstrate the entire ExpFinder process on a restricted example, as shown in Figure 1. In this example, we impose the following restrictions:

1. $e1$ has expertise in NLP and its applications in healthcare, based on the associated documents ($d1$ and $d2$).
2. $e2$ has expertise in three different areas, namely NLP, computer vision and their applications in healthcare, based on the associated documents ($d1$ and $d3$).
3. $e3$ has expertise only in NLP because he/she is associated with only one document, $d2$.

# Libraries

In [2]:
import pandas as pd
import numpy as np
from ast import literal_eval
from transformers import BertTokenizer, BertModel
import networkx as nx

from src.controller import generator, trainer
from src.lib import extractor

# Data preparation 

In [3]:
# Path prefix of the input files. prepare_data() prepends it verbatim to each
# file name, so the trailing slash is required.
DATA_PATH = './data/'

In [4]:
def prepare_data(data_path=None):
    ''' Read the documents, expert-document links, stopwords and topics
    from the existing data source.

    Args:
        data_path: Prefix prepended verbatim to every file name, so a
            directory must end with a path separator. When omitted, the
            module-level DATA_PATH is used.

    Returns:
        A tuple (doc_df, ed_df, stopwords, topics): doc_df is read from
        'raw_data.csv', ed_df from 'ep_df.csv', and stopwords / topics are
        the Python literals stored in 'stopword.txt' and 'topics.txt'.
    '''
    # Reading a module-level name needs no `global` declaration; the lookup
    # is deferred to call time so callers can pass their own location.
    if data_path is None:
        data_path = DATA_PATH

    doc_df = pd.read_csv('{}raw_data.csv'.format(data_path))
    ed_df = pd.read_csv('{}ep_df.csv'.format(data_path))

    # Both text files hold a Python literal; literal_eval parses literals
    # only and never executes arbitrary code from the file.
    with open('{}stopword.txt'.format(data_path)) as f:
        stopwords = literal_eval(f.read())

    with open('{}topics.txt'.format(data_path)) as f:
        topics = literal_eval(f.read())

    return doc_df, ed_df, stopwords, topics

In [5]:
# Load the document table, the expert-document table, the stopwords and the topics
doc_df, ed_df, stopwords, topics = prepare_data()

In [6]:
# Inspect the two loaded tables: documents first, then expert-document links
display(doc_df)
display(ed_df)

Unnamed: 0,doc_id,text
0,d1,A prerequisite for using electronic health rec...
1,d2,"Word embedding, where semantic and syntactic f..."
2,d3,Structural health monitoring at local and glob...


Unnamed: 0,doc_id,exp_id,weight
0,d1,e1,1
1,d1,e2,1
2,d2,e1,1
3,d2,e3,1
4,d3,e2,1


# Data generation

## Expert-document matrix

In [7]:
# Build the expert-document matrix from the expert-document table
ed_matrix = generator.generate_ed_matrix(ed_df)

In [8]:
# Inspect the expert-document matrix
display(ed_matrix)

Unnamed: 0,d1,d2,d3
e1,1,1,0
e2,1,0,1
e3,0,1,0


## Document-phrase matrix

In [9]:
def dp_pipeline(doc_df, stopwords):
    ''' Run the pipeline that builds the document-phrase matrix and
    return it as a dense DataFrame '''
    # Tokenise the raw texts into tokens and noun phrases
    tokenised = extractor.tokenise_doc(doc_df['text'].values, stopwords,
                                       max_phrase_len=3)

    # Term frequencies of the single tokens and of the noun phrases
    token_tf = generator.generate_tf(tokenised['tokens'])
    phrase_tf = generator.generate_tf(tokenised['np'])

    # Combine both frequency tables into the document-phrase matrix
    result = generator.generate_dp_matrix(token_tf, phrase_tf,
                                          doc_df['doc_id'], method="indirect")

    # Densify the matrix and attach the labels supplied by the generator
    dense = result['matrix'].todense()
    return pd.DataFrame(dense, index=result['index'],
                        columns=result['columns'])

In [10]:
# Build the document-phrase matrix (returned as a DataFrame by dp_pipeline)
dp_matrix = dp_pipeline(doc_df, stopwords)

In [11]:
# Inspect the document-phrase matrix
display(dp_matrix)

Unnamed: 0_level_0,access,application_advantage,approach,article,background,basic_procedure,big_data-empowered_clinical,capability,community,complement,...,real-time_nlp_processing,real_time,relevant_literature,share,structural_health_monitoring,structure,syntactic_feature,unlabeled_text_data,vision_technology,word_embedding
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
d1,1.693147,0.0,0.0,0.0,0.0,0.0,1.693147,1.693147,0.0,0.0,...,2.821912,1.693147,0.0,1.693147,0.564382,0.0,0.0,1.128765,0.0,0.0
d2,0.0,0.0,0.0,0.0,1.693147,1.693147,0.0,0.0,0.0,0.0,...,1.128765,0.0,0.0,0.0,0.0,0.0,1.693147,1.693147,0.0,3.386294
d3,0.0,1.693147,1.693147,1.693147,0.0,0.0,0.0,0.0,1.693147,1.693147,...,0.0,0.0,1.693147,0.0,8.465735,1.693147,0.0,0.0,5.079442,0.0


## Document-topic matrix

In [12]:
# Path prefix of the model files. dtopic_pipeline() appends
# 'scibert_scivocab_uncased' to it, so the trailing slash is required.
MODEL_PATH = './model/'

In [13]:
def dtopic_pipeline(dp_matrix, topics):
    ''' Run the pipeline that generates the document-topic matrix.

    Returns the document-topic matrix as a dense DataFrame, the topic
    vector derived from it, and the second value returned by
    generator.generate_dtop_matrix (the topic-phrase result). '''
    # Load the SciBERT model and its tokenizer from the same local folder
    scibert_dir = '{}scibert_scivocab_uncased'.format(MODEL_PATH)
    bert = BertModel.from_pretrained(scibert_dir)
    bert_tokenizer = BertTokenizer.from_pretrained(scibert_dir)

    # Bundle what the generator needs. Pretrained phrase vectors would be
    # loaded here; this example ships none, hence None.
    model_dict = {
        'model': bert,
        'tokenizer': bert_tokenizer,
        'trained_vectors': None
    }

    # Document-topic matrix plus the topic-phrase result
    raw_dtopic, topic_phrase = generator.generate_dtop_matrix(
        dp_matrix, topics, model_dict, top_n=1)

    # The topic vector is computed from the raw generator output, before
    # it is converted into a DataFrame
    topic_vec = generator.generate_topic_vector(raw_dtopic)

    dtopic_df = pd.DataFrame(raw_dtopic['matrix'].todense(),
                             index=raw_dtopic['index'],
                             columns=raw_dtopic['columns'])

    return dtopic_df, topic_vec, topic_phrase

In [14]:
# Build the document-topic matrix, the topic vector and the topic-phrase result
dtopic_matrix, topic_vec, topic_phrase = dtopic_pipeline(dp_matrix, topics)

In [15]:
# Inspect the topic-phrase result returned by dtopic_pipeline
display(topic_phrase)

{'health application': [('health_monitoring', 0.831919)],
 'natural language processing': [('natural_language_processing', 0.9999999)],
 'computer vision': [('computer_vision_technology', 0.84557104)]}

In [16]:
# Inspect the document-topic matrix, then the topic vector
display(dtopic_matrix)
display(topic_vec)

Unnamed: 0_level_0,health application,natural language processing,computer vision
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
d1,0.32207,0.67793,0.0
d2,0.0,1.0,0.0
d3,0.6,0.0,0.4


Unnamed: 0,weights
health application,0.307357
natural language processing,0.55931
computer vision,0.133333


## Personalised matrices

In [17]:
def personalised_pipeline(ed_df, ed_matrix, dtopic_matrix, topic_vec):
    ''' Run the pipeline that generates the personalised matrices.

    Returns the expert-topic and document-topic matrices as dense
    DataFrames, together with the expert-document graph built from ed_df. '''
    def _as_frame(mat):
        # Densify a generator result and attach its row/column labels
        return pd.DataFrame(mat['matrix'].todense(),
                            index=mat['index'],
                            columns=mat['columns'])

    # Expert-document graph
    graph = generator.generate_ecg(ed_df)

    # Personalised matrices, using the raw values of the 'weights' column
    topic_weights = topic_vec['weights'].values
    exp_raw, doc_raw = generator.generate_pr_matrix(
        ed_matrix, dtopic_matrix, topic_weights, graph, alpha=0.0)

    return _as_frame(exp_raw), _as_frame(doc_raw), graph

In [18]:
# personalised_pipeline returns the expert-topic DataFrame, the document-topic
# DataFrame and the expert-document graph, in that order
exp_pr_df, doc_pr_df, ed_graph = personalised_pipeline(ed_df, ed_matrix, dtopic_matrix, topic_vec)

In [19]:
# Show the rows labelled e1-e3 of the expert matrix and the rows labelled
# d1-d3 of the document matrix, in that fixed order
display(exp_pr_df.loc[['e1', 'e2', 'e3']])
display(doc_pr_df.loc[['d1', 'd2', 'd3']])

Unnamed: 0,health application,natural language processing,computer vision
e1,0.32207,1.67793,0.0
e2,0.92207,0.67793,0.4
e3,0.0,1.0,0.0


Unnamed: 0,health application,natural language processing,computer vision
d1,0.32207,0.67793,0.0
d2,0.0,1.0,0.0
d3,0.6,0.0,0.4


## Counted vectors

In [20]:
def cv_pipeline(ed_matrix, ed_graph):
    ''' Generate the counted vectors from the expert-document matrix and
    graph, returned as the pair (expert vector, document vector) '''
    exp_counts, doc_counts = generator.generate_ed_vector(ed_matrix, ed_graph)

    return exp_counts, doc_counts

In [21]:
# cv_pipeline returns (exp_vec, doc_vec); they are bound here as ed_count and de_count
ed_count, de_count = cv_pipeline(ed_matrix, ed_graph)

In [22]:
# Inspect both counted vectors
display(ed_count)
display(de_count)

Unnamed: 0,count
d1,1
e1,2
e2,2
d2,1
e3,1
d3,1


Unnamed: 0,count
d1,2
e1,1
e2,1
d2,2
e3,1
d3,1


# ExpFinder algorithm

In [23]:
def ef_pipeline(ed_matrix, ed_graph, exp_pr_df, doc_pr_df, ed_count, de_count):
    ''' Run the ExpFinder model over all topics and display the result.

    The topics are the columns of doc_pr_df. Nothing is returned; the
    matrix produced by trainer.run_expfinder is only displayed. '''
    # Inputs and fixed settings handed to the trainer
    settings = {
        'ed_graph': ed_graph,
        'ed_matrix': ed_matrix,
        'et_matrix': exp_pr_df,
        'dt_matrix': doc_pr_df,
        'lamb_e': 1.0,
        'lamb_d': 0.7,
        'max_iter': 5,
        'ed_count': ed_count,
        'de_count': de_count
    }

    # Run the model on every topic column and show the outcome
    result = trainer.run_expfinder(doc_pr_df.columns, settings)
    display(result)

In [24]:
# Run ExpFinder; ef_pipeline displays the resulting matrix itself and returns nothing
ef_pipeline(ed_matrix, ed_graph, exp_pr_df, doc_pr_df, ed_count, de_count)

Unnamed: 0,e1,e2,e3
health application,0.571502,0.611617,0.547092
natural language processing,0.581399,0.549855,0.599696
computer vision,0.570875,0.608666,0.551024
