In [None]:
# Import packages
import numpy as np
import urllib2
from selenium import webdriver  
from selenium.common.exceptions import NoSuchElementException  
from selenium.webdriver.common.keys import Keys  
import bs4 as bs
import time
import pickle

In this Jupyter notebook we will try to find lexical matches between doctoral projects in the Humanities and Social Sciences Group at the University of Leuven. As a first step, we will scrape the research database at <http://www.kuleuven.be/research/researchdatabase/faculty/all.htm> and compile a list of current PhD projects. Next, we will use machine learning tools to search for correspondences between different projects. The goal is to bring together PhD students from different research units who work on related topics or use similar methods.

*Disclaimer:* The data were scraped from the URLs below on Aug 23, 2016. There is no warranty that the results are accurate: they could be irrelevant, outdated, hilarious, or trivial. For some PhD students, however, they might offer useful insights. If you don't get any results, your name may not be in the database, or you may not have uploaded a title and/or abstract for your project.

# Scraping the Research Database

The faculties listed at <http://www.kuleuven.be/research/researchdatabase/faculty/all.htm> do not all belong to the Group of Humanities and Social Sciences. 

In [None]:
# Faculty IDs (as used in the research-database faculty URLs) of the
# faculties belonging to the group of humanities.
Humanities = {
    50000102,  # Faculty of Theology and Religious Studies
    50000130,  # Institute of Philosophy
    50000146,  # Faculty of Canon Law
    50000148,  # Faculty of Law
    50000208,  # Faculty of Economics and Business (FEB)
    50000243,  # Faculty of Social Sciences
    50000275,  # Faculty of Arts
    50000339,  # Faculty of Psychology and Educational Sciences
}

In [None]:
# Collect the URL of every project page linked from the faculty overview pages.
import re  # imported here: `re` is not among the notebook's top-level imports

# One project link: the project path prefix followed by the shortest run of
# characters (newlines included) up to and including the next '.htm'.
project_link = re.compile(r'/research/researchdatabase/project/.*?\.htm', re.DOTALL)

projects = []

for faculty in Humanities:
    url = 'http://www.kuleuven.be/research/researchdatabase/faculty/' + str(faculty) + '.htm'
    print(url)
    # Close the HTTP response explicitly instead of leaving it to the
    # garbage collector.
    response = urllib2.urlopen(url)
    try:
        source = response.read()
    finally:
        response.close()
    # findall() walks the whole page left to right without overlaps, so a
    # link sitting at offset 0 no longer ends the scan early.
    for path in project_link.findall(source):
        project = 'http://www.kuleuven.be' + path
        print(project)
        projects.append(project)

In [None]:
# Number of project URLs collected from the faculty pages.
print(len(projects))

`projects` is now a list containing the URLs of all individual research project pages. We will now iterate over that list and extract the relevant information from each page. We use the selenium package to read the JavaScript content rendered in the browser.

Let's do some scraping.

In [None]:
# (search, replacement) pairs, applied in this order by replace_non_ascii().
# The '\\uXXXX' / '\\xXX' entries match the *escaped* spelling (a literal
# backslash followed by the code), not the character itself.
_REPLACEMENTS = (
    ('\\u2019', "'"),
    ('\\u2018', "'"),
    ('<br/>', " "),
    ('<i>', ""),
    ('</i>', ""),
    ('\\xa0', " "),
    ('<p align="LEFT">', ""),
    ('</p>', ""),
    ('<p>', ""),
    ('&amp;', " and "),
    ('\\u201c', ""),
    ('\\u201d', ""),
    ('\\n', " "),
    ('\\t', " "),
    ('\\u2013', " "),
    ('<em>', " "),
    ('</em>', " "),
    ('<span lang="EN-US">', " "),
    ('<span>', " "),
    ('</span>', " "),
    ('\\xe0', "à"),
    ('\\u02bb', ""),
    ('\\u02bc', ""),
    ('\\u2014', " "),
    ('<\\xf6', " "),
    ('<\\xab', " "),
    ('<\\xbb', " "),
    ('<strong>', " "),
    ('</strong>', " "),
    ('\\xe8', "è"),
    ('\\xc3\\xa8', "e"),
)


def replace_non_ascii(string):
    """Strip a fixed set of HTML tags and escaped non-ASCII codes from `string`.

    Every search text in `_REPLACEMENTS` is replaced, in order, by its
    replacement; everything else is returned unchanged.

    Changes compared with the previous version:
    * the `string.format('ascii')` call is gone: it raised (or substituted
      text) whenever the input contained `{` or `}`;
    * the closing tag is matched as '</strong>' (it was '<\\strong>');
    * the quote codes u201c / u201d and the byte pair xc3 xa8 are matched in
      their escaped spelling, like all other codes in the table;
    * the duplicate '<span>' replacement is dropped (it had no effect).

    :param string: text to clean.
    :return: the cleaned text.
    """
    for old, new in _REPLACEMENTS:
        string = string.replace(old, new)
    return string

In [None]:
# Visit every project page in a browser and collect one record per project:
# [project_url, name, u_number, who_is_who, title, project_summ].
browser = webdriver.Firefox()
Projects_List = []

# Offset into `projects` at which scraping starts.
start_idx = 0

try:
    for idx, project_url in enumerate(projects[start_idx:]):
        print(str(start_idx+idx+1) + " / " + str(len(projects)))

        browser.get(project_url)
        # Fixed pause before the page source is read.
        # NOTE(review): presumably gives the page's JavaScript time to render - confirm.
        time.sleep(3)
        html_source = browser.page_source
        soup = bs.BeautifulSoup(html_source)

        # Content:
        content = soup.find_all("div", {"class": "grid spacer ng-scope"})

        # Check whether person is still listed in who-is-who:
        person_missing = 0
        for content_part in content:
            # NOTE(review): `get_text` is stringified, not called. The find()
            # offsets below search this string for markup ('href=', 'title'),
            # so it presumably embeds the tag's HTML - confirm before changing.
            contents = str(content_part.get_text)
            if contents.find('Doctorandus')>0:
                start = contents.find("href=")+6   # skip 'href=' plus one more character
                stop = contents.find("title")-2
                who_is_who = contents[start:stop]
                # A link that stops right after 'person/' carries no person id.
                if who_is_who == "http://www.kuleuven.be/wieiswie/nl/person/":
                    person_missing = 1
        if person_missing:
            print(project_url + " MISSING")
            continue
        print(project_url)

        project_summ = ""
        name = ""
        who_is_who = ""
        u_number = ""
        for content_part in content:
            contents = str(content_part.get_text)
            if contents.find('item.summary')>0:
                # Cut out the text between the summary marker and the next '</div'.
                start = contents.find('ng-bind-html="item.summary"')+28
                summary_html = contents[start:]
                stop = summary_html.find('</div')
                project_summ = summary_html[:stop]
                try:
                    project_summ = replace_non_ascii(project_summ)
                except Exception:
                    # Best effort: keep the uncleaned summary if cleaning fails.
                    pass
            elif contents.find('Doctorandus')>0:
                name = content_part.contents[1].getText()
                start = contents.find("href=")+6
                stop = contents.find("title")-2
                who_is_who = contents[start:stop]
                # Everything after 'person/' in the who-is-who link.
                u_number = who_is_who[who_is_who.find("person/")+7:]

        # Title:
        title = soup.find_all("div",{"class":"grid__12 grid--bp-med__9"})
        title = str(title)
        start = title.find('h3 class="ng-binding">')+22   # 22 == length of the marker
        title = title[start:]
        # Keep the text before the first "(", minus the character preceding it.
        stop = title.find("(")
        title = title[:stop-1]

        if (len(title)+len(project_summ))>10:
            Projects_List.append([project_url, name, u_number, who_is_who, title, project_summ])
            # writing to file after each new entry, because of server time-outs
            with open("Projects_List_2.p", "wb") as checkpoint:
                pickle.dump(Projects_List, checkpoint)
finally:
    # Close the browser even when a page load or parse step raises.
    browser.quit()
print(len(Projects_List))

# Re-format the projects

In [None]:
# Merge the separate Project-lists (separate due to server time-outs).
# Each file is opened in a `with` block so its handle is closed right away.
# NOTE: pickle.load() must only be used on files this notebook wrote itself.
with open("Projects_List_0.p", "rb") as part_file:
    Projects_0 = pickle.load(part_file)
with open("Projects_List_1.p", "rb") as part_file:
    Projects_1 = pickle.load(part_file)
with open("Projects_List_2.p", "rb") as part_file:
    Projects_2 = pickle.load(part_file)

Projects_List = Projects_0 + Projects_1 + Projects_2
with open("Projects_List.p", "wb") as merged_file:
    pickle.dump(Projects_List, merged_file)

In [None]:
# Load the merged project list and derive the two lists used further down.
import pickle
with open("Projects_List.p", "rb") as merged_file:
    Projects_List = pickle.load(merged_file)
Descriptions = [project[4]+' '+project[5] for project in Projects_List] # merge title and project description
Students = [project[1] for project in Projects_List] # list of students

In [None]:
# Spot-check one record of the merged list.
print(Projects_List[1722])

# Find lexical similarities

In [None]:
# Build a tokenized version of the descriptions
# (1) recode to ASCII, ignoring non-ASCII characters
# (2) make all text lowercase
# (3) remove punctuation (except for the "-" character)
# (4) remove stopwords from English language
# (5) remove general research-related tokens

import nltk # Natural language toolkit
import string
#print string.punctuation
import re
from nltk.stem import WordNetLemmatizer
class LemmaTokenizer(object):
    """Callable that splits a document into tokens and lemmatizes each one.

    NOTE: `word_tokenize` is only imported in a later cell
    (`from nltk import word_tokenize`), so instances can be called only
    after that import has run.
    """

    def __init__(self):
        # One lemmatizer instance, reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of `doc`."""
        lemmas = []
        for token in word_tokenize(doc):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas


from nltk.corpus import stopwords # run "nltk.download('stopwords')" once
# Generic research vocabulary; appended to `useless_words` below.
research_words = ['research', 'result', 'conclusion', 'conclude', 'show', 'investigate', 'study', 'project', 'phd',
                  'understanding', 'goal', 'aim', 'aims', 'process', 'processing', 'results', 'theory', 'effect',
                  'effects', 'test', 'testing', 'data', 'analysis', 'analyses', 'hypothesis', 'hypotheses',
                  'subject', 'subjects', 'participant', 'participants', 'role', 'variable', 'variables', 
                  'finding', 'findings', 'found', 'shows', 'showed', 'shown', 'researchers', 'significant',
                  'significance', 'discussion', 'theories', 'studies', 'chapter', 'influence', 'influences',
                  'evidence', 'studied', 'doctoral', 'thesis', 'find', 'finds', 'underlying', 'approach']
# Words dropped from every description: filler words, number words, digits,
# and the research vocabulary above.
# NOTE(review): the filter runs on whitespace-split tokens after punctuation
# has been stripped, so 'e.g.' can never match. 'goal' is also listed in
# `research_words`, and 'understaning' looks like a misspelling of
# 'understanding' (which `research_words` already contains).
useless_words = ['however', 'whether', 'recent', 'other', 'use', 'imply', 'current', 'currently', 'aspect',
                 'aspects', 'new', 'field', 'versus', 'also', 'possibility', 'towards', 'thus', 'hence', 'as', 
                 'general', 'using', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
                 'ten', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'discuss', 'discussed', 'possible', 
                 'important', 'importance', 'understand', 'understaning', 'used', 'with', 'without', 'would', 'will', 
                 'many', 'most', 'previous', 'present', 'among', 'common', 'described', 'presented', 'reflect',
                 'reflects', 'vs', 'get', 'gets', 'getting', 'back', 'main', 'although', 'may', 'account', 'therefore',
                 'upon', 'eg', 'e.g.', 'usually', 'despite', 'certain', 'seem', 'seems', 'obvious', 'related',
                 'must', 'within',
                 'differ', 'different', 'could', 'clearly', 'depend', 'depends', 'way', 'propose', 'high',
                 'low', 'specific', 'indeed', 'furthermore', 'afterwards', 'allow', 'us', 'around', 'others',
                 'particular', 'de', 'en', 'het', 'together', 'along', 'goal', 'goals', 'nevertheless'] + research_words

# Characters stripped from every description: all ASCII punctuation except "-".
punctuation_to_strip = string.punctuation.replace("-", "")

# Punctuation is removed from the text *before* stopwords are removed, so the
# stopwords get the same treatment; otherwise an entry containing an
# apostrophe could never match the stripped text. Empty results are dropped
# because an empty alternative would match at every word boundary.
stopword_forms = set()
for stopword in stopwords.words('english'):
    stripped = ''.join(ch for ch in stopword if ch not in punctuation_to_strip)
    if stripped:
        stopword_forms.add(stripped)

# Matches one whole stopword plus the whitespace that follows it.
stopwords_rem = re.compile(r'\b(' + r'|'.join(sorted(stopword_forms)) + r')\b\s*')

# Set lookup instead of scanning the list once per token.
useless_lookup = frozenset(useless_words)

# Maps the index of each description to its cleaned, space-joined tokens.
token_dict = {}
for ind, description in enumerate(Descriptions):
    # Step (1) of the cell header (recoding to ASCII) is currently disabled:
    #description = description.encode('ascii', 'ignore')
    lowers = description.lower()
    no_punctuation = lowers.translate(None, punctuation_to_strip)
    no_stopwords = stopwords_rem.sub('', no_punctuation)
    token_dict[ind] = ' '.join(word for word in no_stopwords.split() if word not in useless_lookup)

In [None]:
#nltk.download('wordnet')
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
class LemmaTokenizer(object):
    """Callable that splits a document into tokens and lemmatizes each one."""

    def __init__(self):
        # One lemmatizer instance, reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of `doc`."""
        lemmas = []
        for token in word_tokenize(doc):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

# Run the TF-IDF algorithm on the tokened descriptions
# TF-IDF = "term frequency–inverse document frequency"
tfidf = TfidfVectorizer(tokenizer=LemmaTokenizer())
# Feed the documents in key order so that row i of `tfs` is description i.
# Later cells index `tfs`, `Students` and `Projects_List` with the same
# position, so this must not depend on the iteration order of the dict.
tfs = tfidf.fit_transform([token_dict[key] for key in sorted(token_dict)])

This results in a matrix in CSR (compressed sparse row) format. The rows are the original descriptions; the columns are the features (i.e., tokens).

In [None]:
# Token names of the fitted vectorizer; used below to translate column
# indices of `tfs` back into tokens.
feature_names = tfidf.get_feature_names()
#print feature_names[26073]
print(len(feature_names))

In [None]:
# Print the (at most) ten highest-weighted tokens of one student's description.
student_id = 1129
student_vector = tfs[student_id]
# Positions into the row's stored values, highest weight first.
max_loads = np.argsort(student_vector.data)[::-1]
print('Most important "tokens" for PhD student '+Students[student_id]+':')
for position in max_loads[:10]:
    print(feature_names[student_vector.indices[position]])

# Compare documents

To compare the 2000+ documents, we can compare their tfs vectors. To find the **cosine similarities** between one document *D* and all of the others, we need to compute the dot products of the *D*-th vector with all of the other vectors (the tfs vectors are already row-normalized, so the dot product equals the cosine similarity). To get the first vector, we need to slice the matrix row-wise, which yields a submatrix with a single row:

scikit-learn provides pairwise metrics that work for both dense and sparse representations of vector collections. In this case we need a dot product that is also known as the linear kernel:

In [None]:
# Number of best-matching projects reported for each student.
n_matches = 3

In [None]:
from sklearn.metrics.pairwise import linear_kernel

In [None]:
from sklearn.metrics.pairwise import linear_kernel
# Dot product of every description vector with every other one; each row is
# immediately replaced by its column indices in ascending order of similarity,
# so the best matches sit at the end of each row.
cosine_similarities_all = linear_kernel(tfs, tfs).argsort()
# Note: the argsort returns one student more than once as the best
# corresponding one (missing values for specific students)

In [None]:
# Spot-check one ranking and one project URL.
# NOTE: `Top_K` is assigned in the following cell, so that cell must run first.
print(Top_K[1129])
print(Projects_List[1060][0])

In [None]:
# For every row keep the last n_matches+1 indices of the ascending ranking,
# most similar first.
Top_K = [ranking[::-1][:n_matches+1] for ranking in cosine_similarities_all]

In [None]:
# Print every ranking whose top (last) entry is the selected student.
for ranking in cosine_similarities_all:
    if ranking[-1] != student_id:
        continue
    print(ranking)

In [None]:
# Layout of one Projects_List record:
# 0 = project url
# 1 = student name
# 2 = student number
# 3 = student url
# 4 = title
# 5 = abstract

# Build one output row per student:
# [student number, name, url, then (name, project url, title) per match].
all_students = []
for student_idx, ranked in enumerate(Top_K):
    # Row i of the ranking belongs to student i. The top-ranked index is not
    # always the student itself (ties, descriptions without tokens), so take
    # the student from the row position and drop it from the candidates
    # instead of assuming it comes first.
    matches = [int(candidate) for candidate in ranked if candidate != student_idx][:n_matches]

    student = Projects_List[student_idx]
    new_row = [student[2].lower(), student[1].title(), student[3]]
    for match_idx in matches:
        match = Projects_List[match_idx]
        new_row.extend([match[1].title(), match[0], match[4]])
    all_students.append(new_row)

In [None]:
# Spot-check one output row.
print(all_students[1021])

Hence to find the top 3 related documents, we can use argsort and some negative array slicing (most related documents have highest cosine similarity values, hence at the end of the sorted indices array):

In [None]:
# Bare expression: shows the row as the cell's output value.
all_students[1021]

In [None]:
# Write one CSV row per student to output.csv.
import sys  # kept for any later cell that relies on it; no longer used here
import csv


def _to_utf8(cell):
    """Return `cell` as a UTF-8 byte string when it is unicode, else unchanged."""
    if isinstance(cell, unicode):
        return cell.encode('utf-8')
    return cell


# The Python 2 csv writer only handles byte strings, so unicode cells are
# encoded explicitly. This replaces the former
# `reload(sys); sys.setdefaultencoding('utf8')` hack, which changed the
# interpreter-wide default encoding to get the same bytes written.
with open("output.csv", "wb") as f:
    writer = csv.writer(f)
    writer.writerows([[_to_utf8(cell) for cell in row] for row in all_students])