## Install Required Packages

In [1]:
!pip install -q pyspellchecker==0.8.1

## Import Libraries & Packages

In [2]:
import pandas as pd
import numpy as np
# import sklearn
import matplotlib.pyplot as plt
import re
import nltk
import gensim.downloader as dl_api
import os
import time


from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from nltk.corpus import stopwords
from spellchecker import SpellChecker
# from gensim.test.utils import common_texts
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download
from google.colab import userdata


nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load and Preprocess the Dataset

In [3]:
# Open the workbook once; the sheets are read from this handle in the next cell.
workbook_path = '/content/university_data.xlsx'
xls_file = pd.ExcelFile(workbook_path)
xls_file.sheet_names  # last expression -> shown as the cell output

['Students', 'Professors']

In [4]:
# Sheet order comes from xls_file.sheet_names: index 0 -> students, index 1 -> professors.
# dropna() returns a new frame, so re-running this cell always starts from the raw sheet.
df_profs = pd.read_excel(xls_file, sheet_name=xls_file.sheet_names[1]).dropna(axis=0)
df_students = pd.read_excel(xls_file, sheet_name=xls_file.sheet_names[0]).dropna(axis=0)

# sanity check over null values
# (`df.all().all()` tests truthiness and skips NaN by default, so it cannot detect
# nulls and would fail on legitimate falsy values such as 0 or ''.)
assert not df_profs.isna().any().any(), 'null values left in the professors sheet'
assert not df_students.isna().any().any(), 'null values left in the students sheet'

In [None]:
df_profs.head()

Unnamed: 0,Professor GUID,Name,Research Interests,University Field
0,ee1d5410-3fd8-444e-b847-b6d7b41137fc,Thomas Hill,"Astrophysics, Particle Physics, Statistical Me...",Physics
1,44560b3d-0f89-4cc1-85a7-a660f5cd7ae6,Dwayne Branch,"Biopsychology, Social Psychology, Psychometric...",Psychology
2,2dcd859c-9c81-40f8-a2df-89471cfa3e85,Heather Swanson,"Forensic Psychology, Clinical Psychology, Deve...",Psychology
3,f335bcda-e4de-4d10-9c68-f76bbd0e6968,Thomas Ashley,"Statistics, Probability, Topology, Mathematica...",Mathematics
4,de9922e1-a6a9-4f05-84ba-1ee651398ced,Pamela Mcmillan,"Environmental Engineering, Systems Engineering...",Engineering


In [None]:
df_students.head()

Unnamed: 0,Student GUID,Name,Research Interests,University Field
0,c1c5fc27-048a-4fad-b32a-57b6613f5c6d,Daniel Cain,"Photonics, Cosmology, Theoretical Physics, Exp...",Physics
1,ead3d7a5-bddc-4ad1-ab55-4db006731802,Amy Potter,"Cognitive Psychology, Developmental Psychology...",Psychology
2,c6f1e6d1-21fe-4daa-a022-ff9e0f4fd957,Jessica Collins,"Materials Science, Physical Chemistry, Inorgan...",Chemistry
3,3e19f76f-46b4-46c4-a489-36053fd8d79e,Maria Singh,"Economic History, History of Science, Military...",History
4,31bbb063-8dae-4e81-97c4-456e8df9af33,James Thomas,"Geometry, Mathematical Physics, Statistics, Al...",Mathematics


In [31]:
def str_to_list(input_str):
    """Split a raw interests string into its individual items.

    Items are separated by ', ', '; ' or a bare '-'. The pieces are returned
    exactly as they appear between separators (no trimming, no case change).
    """
    separator_pattern = ', |; |-'
    pieces = re.split(separator_pattern, input_str)
    return list(pieces)


def normalize(input_list):
    """Return the items lower-cased and with surrounding whitespace removed."""
    cleaned = []
    for item in input_list:
        lowered = item.lower()
        cleaned.append(lowered.strip())
    return cleaned


# Stop-word sets already loaded, keyed by language name.
_STOP_WORDS_BY_LANGUAGE = {}


def _stop_words_for(language):
    """Return the stop words of `language` as a frozenset, loading them only once."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def remove_stop_words_from(input_list):  # remove stop words like 'of', 'and'
    """Drop English stop words from every phrase, keeping the phrase structure.

    Args:
        input_list: iterable of phrases (strings); words are separated by whitespace.

    Returns:
        list of phrases in the same order, each re-joined with single spaces.
        A phrase made only of stop words becomes the empty string.
    """
    # The word list is fetched once and stored as a set: this function runs once
    # per row, so re-reading the list and scanning it linearly adds up quickly.
    stop_words = _stop_words_for('english')
    return [
        ' '.join(word for word in phrase.split() if word not in stop_words)
        for phrase in input_list
    ]

### TODO: use local vocab to make the dictionary more robust
def check_spells_in(input_list):  # check for any typos
    """Print every word the spell checker does not know, with its best correction.

    View-only helper: nothing is returned and the input is not modified.

    Args:
        input_list: iterable of phrases (strings); each phrase is split on
            whitespace and checked word by word.
    """
    speller = SpellChecker(language='en', distance=2, case_sensitive=False)
    for phrase in input_list:
        # Report each unknown word with its own correction. Sorting keeps the
        # report order stable from run to run.
        for word in sorted(speller.unknown(phrase.split())):
            print('unknown word:', word)
            print('best correction:', speller.correction(word))


def split_into_tokens(input_list):
    """Flatten a list of phrases into one list of whitespace-separated words."""
    tokens = []
    for phrase in input_list:
        tokens.extend(phrase.split())
    return tokens


### TODO: remove punctuations, numbers, links
def preprocess(input_str):
    """Turn one raw 'Research Interests' string into a flat list of word tokens.

    Stages, in order: split into items, lower-case and strip them, remove stop
    words, then break every remaining phrase into single words.
    """
    phrases = remove_stop_words_from(normalize(str_to_list(input_str)))
    # check_spells_in(phrases)  # view-only; no output yet
    return split_into_tokens(phrases)


# Add the preprocessed research interests as a new column on both tables.
for frame in (df_profs, df_students):
    frame['Tokenized RIs'] = frame['Research Interests'].apply(preprocess)

In [32]:
df_profs.head()

Unnamed: 0,Professor GUID,Name,Research Interests,University Field,Tokenized RIs
0,ee1d5410-3fd8-444e-b847-b6d7b41137fc,Thomas Hill,"Astrophysics, Particle Physics, Statistical Me...",Physics,"[astrophysics, particle, physics, statistical,..."
1,44560b3d-0f89-4cc1-85a7-a660f5cd7ae6,Dwayne Branch,"Biopsychology, Social Psychology, Psychometric...",Psychology,"[biopsychology, social, psychology, psychometr..."
2,2dcd859c-9c81-40f8-a2df-89471cfa3e85,Heather Swanson,"Forensic Psychology, Clinical Psychology, Deve...",Psychology,"[forensic, psychology, clinical, psychology, d..."
3,f335bcda-e4de-4d10-9c68-f76bbd0e6968,Thomas Ashley,"Statistics, Probability, Topology, Mathematica...",Mathematics,"[statistics, probability, topology, mathematic..."
4,de9922e1-a6a9-4f05-84ba-1ee651398ced,Pamela Mcmillan,"Environmental Engineering, Systems Engineering...",Engineering,"[environmental, engineering, systems, engineer..."


## Method 1: Using Gensim Pretrained Models; GloVe, FastText, Word2Vec

In [7]:
available_corpora = dl_api.info()['models']

# see all corpora and their file sizes (entries whose name starts with '_' are skipped)
bytes_per_mb = 1024.0 * 1024.0
for name, metadata in available_corpora.items():
    if name.startswith('_'):
        continue
    size_mb = round(metadata['file_size'] / bytes_per_mb)
    print(f"name: {name}, size: {size_mb} MB")

name: fasttext-wiki-news-subwords-300, size: 958 MB
name: conceptnet-numberbatch-17-06-300, size: 1169 MB
name: word2vec-ruscorpora-300, size: 199 MB
name: word2vec-google-news-300, size: 1663 MB
name: glove-wiki-gigaword-50, size: 66 MB
name: glove-wiki-gigaword-100, size: 128 MB
name: glove-wiki-gigaword-200, size: 252 MB
name: glove-wiki-gigaword-300, size: 376 MB
name: glove-twitter-25, size: 105 MB
name: glove-twitter-50, size: 200 MB
name: glove-twitter-100, size: 387 MB
name: glove-twitter-200, size: 759 MB


In [10]:
# Pretrained vectors used by the Method 1 cells.
model_name = 'fasttext-wiki-news-subwords-300'
pretrained_model = dl_api.load(model_name)

In [None]:
# NOTE(review): gensim's KeyedVectors API documents no `subwording` attribute, so a
# plain attribute access here may raise and stop "Run All". Read it defensively:
# the value is displayed when present, nothing is displayed otherwise.
getattr(pretrained_model, 'subwording', None)

In [12]:
# sanity check: all tokens (words) should be already present in the model vocab
# (every missing word is printed; students are scanned first, then professors)
for frame in (df_students, df_profs):
    for interests in frame['Tokenized RIs']:
        for interest in interests:
            for word in interest.split():
                if word not in pretrained_model:
                    print(word)

## Method 2: Using Huggingface Pretrained Word2Vec Model, trained on Wikipedia Corpus

In [None]:
cache_dir = '/content/cache/'
os.makedirs(cache_dir, exist_ok=True)

# Colab secret; it is passed to the Hub download below so the request is authenticated.
HF_TOKEN = userdata.get('HF_TOKEN')

repo_name = 'Word2vec/wikipedia2vec_enwiki_20180420_100d'
file_name = 'enwiki_20180420_100d.txt'

# hf_hub_download returns the local path of the downloaded file.
raw_model = hf_hub_download(repo_id=repo_name, filename=file_name,
                            cache_dir=cache_dir, token=HF_TOKEN)
# NOTE(review): this rebinds `pretrained_model`; cells that run afterwards use this
# model instead of the gensim one loaded for Method 1 — confirm that is intended.
pretrained_model = KeyedVectors.load_word2vec_format(raw_model)

enwiki_20180420_100d.txt:   0%|          | 0.00/3.49G [00:00<?, ?B/s]

In [None]:
# sanity check: all tokens (words) should be already present in the model vocab
# (every missing word is printed; students are scanned first, then professors)
for frame in (df_students, df_profs):
    for interests in frame['Tokenized RIs']:
        for interest in interests:
            for word in interest.split():
                if word not in pretrained_model:
                    print(word)

## Method 1.1: Using Data As-Is (Split All into Single-word Tokens)

In [34]:
def find_closest_profs_to(student_idx, students, professors, topn=1):
    """Rank professors by mean word distance to one student's interests.

    Each student token is compared with all of a professor's tokens through
    ``pretrained_model.distances``. The per-token mean distances are then
    averaged into one score per professor (lower means closer).

    Args:
        student_idx: index label of the student's row in ``students``.
        students: DataFrame with a 'Tokenized RIs' column (one token list per row).
        professors: DataFrame with a 'Tokenized RIs' column (one token list per row).
        topn: maximum number of professors to return.

    Returns:
        list of professor rows (``professors.loc[label]``), closest first.
        Professors without any token are left out of the ranking.

    Raises:
        ValueError: if the selected student has no tokens to compare.
    """
    student_ris = students.loc[student_idx, 'Tokenized RIs']
    if len(student_ris) == 0:
        raise ValueError(f'student {student_idx!r} has no research-interest tokens')

    avg_dists = []
    # iterate over each professor and their research interests
    for prof_id, prof_ris in professors['Tokenized RIs'].items():
        if len(prof_ris) == 0:
            # Per the gensim docs, distances() with no comparison words measures
            # against the whole vocabulary, which would give a meaningless score.
            continue
        # TODO: use weighted averaging if the order of interests matter (first>>last)
        total = sum(pretrained_model.distances(student_ri, prof_ris).mean()
                    for student_ri in student_ris)
        avg_dists.append((prof_id, total / len(student_ris)))

    # smallest average distance first
    sorted_dists = sorted(avg_dists, key=lambda pair: pair[1])
    return [professors.loc[prof_id] for prof_id, _ in sorted_dists[:topn]]


idx = 7820  # index label of the student used for this demo
for label, column in (('Student Name:', 'Name'),
                      ('Student Research Interests:', 'Research Interests'),
                      ('Student Department:', 'University Field')):
    print(label, df_students[column][idx])

separator = '----------------------------'
print(separator)
print('searching for professors ...')
st = time.time()
most_similar_profs = find_closest_profs_to(idx, df_students, df_profs, topn=5)
print('search done!')
elapsed_secs = round(time.time() - st)
print('elapsed time:', elapsed_secs, 'secs')
print(separator)

# one short report per returned professor, best match first
for prof_idx, top_prof in enumerate(most_similar_profs):
    print('******')
    print(f"Best Professor #{prof_idx+1}: {top_prof['Name']}")
    print(f"Prof Research Focus: {top_prof['Research Interests']}")
    print(f"Prof Faculty: {top_prof['University Field']}")

Student Name: Gary Garza
Student Research Interests: World History, Social History, Medieval History, Modern History, History of Science, Ancient History, Cultural History, Economic History, Military History, Political History
Student Department: History
----------------------------
searching for professors ...
search done!
elapsed time: 11 secs
----------------------------
******
Best Professor #1: Anna Adkins
Prof Research Focus: Economic History, Military History, History of Science, Social History, Cultural History, Ancient History, World History, Modern History, Political History, Medieval History
Prof Faculty: History
******
Best Professor #2: Stacey Wheeler
Prof Research Focus: World History, Cultural History, Economic History, Military History, History of Science, Ancient History, Social History, Modern History, Political History, Medieval History
Prof Faculty: History
******
Best Professor #3: Carolyn Beltran
Prof Research Focus: Medieval History, Cultural History, Political H

## Method 1.2: Using Mean Vectors for Unknown Phrases, based on Custom Mapping

In [27]:
# create custom mapping over word combinations not available in pretrained model
def create_embedding_map_for(dataframe, col_name, map_dict):
    """Add a mean vector to `map_dict` for every multi-word entry in a column.

    Args:
        dataframe: DataFrame whose `col_name` column holds a list of strings per row.
        col_name: name of that column.
        map_dict: dict to fill (phrase -> vector); it is updated in place.

    Returns:
        The same `map_dict` object, for convenient chaining.
    """
    for ris in dataframe[col_name]:
        for ri in ris:
            if ri in map_dict:
                # Phrases repeat across rows; compute each mean vector only once.
                continue
            words = ri.split()
            if len(words) > 1:  # multiple-worded phrase only
                map_dict[ri] = pretrained_model.get_mean_vector(words)
    return map_dict


# Build one shared phrase -> vector map from both tables (students first).
# NOTE(review): only entries with more than one word are mapped. If 'Tokenized RIs'
# holds single-word tokens, the map stays empty — confirm what the column contains.
ri_map = {}
for frame in (df_students, df_profs):
    ri_map = create_embedding_map_for(frame, col_name='Tokenized RIs', map_dict=ri_map)

print(f'there are {len(ri_map)} unique combination of words in the dataset.')

there are 71 unique combination of words in the dataset.


In [28]:
def generate_vectors_from(words):
    """Return one vector per entry, in input order.

    Entries containing more than one word are read from ``ri_map``; single
    words are looked up directly in ``pretrained_model``.
    """
    return [
        ri_map[entry] if len(entry.split()) > 1 else pretrained_model.get_vector(entry)
        for entry in words
    ]


def find_closest_profs_to(student_idx, students, professors, topn=1):
    """Rank professors by mean cosine similarity to one student's interests.

    Interests are converted to vectors with ``generate_vectors_from``. Each
    student vector is compared with all of a professor's vectors through
    ``pretrained_model.cosine_similarities``. The per-interest mean similarities
    are then averaged into one score per professor (higher means closer).

    Args:
        student_idx: index label of the student's row in ``students``.
        students: DataFrame with a 'Tokenized RIs' column (one list per row).
        professors: DataFrame with a 'Tokenized RIs' column (one list per row).
        topn: maximum number of professors to return.

    Returns:
        list of professor rows (``professors.loc[label]``), most similar first.
        Professors without any interest entry are left out of the ranking.

    Raises:
        ValueError: if the selected student has no interests to compare.
    """
    student_ris = students.loc[student_idx, 'Tokenized RIs']
    if len(student_ris) == 0:
        raise ValueError(f'student {student_idx!r} has no research-interest tokens')
    student_vecs = generate_vectors_from(student_ris)  # get vectors for student interests

    avg_sims = []
    # iterate over each professor and their interests
    for prof_id, prof_ris in professors['Tokenized RIs'].items():
        if len(prof_ris) == 0:
            continue  # nothing to compare the student against
        prof_vecs = generate_vectors_from(prof_ris)
        # mean similarity of each student interest to all of this professor's interests
        total = sum(pretrained_model.cosine_similarities(student_vec, prof_vecs).mean()
                    for student_vec in student_vecs)
        avg_sims.append((prof_id, total / len(student_vecs)))

    # highest average similarity first
    sorted_sims = sorted(avg_sims, key=lambda pair: pair[1], reverse=True)
    return [professors.loc[prof_id] for prof_id, _ in sorted_sims[:topn]]


idx = 7820  # index label of the student used for this demo
for label, column in (('Student Name:', 'Name'),
                      ('Student Research Interests:', 'Research Interests'),
                      ('Student Department:', 'University Field')):
    print(label, df_students[column][idx])

separator = '----------------------------'
print(separator)
print('searching for professors ...')
st = time.time()
most_similar_profs = find_closest_profs_to(idx, df_students, df_profs, topn=5)
print('search done!')
elapsed_secs = round(time.time() - st)
print('elapsed time:', elapsed_secs, 'secs')
print(separator)

# one short report per returned professor, best match first
for prof_idx, top_prof in enumerate(most_similar_profs):
    print('******')
    print(f"Best Professor #{prof_idx+1}: {top_prof['Name']}")
    print(f"Prof Research Focus: {top_prof['Research Interests']}")
    print(f"Prof Faculty: {top_prof['University Field']}")

Student Name: Gary Garza
Student Research Interests: World History, Social History, Medieval History, Modern History, History of Science, Ancient History, Cultural History, Economic History, Military History, Political History
Student Department: History
----------------------------
searching for professors ...
search done!
elapsed time: 4 secs
----------------------------
******
Best Professor #1: Mr. Jeffrey Hamilton
Prof Research Focus: World History, Military History, Ancient History, Medieval History, Cultural History, Economic History, Political History, Social History, Modern History, History of Science
Prof Faculty: History
******
Best Professor #2: Vincent Lambert
Prof Research Focus: Ancient History, Social History, Military History, Economic History, Political History, Modern History, Medieval History, Cultural History, World History, History of Science
Prof Faculty: History
******
Best Professor #3: Michelle Brennan
Prof Research Focus: Cultural History, Economic History, M

Using the word-embedding map led to a notable inference speed-up of roughly 3x (about 11 s vs. 4 s for the same query).