In [1]:
from bs4 import BeautifulSoup
import urllib

In [9]:
class Professor:
    """One directory entry, plus the profile data later attached to it.

    Attributes set by the constructor: fname, lname, phone, email, desc, href.
    Attributes filled in afterwards through the setters: area, profileText.
    """

    def __init__(self, fname, lname, phone, email, desc, href=''):
        """Store the directory fields; `href` is the profile page URL, if known."""
        self.fname = fname
        self.lname = lname
        self.email = email
        self.phone = phone
        self.desc = desc
        self.href = href
        # Populated later via setAreaOfInterest / setProfileText.
        self.area = None
        self.profileText = ''

    def getName(self):
        """Return the name parts joined by a single space.

        Empty parts are dropped, so an entry created with the whole name in
        `fname` and `lname=''` yields 'Jane Doe' rather than 'Jane Doe ' (the
        stray space would otherwise leak into name comparisons).
        """
        return ' '.join(part for part in (self.fname, self.lname) if part)

    def setAreaOfInterest(self, area):
        """Record the area of interest assigned to this professor."""
        self.area = area

    def getAreaOfInterest(self):
        """Return the assigned area of interest (None until one is set)."""
        return self.area

    def setProfileText(self, text):
        """Attach the text of the professor's profile page."""
        self.profileText = text

    def getProfileText(self):
        """Return the attached profile text ('' until one is set)."""
        return self.profileText

    def copyHref(self, other):
        """Take over the profile URL of another Professor instance."""
        self.href = other.href

    def __str__(self):
        """Return a one-line summary: padded name, '=>email', first 30 chars of desc."""
        name_column = '{:30}'.format(self.getName())
        email_column = '{:20}'.format('=>' + self.email)
        return str(name_column + email_column + ' ' + self.desc[0:30])

    def __repr__(self):
        """Use the same summary line as __str__ so lists of professors read well."""
        return self.__str__()

In [10]:
def buildProfessor(raw_info):
    """Create a Professor from the child nodes of one directory table row.

    The last element of `raw_info` is ignored. The remaining elements must be
    exactly five nodes exposing `getText()`, read in this order: last name,
    first name, email, phone, description.
    """
    cells = raw_info[:-1]
    texts = [cell.getText() for cell in cells]
    # Unpacking enforces the five-column layout (ValueError otherwise).
    lname, fname, email, phone, desc = texts
    return Professor(fname=fname, lname=lname, phone=phone, email=email, desc=desc)

In [11]:
def getISIProfessors():
    """Download the ISI people directory and return a list of Professor objects.

    Every row of the first table on the page, except the first row, is handed
    to buildProfessor. Raises IndexError if the page contains no table.
    """
    response = urllib.urlopen('https://www.isi.edu/people/directory?sq=&letter=&sort=asc&show=all')
    try:
        html = response.read()
    finally:
        # Always release the connection, even when the read fails.
        response.close()

    # NOTE(review): no parser is named, so bs4 uses whichever one is installed;
    # consider pinning one once the parser used for the original run is known.
    soup = BeautifulSoup(html)
    table = soup.find_all('table')[0]
    # Row 0 is skipped -- presumably the header row; confirm against the page.
    rows = table.find_all('tr')[1:]
    return [buildProfessor(list(row.children)) for row in rows]

In [12]:
def getUSCProfessors():
    """Download the USC Viterbi faculty directory and return Professor objects.

    Each <h5 class="resultName"> supplies the full name (stored in `fname`,
    with `lname` left empty); the `href` of its parent element, prefixed with
    the site root, becomes the profile URL. Entries whose parent carries no
    `href` are skipped, because no profile URL can be built for them.
    """
    response = urllib.urlopen('https://viterbi.usc.edu/directory/faculty/')
    try:
        html = response.read()
    finally:
        # Always release the connection, even when the read fails.
        response.close()

    soup = BeautifulSoup(html)
    uscProfessors = []
    for name in soup.find_all('h5', 'resultName'):
        link = name.parent.get('href')
        if not link:
            # Without this guard the concatenation below raises TypeError on None.
            continue
        url = 'https://viterbi.usc.edu' + link
        uscProfessors.append(
            Professor(name.getText(), lname='', phone='', email='', desc='', href=url))
    return uscProfessors

In [13]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [14]:
uscProfessors = getUSCProfessors()
isiProfessors = getISIProfessors()

targetedProfessors = []
for prof in isiProfessors:
    for usc_prof in uscProfessors:
        if fuzz.ratio(prof.getName(), usc_prof.getName()) > 80:
            prof.copyHref(usc_prof)
            targetedProfessors.append(prof)
            print '{:30}'.format('Matched : ' + prof.getName().encode('utf-8').strip()) + ' => ', prof
            break

Matched : Jose-Luis Ambite     =>  Jose-Luis Ambite              =>ambite@isi.edu     Research Assistant Professor, 
Matched : Yigal Arens          =>  Yigal Arens                   =>arens@isi.edu      Senior Director For Administra
Matched : David Barnhart       =>  David Barnhart                =>barnhart@isi.edu   Research Professor
Matched : Andrea Belz          =>  Andrea Belz                   =>                   
Matched : Spartak Buniatyan    =>  Spartak Buniatyan             =>                   
Matched : Young Cho            =>  Young Cho                     =>youngcho@isi.edu   Research Assistant Professor
Matched : Stephen Crago        =>  Stephen Crago                 =>crago@isi.edu      Research Associate Professor, 
Matched : Ewa Deelman          =>  Ewa Deelman                   =>deelman@isi.edu    Research Professor; Research D
Matched : Emilio Ferrara       =>  Emilio Ferrara                =>ferrarae@isi.edu   Research Assistant Professor, 
Matched : Rafael Ferr

In [22]:
def getProfessorProfile(professor):
    """Download `professor.href` and return the text of its profile section.

    The profile section is the first <div class="profileModuleLeft"> on the
    page. If the page has no such div, '' is returned (the same value a
    Professor starts with) so that one unusual page does not abort a batch.
    """
    response = urllib.urlopen(professor.href)
    try:
        html = response.read()
    finally:
        # Always release the connection, even when the read fails.
        response.close()

    soup = BeautifulSoup(html)
    profile_div = soup.find('div', 'profileModuleLeft')
    if profile_div is None:
        return ''
    return profile_div.getText()

In [23]:
# Download each matched professor's profile page and keep its text on the
# object; this issues one HTTP request per professor.
for prof in targetedProfessors:
    profile_text = getProfessorProfile(prof)
    prof.setProfileText(profile_text)

In [24]:
def getCategoryScore(prof, categoryTags):
    """Count how many of `categoryTags` occur in the professor's profile text.

    Matching is a case-insensitive substring test: both the profile text and
    each tag are lowercased first, so a tag such as 'OpenCV' can match. Each
    tag contributes at most 1, however often it occurs.

    Args:
        prof: object exposing getProfileText() that returns a string.
        categoryTags: iterable of keyword strings.

    Returns:
        The number of tags found, as an int (0 for an empty tag list).
    """
    profText = prof.getProfileText().lower()
    return sum(1 for categoryTag in categoryTags if categoryTag.lower() in profText)

In [34]:
def setCategoriesScores(prof, allCategories):
    """Assign `prof` the category whose tags score highest on the profile text.

    `allCategories` maps a category name to its list of tags. A category only
    replaces the current best when its score is strictly higher, so on a tie
    the one seen first in the dict's iteration order wins. If no category
    scores above zero, the area is set to 'Not Found'.
    """
    bestCategory, bestScore = 'Not Found', 0
    for category, tags in allCategories.items():
        categoryScore = getCategoryScore(prof, tags)
        if categoryScore <= bestScore:
            continue
        bestCategory, bestScore = category, categoryScore
    prof.setAreaOfInterest(bestCategory)

In [35]:
# Keyword lists per research area. A professor is assigned the area whose
# keywords occur most often (as substrings) in the profile text.
allCategories = {
    'imageProcessing': ['image', 'computer vision', 'opencv'],
    # 'machine learning' was previously misspelled 'machine leaning' and could never match.
    'artificialIntelligence': ['artificial', 'intelligence', 'machine learning', 'intelligent'],
    'robotics': ['robotics', 'motor', 'drones'],
    'cryptography': ['cryptography', 'cyber security', 'steganography'],
    'networking': ['networking', 'networks'],
    'highperformance': ['performance', 'computing', 'cloud'],
}

for prof in targetedProfessors:
    setCategoriesScores(prof, allCategories)

In [36]:
for prof in targetedProfessors:
    print '{:25}'.format(prof.getName()) + '{:25}'.format(prof.getAreaOfInterest()) + prof.href

Jose-Luis Ambite         Not Found                https://viterbi.usc.edu/directory/faculty/Ambite-Molina/Jose-Luis
Yigal Arens              artificialIntelligence   https://viterbi.usc.edu/directory/faculty/Arens/Yigal
David Barnhart           robotics                 https://viterbi.usc.edu/directory/faculty/Barnhart/David
Andrea Belz              artificialIntelligence   https://viterbi.usc.edu/directory/faculty/Belz/Andrea
Spartak Buniatyan        highperformance          https://viterbi.usc.edu/directory/faculty/Buniatyan/Spartak
Young Cho                networking               https://viterbi.usc.edu/directory/faculty/Cho/Young
Stephen Crago            highperformance          https://viterbi.usc.edu/directory/faculty/Crago/Stephen
Ewa Deelman              highperformance          https://viterbi.usc.edu/directory/faculty/Deelman/Ewa
Emilio Ferrara           highperformance          https://viterbi.usc.edu/directory/faculty/Ferrara/Emilio
Rafael Ferreira Da Silva highperformance