# iSchool People and Research

### Load Libraries

In [17]:
import pandas as pd
import altair as alt
import requests
from bs4 import BeautifulSoup
import networkx as nx
from thefuzz import fuzz
import itertools
from networkx.algorithms import bipartite

In [2]:
# Download the research-areas landing page and collect one <li> per area.
url = "https://ischool.illinois.edu/research/areas"
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page
# (which would simply yield zero research areas).
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')
research_links = soup.find_all('li', class_='taxonomy-term')

In [3]:
# Turn every research-area list item into one record holding the area's
# display name, its description text and the absolute URL of its page.
area_records = []
for item in research_links:
    anchor = item.find('a', class_='taxonomy-term__link')
    area_url = "https://ischool.illinois.edu" + anchor.get('href')
    blurb = item.find('div', class_='taxonomy-term__description')
    area_records.append({
        'research_area': anchor.get_text(),
        'research_description': blurb.get_text(),
        'research_area_url': area_url,
    })
research_areas = pd.DataFrame(area_records)

In [4]:
# Visit every research-area page and record one row per (person, area) pair,
# carrying the area's name, URL and description along with each person.
dfs = []
for _, row in research_areas.iterrows():
    # Bound each request and fail loudly on an HTTP error: an unchecked error
    # page contains no personnel items and would silently drop the whole area.
    response = requests.get(row['research_area_url'], timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    faculty = soup.find_all('li', class_='personnel-list__person-item')
    for fac in faculty:
        fac_name = fac.find('div', class_='personnel-list__person-name').get_text()
        fac_title = fac.find('div', class_='personnel-list__person-role').get_text()
        fac_url = "https://ischool.illinois.edu" + fac.find('a', class_='personnel-list__person-link').get('href')
        dfs.append({
            'name': fac_name,
            'description': fac_title,
            'url': fac_url,
            'research_area': row['research_area'],
            'research_url': row['research_area_url'],
            'research_description': row['research_description'],
        })
final_df = pd.DataFrame(dfs)

In [6]:
# Download and parse the committees page.
committees_url = "https://ischool.illinois.edu/people/committees"
# The timeout prevents an indefinite hang; raise_for_status() surfaces HTTP
# errors here instead of letting an error page reach the parsing cells below.
committees_r = requests.get(committees_url, timeout=30)
committees_r.raise_for_status()
committees_soup = BeautifulSoup(committees_r.text, 'html.parser')

In [7]:
# Grab the first <div> carrying the text-block classes below; the following
# cells read the committee paragraphs out of it.
# NOTE(review): a multi-class string passed to class_ is presumably matched
# against the exact class attribute value — confirm against the live markup.
div = committees_soup.find('div', class_='text-plus-image text-plus-image__noimage')

In [8]:
# Every <p> inside the located div; the next cell treats each one as a single
# committee (optional <strong> title followed by its members).
paragraphs = div.find_all('p')

In [10]:
# Parse each committee paragraph into (committee_title, committee_member) rows.
# The <strong> text, when present, is the committee title; what remains of the
# paragraph text is a comma-separated member list that may also contain
# '\r\n' line breaks and "(Chair)" / "(ex officio)" role markers.
dfs = []
for para in paragraphs:
    heading = para.find('strong')
    title = heading.get_text() if heading else ''
    for chunk in para.get_text().replace(title, '').split(','):
        cleaned_member = chunk.strip()
        # Remove whole role markers first, then the halves left behind when a
        # marker was itself split on a comma. Order matters.
        for marker in ('(ex officio)', '(Chair)', 'ex officio)', '(Chair'):
            cleaned_member = cleaned_member.replace(marker, '')
        for m in cleaned_member.split('\r\n'):
            # Skip empty/one-character leftovers and any fragment that still
            # mentions "chair".
            if len(m) > 1 and 'chair' not in m.lower():
                dfs.append({'committee_title': title, 'committee_member': m})
committee_df = pd.DataFrame(dfs)

In [11]:
# Mirror the member string into a 'name' column, then give paragraphs that had
# no <strong> title (empty committee_title) the label "Executive Committee".
committee_df['name'] = committee_df['committee_member']
untitled = committee_df['committee_title'] == ""
committee_df.loc[untitled, 'committee_title'] = "Executive Committee"

In [14]:
# Fuzzy-match names scraped from the research-area pages against names scraped
# from the committees page, keeping pairs whose token-set ratio exceeds 90.
original_names = final_df['name'].unique()
committee_names = committee_df['committee_member'].unique()

# Every (research-page name, committee-page name) combination.
names = list(itertools.product(original_names, committee_names))

dfs = []
for person_name, member_name in names:
    score = fuzz.token_set_ratio(person_name, member_name)
    if score > 90:
        dfs.append({'name': person_name, 'committee_member': member_name, 'score': score})

# Pin the columns so that a run with zero matches still yields a frame with a
# 'committee_member' column; otherwise the merge on that key in the next cell
# raises KeyError.
matches_df = pd.DataFrame(dfs, columns=['name', 'committee_member', 'score'])
matches_df = matches_df.drop_duplicates()

In [15]:
# Attach committee titles to the fuzzy matches. The outer join keeps committee
# members that matched nobody; those rows have no 'name', so fall back to the
# raw committee_member string for them.
committee_titles = committee_df[['committee_member', 'committee_title']]
merged_df = matches_df.merge(committee_titles, on=['committee_member'], how='outer')

unmatched = merged_df['name'].isna()
merged_df.loc[unmatched, 'name'] = merged_df['committee_member']

In [16]:
# Outer-join research-area rows with committee rows on 'name' so that people
# appearing in only one of the two sources are still kept.
all_df = final_df.merge(merged_df, on='name', how='outer')

In [28]:
# Persist the combined people / research-area / committee table to disk,
# omitting the DataFrame index column.
all_df.to_csv("scraped_ischool_people.csv", index=False)

### Generate Networks

In [25]:
# Reshape to long form: each research_area and each committee_title becomes
# its own row, with the label in 'value' and its origin column in 'variable'.
person_columns = ['name', 'description', 'url', 'research_url', 'research_description', 'committee_member', 'score']
group_columns = ['research_area', 'committee_title']
all_melted = all_df.melt(id_vars=person_columns, value_vars=group_columns)

In [26]:
# People node table: one row per name, keeping the first non-null description
# and URL seen for that name.
people_nodes = all_melted[['name', 'description', 'url']].drop_duplicates()
subset_people_nodes = (
    people_nodes
    .groupby('name')[['description', 'url']]
    .first()
    .reset_index()
)

# Area/committee node table: one row per (value, variable) pair, keeping the
# first non-null research description and URL seen for that pair.
# NOTE(review): committee rows carry the research_* columns of whichever
# person row they were melted from, so committee nodes presumably inherit
# unrelated research metadata — confirm whether that is intended.
area_nodes = all_melted[['value', 'variable', 'research_description', 'research_url']].drop_duplicates()
subset_area_nodes = (
    area_nodes
    .groupby(['value', 'variable'])[['research_description', 'research_url']]
    .first()
    .reset_index()
)




In [54]:
# Person -> research area / committee edge list.
# The outer merges leave people with no committee (or no research area), and
# melting turns those gaps into rows whose 'value' is NaN. Drop them: otherwise
# add_edges_from would create a spurious NaN node that links every such person
# together in the graph and in its projection.
edges = all_melted[['name', 'value']].dropna(subset=['name', 'value']).drop_duplicates()

In [58]:


# Build the person <-> area/committee graph and export it as GEXF.
G = nx.Graph()

# People go in partition 0; attach whichever of description/url is non-null.
for _, person in subset_people_nodes.iterrows():
    person_attrs = person[['description', 'url']].dropna().to_dict()
    G.add_node(person['name'], bipartite=0, **person_attrs)

# Research areas and committees go in partition 1, again skipping null fields.
for _, group in subset_area_nodes.iterrows():
    group_attrs = group[['variable', 'research_description', 'research_url']].dropna().to_dict()
    G.add_node(group['value'], bipartite=1, **group_attrs)

# Each row of `edges` is a (name, value) pair.
G.add_edges_from(edges.values)
nx.write_gexf(G, "ischool_people_research_areas.gexf")

In [46]:
# Split the nodes using the `bipartite` attribute assigned when G was built
# (0 = people, 1 = research areas / committees). bipartite.sets(G) two-colours
# the graph instead: it raises AmbiguousSolution when G is disconnected and
# does not guarantee which side is returned first.
bottom_nodes = {node for node, side in G.nodes(data='bipartite') if side == 0}
# Everything else (including any node that lacks the attribute) is the top set.
top_nodes = set(G) - bottom_nodes

In [39]:
# Project the bipartite graph onto bottom_nodes: two of those nodes are linked
# in B when they share a neighbour in G (a research area or a committee).
B = bipartite.weighted_projected_graph(G, bottom_nodes)

In [40]:
# Export the projected graph B as GEXF.
nx.write_gexf(B, "ischool_people_committees_updated.gexf")

In [133]:

def build_edges(borrow_events, group_col, list_col):
    """Build a counted co-occurrence edge list from a long-format table.

    Rows of ``borrow_events`` are grouped by ``group_col``. Within each group,
    every pair of ``list_col`` values (in row order, as produced by
    ``itertools.combinations``) becomes one edge. Identical
    ``(source, target, group)`` triples are then collapsed and counted.

    Args:
        borrow_events: DataFrame containing ``group_col`` and ``list_col``.
        group_col: Name of the column whose values define the groups.
        list_col: Name of the column whose values become edge endpoints.

    Returns:
        DataFrame with columns ``source``, ``target``, ``group_col`` and
        ``counts``. When no group has at least two members the frame is empty
        but still carries those four columns.
    """
    # Plain groupby/iteration: no dependency on tqdm's ``progress_apply``,
    # which only exists after ``tqdm.pandas()`` has been called.
    members_by_group = borrow_events.groupby(group_col)[list_col].apply(list)

    records = []
    for group_key, members in members_by_group.items():
        # combinations() yields nothing for groups with fewer than two members.
        for source, target in itertools.combinations(members, 2):
            records.append({'source': source, 'target': target, group_col: group_key})

    if not records:
        # Nothing to count; return an empty, correctly shaped frame.
        return pd.DataFrame(columns=['source', 'target', group_col, 'counts'])

    final_edges = pd.DataFrame(records)
    grouped_edges = final_edges.groupby(
        ['source', 'target', group_col]).size().reset_index(name='counts')
    return grouped_edges

def get_attrs(dict_attrs, rows):
    """Resolve an attribute-name -> lookup-key mapping against one row.

    Args:
        dict_attrs: Mapping whose keys are attribute names and whose values
            are keys to look up in ``rows``.
        rows: Any object supporting ``rows[key]`` for those lookup keys.

    Returns:
        A new dict with the same keys as ``dict_attrs`` and each value
        replaced by ``rows[value]``. ``dict_attrs`` itself is not modified.
    """
    return {attr_name: rows[lookup_key] for attr_name, lookup_key in dict_attrs.items()}

def add_nodes(rows, graph, node_attrs):
    """Add every value in ``rows`` to ``graph`` as a node with shared attributes.

    Args:
        rows: Iterable of node identifiers, passed to ``graph.add_nodes_from``.
        graph: Graph object exposing ``add_nodes_from``.
        node_attrs: Attribute mapping. With more than one entry its values are
            resolved against ``rows`` via ``get_attrs``; otherwise it is
            passed through unchanged.
    """
    # NOTE(review): the threshold is "> 1", so a single-entry mapping is used
    # verbatim rather than resolved — confirm this is intended and not "> 0".
    if len(node_attrs) > 1:
        resolved_attrs = get_attrs(node_attrs, rows)
    else:
        resolved_attrs = node_attrs
    graph.add_nodes_from(rows, **resolved_attrs)

def add_edges(rows, graph, edge_attrs):
    """Add the single edge described by ``rows`` to ``graph``.

    Args:
        rows: Record exposing ``source`` and ``target`` attributes plus the
            keys referenced by the values of ``edge_attrs``.
        graph: Graph object exposing ``add_edges_from``.
        edge_attrs: Mapping of edge-attribute name -> key to read from ``rows``.
    """
    resolved_attrs = get_attrs(edge_attrs, rows)
    endpoints = (rows.source, rows.target)
    graph.add_edges_from([endpoints], **resolved_attrs)

def create_unipartite_network(df, graph, node_attrs, edge_attrs, node_col, edge_col):
    """Populate ``graph`` with a one-mode network derived from ``df``.

    Nodes are the values of ``node_col``. Edges come from ``build_edges``,
    which pairs up ``node_col`` values that share the same ``edge_col`` value.

    Args:
        df: Source DataFrame.
        graph: Graph object that is mutated in place.
        node_attrs: Attribute mapping forwarded to ``add_nodes``.
        edge_attrs: Attribute mapping forwarded to ``add_edges``.
        node_col: Column holding the node identifiers.
        edge_col: Column whose shared values link nodes together.
    """
    # Only the node column is kept, so add_nodes sees one-value rows.
    node_frame = df.loc[:, [node_col]]
    edge_frame = build_edges(df, edge_col, node_col)
    node_frame.apply(lambda row: add_nodes(row, graph=graph, node_attrs=node_attrs), axis=1)
    edge_frame.apply(lambda row: add_edges(row, graph=graph, edge_attrs=edge_attrs), axis=1)

def create_bipartite_network(rows, graph, member_attrs, book_attrs, edge_attrs):
    """Add one member-item pair from ``rows`` to a two-mode ``graph``.

    The node ``rows.member_id`` is added with ``group='members'`` and
    ``bipartite=0``, the node ``rows.item_uri`` with ``group='books'`` and
    ``bipartite=1``, and a single edge is added between them.

    Args:
        rows: Record exposing ``member_id`` and ``item_uri`` plus the keys
            referenced by the three attribute mappings.
        graph: Graph object that is mutated in place.
        member_attrs: Attribute name -> key in ``rows`` for the member node.
        book_attrs: Attribute name -> key in ``rows`` for the item node.
        edge_attrs: Attribute name -> key in ``rows`` for the edge.
    """
    # Resolve all three mappings before touching the graph.
    member_kwargs, book_kwargs, edge_kwargs = (
        get_attrs(spec, rows) for spec in (member_attrs, book_attrs, edge_attrs)
    )

    member_node = rows.member_id
    book_node = rows.item_uri
    graph.add_node(member_node, **member_kwargs, group='members', bipartite=0)
    graph.add_node(book_node, group='books', bipartite=1, **book_kwargs)
    graph.add_edges_from([(member_node, book_node)], **edge_kwargs)