# Baseline Network Analysis and Exploration of Exceptional Metadata

This notebook explores how best to represent S&Co data as a network. We examine three questions: whether to represent the data as a bipartite or a unipartite network (attempting to capture the information lost in moving from one to the other); whether we need to account for exceptional reading data in our analyses, or whether the network remains stable regardless; and whether time periods influence the shape of the network. Overall, the goal is to establish the best baseline for our downstream network analyses (node classification/community detection and link prediction).

#### Load Libraries and Initial Data

In [1]:

# Third-party data, graph, and plotting libraries used by this notebook.
import pandas as pd
# Turn off pandas' chained-assignment warning for the whole session.
pd.options.mode.chained_assignment = None
# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
import networkx as nx
from networkx.algorithms import bipartite
# import community
from networkx.readwrite import json_graph
# import nx_altair as nxa
from networkx.algorithms.community import greedy_modularity_communities
from pyvis import network as net
# from node2vec import Node2Vec
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import itertools
import collections
from tqdm.notebook import trange, tqdm

# Silence every Python warning for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

# Add the parent directory to the import path before importing the local
# `network_analysis` package (presumably it lives one level up — confirm).
import sys
sys.path.append("..")
from network_analysis.load_datasets import get_updated_shxco_data
# NOTE(review): the star imports below supply the helpers called in later cells
# (e.g. check_reload_build_bipartite_graphs, generate_corr_chart); which module
# defines which helper is not visible from this notebook.
from network_analysis.generate_network_metrics import *
from network_analysis.create_networks import *
from network_analysis.read_write_networks import *
from network_analysis.visualize_networks import * 


#### Baseline datasets

In [2]:
# Load the four tables used throughout this notebook from the project loader.
# NOTE(review): judging by the names these are members, books, borrow events,
# and all events — confirm against network_analysis.load_datasets.
members_df, books_df, borrow_events, events_df = get_updated_shxco_data()

In [3]:
# "All events" baseline: an independent copy of every borrow event.
all_events = borrow_events.copy()

# "Unexceptional" baseline: only the rows whose `exceptional_types` is missing.
lacks_exceptional_type = borrow_events['exceptional_types'].isna()
unexceptional_events = borrow_events.loc[lacks_exceptional_type]

### Bipartite Comparisons

#### Comparing Across Entire Time of S&Co Library

Create bipartite networks for the entire time period of the S&Co library for both all the data and only unexceptional data.

In [4]:
# Column-name mappings handed to the graph builder: the column identifying a
# member node, the column identifying a book node, and the edge-weight column.
member_attrs = {'uri': 'member_id'}
book_attrs = {'uri': 'item_uri'}
edge_attrs = {'weight': 'counts'}

# Builder options and the metric names to compute.
# NOTE(review): the recorded run printed "reloading saved graph: ..." for both
# output paths, so the helper appears to reuse results saved under ./data —
# confirm against network_analysis.
should_process = True
write_to_file = True
sk_metrics = ['katz', 'louvain']
link_metrics = ['HITS', 'CoHITS', 'BiRank', 'BGRM']

# One row per (member, book) pair; `counts` is the number of events for that pair.
pair_columns = ['member_id', 'item_uri']
all_events_grouped = (
    all_events
    .groupby(pair_columns)
    .size()
    .reset_index(name='counts')
)
unexceptional_events_grouped = (
    unexceptional_events
    .groupby(pair_columns)
    .size()
    .reset_index(name='counts')
)

# Bipartite graph, node/edge lists, and per-member / per-book tables: all events.
(
    all_events_bipartite_graph,
    all_events_bipartite_nodelist,
    all_events_bipartite_edgelist,
    all_events_members,
    all_events_books,
) = check_reload_build_bipartite_graphs(
    all_events_grouped,
    member_attrs,
    book_attrs,
    edge_attrs,
    should_process,
    write_to_file,
    './data/all_events_bipartite',
    sk_metrics,
    link_metrics,
    members_df,
    books_df,
)

# Same outputs for the unexceptional-only data.
(
    unexceptional_bipartite_graph,
    unexceptional_bipartite_nodelist,
    unexceptional_bipartite_edgelist,
    unexceptional_members,
    unexceptional_books,
) = check_reload_build_bipartite_graphs(
    unexceptional_events_grouped,
    member_attrs,
    book_attrs,
    edge_attrs,
    should_process,
    write_to_file,
    './data/unexceptional_bipartite',
    sk_metrics,
    link_metrics,
    members_df,
    books_df,
)


reloading saved graph: ./data/all_events_bipartite
reloading saved graph: ./data/unexceptional_bipartite


#### Correlations Between Bipartite Graph Metrics

In [5]:

# Member tables restricted to component 0; the charts built from them are
# titled "without disconnected nodes".
all_events_members_component0 = all_events_members.loc[all_events_members.component == 0]
unexceptional_members_component0 = unexceptional_members.loc[unexceptional_members.component == 0]

# Correlation charts over the full member tables.
# The trailing True is the same positional flag used for every member chart.
all_events_corr_members_chart = generate_corr_chart(
    all_events_members,
    all_events,
    'member correlations for all events data',
    True,
)
unexceptional_corr_members_chart = generate_corr_chart(
    unexceptional_members,
    unexceptional_events,
    'member correlations for unexceptional data',
    True,
)

# The same charts over the component-0 subsets.
all_events_corr_members_chart_subset = generate_corr_chart(
    all_events_members_component0,
    all_events,
    'member correlations for all events data without disconnected nodes',
    True,
)
unexceptional_corr_members_chart_subset = generate_corr_chart(
    unexceptional_members_component0,
    unexceptional_events,
    'member correlations for unexceptional data without disconnected nodes',
    True,
)

# Top row: full data; bottom row: component-0 subsets.
full_row = alt.hconcat(all_events_corr_members_chart, unexceptional_corr_members_chart)
subset_row = alt.hconcat(all_events_corr_members_chart_subset, unexceptional_corr_members_chart_subset)
alt.vconcat(full_row, subset_row)


In [6]:
# Labels that tag each melted correlation table and name it in the comparison charts.
df_type, df_type2 = 'all_events_members', 'unexceptional_members'
df_type_subset, df_type_subset2 = 'all_events_members_subset', 'unexceptional_members_subset'

# Melted correlations over the full member tables.
melted_all_events_members = get_melted_corr(all_events_members, all_events, True, df_type)
melted_unex_events_members = get_melted_corr(unexceptional_members, unexceptional_events, True, df_type2)

# Melted correlations over the component-0 member subsets.
melted_all_events_members_subset = get_melted_corr(
    all_events_members.loc[all_events_members.component == 0],
    all_events,
    True,
    df_type_subset,
)
melted_unex_events_members_subset = get_melted_corr(
    unexceptional_members.loc[unexceptional_members.component == 0],
    unexceptional_events,
    True,
    df_type_subset2,
)

# All events vs. unexceptional: full tables (chart) and component-0 subsets (chart1).
chart = compare_corr_chart(
    melted_all_events_members, melted_unex_events_members, df_type, df_type2)
chart1 = compare_corr_chart(
    melted_all_events_members_subset, melted_unex_events_members_subset,
    df_type_subset, df_type_subset2)

# Disabled: full vs. subset comparisons within each dataset.
# chart2 = compare_corr_chart(melted_all_events_members, melted_all_events_members_subset, df_type, df_type_subset)
# chart3 = compare_corr_chart(melted_unex_events_members,melted_unex_events_members_subset, df_type2, df_type_subset2)

alt.hconcat(chart, chart1)


In [7]:
# Correlation charts over the full book tables.
# The trailing False is the same positional flag used for every book chart.
all_events_corr_books_chart = generate_corr_chart(
    all_events_books, all_events, 'book correlations for all events data', False)
unexceptional_corr_books_chart = generate_corr_chart(
    unexceptional_books, unexceptional_events, 'book correlations for unexceptional data', False)

# The same charts restricted to component 0. The titles carry the
# "without disconnected nodes" suffix (matching the member charts built from
# the same filter) so the two rows of the figure can be told apart.
all_events_corr_books_chart_subset = generate_corr_chart(
    all_events_books.loc[all_events_books.component == 0], all_events, 'book correlations for all events data without disconnected nodes', False)
unexceptional_corr_books_chart_subset = generate_corr_chart(
    unexceptional_books.loc[unexceptional_books.component == 0], unexceptional_events, 'book correlations for unexceptional data without disconnected nodes', False)

# Top row: full data; bottom row: component-0 subsets.
alt.vconcat(
    alt.hconcat(all_events_corr_books_chart, unexceptional_corr_books_chart),
    alt.hconcat(all_events_corr_books_chart_subset, unexceptional_corr_books_chart_subset),
)


In [8]:
# Labels that tag each melted correlation table and name it in the comparison charts.
df_type, df_type2 = 'all_events_books', 'unexceptional_books'
df_type_subset, df_type_subset2 = 'all_events_books_subset', 'unexceptional_books_subset'

# Book tables restricted to component 0.
all_events_books_component0 = all_events_books.loc[all_events_books.component == 0]
unexceptional_books_component0 = unexceptional_books.loc[unexceptional_books.component == 0]

# Melted correlations: full tables first, then the component-0 subsets.
melted_all_events_books = get_melted_corr(all_events_books, all_events, False, df_type)
melted_unex_events_books = get_melted_corr(unexceptional_books, unexceptional_events, False, df_type2)
melted_all_events_books_subset = get_melted_corr(
    all_events_books_component0, all_events, False, df_type_subset)
melted_unex_events_books_subset = get_melted_corr(
    unexceptional_books_component0, unexceptional_events, False, df_type_subset2)

# All events vs. unexceptional: full tables (chart) and component-0 subsets (chart1).
chart = compare_corr_chart(
    melted_all_events_books, melted_unex_events_books, df_type, df_type2)
chart1 = compare_corr_chart(
    melted_all_events_books_subset, melted_unex_events_books_subset,
    df_type_subset, df_type_subset2)

alt.hconcat(chart, chart1)


Overall, it seems that exceptional data does not radically alter the shape of the library. The cell below reports the total number of members, books, and events, alongside how many of each drop out when exceptional events are removed:

In [19]:
# Sizes of the full ("all events") member table, book table, and event log.
total_members, total_books, total_events = map(
    len, (all_events_members, all_events_books, all_events))

# NOTE: despite the `total_unexceptional_*` names, each value below is the
# difference between the full data and the unexceptional data, i.e. how many
# rows are lost when exceptional events are excluded.
total_unexceptional_members = total_members - len(unexceptional_members)
total_unexceptional_books = total_books - len(unexceptional_books)
total_unexceptional_events = total_events - len(unexceptional_events)

# Report each total next to the corresponding difference.
print('total members in the libraries :', total_members,
      'minus exceptional members :', total_unexceptional_members)
print('total books in the libraries :', total_books,
      'minus exceptional books :', total_unexceptional_books)
print('total events in the libraries :', total_events,
      'minus exceptional events :', total_unexceptional_events)

total members in the libraries : 536 minus exceptional members : 28
total books in the libraries : 5375 minus exceptional books : 755
total events in the libraries : 19374 minus exceptional events : 6148


### Unipartite Comparisons

#### Comparing Across Entire Time of S&Co Library

In [10]:
# Column-name mappings for member nodes, book nodes, and edge weights;
# `node_attrs` is passed through empty.
member_attrs = {'uri': 'member_id'}
book_attrs = {'uri': 'item_uri'}
edge_attrs = {'weight': 'counts'}
node_attrs = {}

# Builder options and metric names for the unipartite graphs.
should_process = True
write_to_file = True
is_projected = True
sk_metrics = ['katz', 'louvain']
link_metrics = ['pagerank', 'hubs', 'auth']

# Member and book graphs (plus node lists, edge lists, and metric tables)
# built from all events.
(
    projected_members_graph,
    projected_members_nodelist,
    projected_members_edgelist,
    projected_members,
    projected_books_graph,
    projected_books_nodelist,
    projected_books_edgelist,
    projected_books,
) = check_reload_build_unipartite_graphs(
    all_events_grouped,
    all_events,
    member_attrs,
    book_attrs,
    edge_attrs,
    node_attrs,
    should_process,
    write_to_file,
    './data/all_events_unipartite_projected',
    sk_metrics,
    link_metrics,
    members_df,
    books_df,
    is_projected,
)

# The same outputs built from the unexceptional-only data.
(
    unexceptional_projected_members_graph,
    unexceptional_projected_members_nodelist,
    unexceptional_projected_members_edgelist,
    unexceptional_projected_members,
    unexceptional_projected_books_graph,
    unexceptional_projected_books_nodelist,
    unexceptional_projected_books_edgelist,
    unexceptional_projected_books,
) = check_reload_build_unipartite_graphs(
    unexceptional_events_grouped,
    unexceptional_events,
    member_attrs,
    book_attrs,
    edge_attrs,
    node_attrs,
    should_process,
    write_to_file,
    './data/unexceptional_events_unipartite_projected',
    sk_metrics,
    link_metrics,
    members_df,
    books_df,
    is_projected,
)


reloading saved graph: ./data/all_events_unipartite_projected
reloading saved graph: ./data/unexceptional_events_unipartite_projected


#### Correlations Between Unipartite Graph Metrics

In [11]:

# Correlation charts over the full projected member and book tables
# (trailing flag: True for member tables, False for book tables).
projected_members_chart = generate_corr_chart(
    projected_members, all_events, 'member correlations for all events data', True)
unexceptional_projected_members_chart = generate_corr_chart(
    unexceptional_projected_members, unexceptional_events, 'member correlations for unexceptional data', True)
projected_books_chart = generate_corr_chart(
    projected_books, all_events, 'book correlations for all events data', False)
unexceptional_projected_books_chart = generate_corr_chart(
    unexceptional_projected_books, unexceptional_events, 'book correlations for unexceptional data', False)

# The same four charts restricted to component 0. The titles carry the
# "without disconnected nodes" suffix (as the bipartite member charts built
# from the same filter do) so the subset row is distinguishable from the
# full-data row above it.
projected_members_chart_subset = generate_corr_chart(
    projected_members.loc[projected_members.component == 0], all_events, 'member correlations for all events data without disconnected nodes', True)
unexceptional_projected_members_chart_subset = generate_corr_chart(
    unexceptional_projected_members.loc[unexceptional_projected_members.component == 0], unexceptional_events, 'member correlations for unexceptional data without disconnected nodes', True)
projected_books_chart_subset = generate_corr_chart(
    projected_books.loc[projected_books.component == 0], all_events, 'book correlations for all events data without disconnected nodes', False)
unexceptional_projected_books_chart_subset = generate_corr_chart(
    unexceptional_projected_books.loc[unexceptional_projected_books.component == 0], unexceptional_events, 'book correlations for unexceptional data without disconnected nodes', False)

# Top row: full data; bottom row: component-0 subsets.
charts = [projected_members_chart, unexceptional_projected_members_chart,
          projected_books_chart, unexceptional_projected_books_chart]
subset_charts = [projected_members_chart_subset, unexceptional_projected_members_chart_subset,
                 projected_books_chart_subset, unexceptional_projected_books_chart_subset]

alt.vconcat(alt.hconcat(*charts), alt.hconcat(*subset_charts))


In [14]:
# ---- Members: all events vs. unexceptional ----
df_type, df_type2 = 'all_events_members', 'unexceptional_members'
melted_projected_members = get_melted_corr(projected_members, all_events, True, df_type)
melted_projected_unex_members = get_melted_corr(
    unexceptional_projected_members, unexceptional_events, True, df_type2)

# Members restricted to component 0.
df_type_subset, df_type_subset2 = 'all_events_members_subset', 'unexceptional_members_subset'
projected_members_component0 = projected_members.loc[projected_members.component == 0]
unexceptional_projected_members_component0 = unexceptional_projected_members.loc[
    unexceptional_projected_members.component == 0]
melted_projected_members_subset = get_melted_corr(
    projected_members_component0, all_events, True, df_type_subset)
melted_projected_unex_members_subset = get_melted_corr(
    unexceptional_projected_members_component0, unexceptional_events, True, df_type_subset2)

chart = compare_corr_chart(
    melted_projected_members, melted_projected_unex_members, df_type, df_type2)
chart1 = compare_corr_chart(
    melted_projected_members_subset, melted_projected_unex_members_subset,
    df_type_subset, df_type_subset2)

# ---- Books: all events vs. unexceptional (the label variables are reused) ----
df_type, df_type2 = 'all_events_books', 'unexceptional_books'
melted_projected_books = get_melted_corr(projected_books, all_events, False, df_type)
melted_projected_unex_books = get_melted_corr(
    unexceptional_projected_books, unexceptional_events, False, df_type2)

# Books restricted to component 0.
df_type_subset, df_type_subset2 = 'all_events_books_subset', 'unexceptional_books_subset'
projected_books_component0 = projected_books.loc[projected_books.component == 0]
unexceptional_projected_books_component0 = unexceptional_projected_books.loc[
    unexceptional_projected_books.component == 0]
melted_projected_books_subset = get_melted_corr(
    projected_books_component0, all_events, False, df_type_subset)
melted_projected_unex_books_subset = get_melted_corr(
    unexceptional_projected_books_component0, unexceptional_events, False, df_type_subset2)

chart2 = compare_corr_chart(
    melted_projected_books, melted_projected_unex_books, df_type, df_type2)
chart3 = compare_corr_chart(
    melted_projected_books_subset, melted_projected_unex_books_subset,
    df_type_subset, df_type_subset2)

# Top row: member comparisons; bottom row: book comparisons.
member_row = alt.hconcat(chart, chart1)
book_row = alt.hconcat(chart2, chart3)
alt.vconcat(member_row, book_row)


In [None]:
## Attempt at building an unprojected unipartite graph. Left disabled: per the original note it takes so long to complete that it ends in memory errors 😭
## NOTE(review): before re-enabling, check two things in the call below.
##   (a) Unpacking order: the projected calls earlier in this notebook unpack the members dataframe 4th
##       (graph, nodelist, edgelist, members, then the book outputs), whereas this one unpacks it 7th.
##   (b) Data/path pairing: the call passes the unexceptional data but writes to
##       './data/all_events_unipartite_unprojected'.
# is_projected = False
# unprojected_members_graph, unprojected_members_nodelist, unprojected_members_edgelist, unprojected_books_graph, unprojected_books_nodelist, unprojected_books_edgelist, unprojected_members, unprojected_books = check_reload_build_unipartite_graphs(
#     unexceptional_events_grouped, unexceptional_events, member_attrs, book_attrs, edge_attrs, node_attrs, should_process, write_to_file, './data/all_events_unipartite_unprojected', sk_metrics, link_metrics, members_df, books_df, is_projected)


### Temporal Network Comparisons

#### Bipartite Comparisons

In [15]:
# Boundary dates for the two periods, kept as the same date strings the
# original filters compared against.
DECADE_SPLIT = '1930-01-01'
PERIOD_CUTOFF = '1943-01-01'

# 1920s: events that ended strictly before the split date. The bound is strict
# so that an event ending exactly on the split date is assigned to one period
# only (the 1930s filter below is inclusive at the same date; previously both
# filters were inclusive and such events were counted twice).
# NOTE(review): later cells load graphs from ./data/events_1920s_* and
# ./data/events_1930s_*; anything saved with the previous inclusive filter may
# need regenerating.
events_1920s = all_events[all_events.end_datetime < DECADE_SPLIT]

# 1930s: events that ended on or after the split date and started on or before
# the cutoff date.
events_1930s = all_events[
    (all_events.end_datetime >= DECADE_SPLIT)
    & (all_events.start_datetime <= PERIOD_CUTOFF)
]


In [16]:
# Column-name mappings for member nodes, book nodes, and edge weights.
member_attrs = {'uri': 'member_id'}
book_attrs = {'uri': 'item_uri'}
edge_attrs = {'weight': 'counts'}

# Builder options and metric names for the bipartite graphs.
should_process = True
write_to_file = True
sk_metrics = ['katz', 'louvain']
link_metrics = ['HITS', 'CoHITS', 'BiRank', 'BGRM']

# One row per (member, book) pair within each period; `counts` is the number
# of events for that pair.
pair_columns = ['member_id', 'item_uri']
events_1920s_grouped = events_1920s.groupby(pair_columns).size().reset_index(name='counts')
events_1930s_grouped = events_1930s.groupby(pair_columns).size().reset_index(name='counts')

# Bipartite graph, node/edge lists, and member/book tables for the 1920s.
(
    bipartite_graph_1920s,
    bipartite_nodelist_1920s,
    bipartite_edgelist_1920s,
    bipartite_members_1920s,
    bipartite_books_1920s,
) = check_reload_build_bipartite_graphs(
    events_1920s_grouped,
    member_attrs,
    book_attrs,
    edge_attrs,
    should_process,
    write_to_file,
    './data/events_1920s_bipartite',
    sk_metrics,
    link_metrics,
    members_df,
    books_df,
)

# The same outputs for the 1930s.
(
    bipartite_graph_1930s,
    bipartite_nodelist_1930s,
    bipartite_edgelist_1930s,
    bipartite_members_1930s,
    bipartite_books_1930s,
) = check_reload_build_bipartite_graphs(
    events_1930s_grouped,
    member_attrs,
    book_attrs,
    edge_attrs,
    should_process,
    write_to_file,
    './data/events_1930s_bipartite',
    sk_metrics,
    link_metrics,
    members_df,
    books_df,
)


reloading saved graph: ./data/events_1920s_bipartite
reloading saved graph: ./data/events_1930s_bipartite


In [17]:
# Builder options and metric names for the unipartite graphs. The member/book/
# edge column mappings are reused from the bipartite cell above.
node_attrs = {}
should_process = True
write_to_file = True
is_projected = True
sk_metrics = ['katz', 'louvain']
link_metrics = ['pagerank', 'hubs', 'auth']

# Member and book graphs (plus node lists, edge lists, and metric tables)
# for the 1920s.
(
    members_graph_1920s,
    members_nodelist_1920s,
    members_edgelist_1920s,
    joined_members_1920s,
    books_graph_1920s,
    books_nodelist_1920s,
    books_edgelist_1920s,
    joined_books_1920s,
) = check_reload_build_unipartite_graphs(
    events_1920s_grouped,
    events_1920s,
    member_attrs,
    book_attrs,
    edge_attrs,
    node_attrs,
    should_process,
    write_to_file,
    './data/events_1920s_unipartite_projected',
    sk_metrics,
    link_metrics,
    members_df,
    books_df,
    is_projected,
)

# The same outputs for the 1930s.
(
    members_graph_1930s,
    members_nodelist_1930s,
    members_edgelist_1930s,
    joined_members_1930s,
    books_graph_1930s,
    books_nodelist_1930s,
    books_edgelist_1930s,
    joined_books_1930s,
) = check_reload_build_unipartite_graphs(
    events_1930s_grouped,
    events_1930s,
    member_attrs,
    book_attrs,
    edge_attrs,
    node_attrs,
    should_process,
    write_to_file,
    './data/events_1930s_unipartite_projected',
    sk_metrics,
    link_metrics,
    members_df,
    books_df,
    is_projected,
)


reloading saved graph: ./data/events_1920s_unipartite_projected
reloading saved graph: ./data/events_1930s_unipartite_projected


In [22]:
# Bipartite member tables restricted to component 0, one per period.
bipartite_members_1920s_component0 = bipartite_members_1920s.loc[
    bipartite_members_1920s.component == 0]
bipartite_members_1930s_component0 = bipartite_members_1930s.loc[
    bipartite_members_1930s.component == 0]

# Correlation charts over the full per-period member tables.
bipartite_members_1920s_chart = generate_corr_chart(
    bipartite_members_1920s,
    events_1920s,
    'bipartite member correlations for 1920s events data',
    True,
)
bipartite_members_1930s_chart = generate_corr_chart(
    bipartite_members_1930s,
    events_1930s,
    'bipartite member correlations for 1930s events data',
    True,
)

# The same charts over the component-0 subsets.
bipartite_members_1920s_chart_subset = generate_corr_chart(
    bipartite_members_1920s_component0,
    events_1920s,
    'bipartite member correlations for 1920s events data subset',
    True,
)
bipartite_members_1930s_chart_subset = generate_corr_chart(
    bipartite_members_1930s_component0,
    events_1930s,
    'bipartite member correlations for 1930s events data subset',
    True,
)

# Top row: full data; bottom row: component-0 subsets.
full_row = alt.hconcat(bipartite_members_1920s_chart, bipartite_members_1930s_chart)
subset_row = alt.hconcat(bipartite_members_1920s_chart_subset, bipartite_members_1930s_chart_subset)
alt.vconcat(full_row, subset_row)


In [23]:

# Correlation charts over the full unipartite member tables, one per period.
joined_members_1920s_chart = generate_corr_chart(
    joined_members_1920s, events_1920s, 'unipartite member correlations for 1920s events', True)
joined_members_1930s_chart = generate_corr_chart(
    joined_members_1930s, events_1930s, 'unipartite member correlations for 1930s events', True)

# The same charts restricted to component 0.
joined_members_1920s_chart_subset = generate_corr_chart(
    joined_members_1920s.loc[joined_members_1920s.component == 0], events_1920s, 'unipartite member correlations for 1920s events subset', True)
joined_members_1930s_chart_subset = generate_corr_chart(
    joined_members_1930s.loc[joined_members_1930s.component == 0], events_1930s, 'unipartite member correlations for 1930s events subset', True)

# Top row: full data; bottom row: component-0 subsets. The bottom row
# previously repeated the full-data charts, so the subset charts were built
# but never shown.
alt.vconcat(
    alt.hconcat(joined_members_1920s_chart, joined_members_1930s_chart),
    alt.hconcat(joined_members_1920s_chart_subset, joined_members_1930s_chart_subset),
)
