# Bipartite Network Analysis and Exploration of Exceptional Metadata

This notebook explores how S&Co data can be represented as a bipartite network, and how that network helps us assess whether exceptional member behavior affects the network's structure.

#### Load Libraries and Initial Data

In [28]:

import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

import warnings
# Suppress ALL Python warnings for the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")

import sys
# Put the parent directory on the import path; presumably that is where the
# `network_analysis` package lives — confirm against the repository layout.
sys.path.append("..")
from network_analysis.load_datasets import get_updated_shxco_data
# NOTE(review): the star imports below supply helpers used later in the notebook
# (e.g. check_reload_build_bipartite_graphs, generate_corr_chart, and likely `alt`);
# which module provides which name cannot be determined from this file alone.
from network_analysis.generate_network_metrics import *
from network_analysis.create_networks import *
from network_analysis.read_write_networks import *
from network_analysis.visualize_networks import * 


#### Baseline datasets

In [29]:
# Load the four baseline frames from the project loader, then unpack them in the
# order the loader returns them: members, books, borrow events, all events.
# NOTE(review): `get_subscription=False` presumably skips subscription data — confirm in load_datasets.
shxco_datasets = get_updated_shxco_data(get_subscription=False)
members_df, books_df, borrow_events, events_df = shxco_datasets

In [30]:
# Set aside the borrow events that have no recorded start date.
missing_start_date = borrow_events['start_datetime'].isna()
unknown_borrows = borrow_events[missing_start_date]

In [31]:
# Keep only the borrow events that have both a start date and an end date.
has_start_date = borrow_events['start_datetime'].notna()
has_end_date = borrow_events['end_datetime'].notna()
borrow_events = borrow_events[has_start_date & has_end_date]


In [32]:
# Borrows whose start date falls before 1 January 1942; copied so that later
# changes do not write back into `borrow_events`.
# NOTE(review): comparing against a date string assumes start_datetime is datetime-like — confirm dtype.
started_before_1942 = borrow_events['start_datetime'] < '1942-01-01'
all_borrows = borrow_events[started_before_1942].copy()

# Subset of those borrows with no value in `exceptional_types`.
lacks_exceptional_type = all_borrows['exceptional_types'].isna()
unexceptional_borrows = all_borrows[lacks_exceptional_type]

## Alternative: to use all events regardless of type, swap in the two lines below.
# all_borrows = events_df[events_df.item_uri.isna() == False].copy()
# unexceptional_borrows = all_borrows[all_borrows.exceptional_types.isna()]


### Bipartite Comparisons

#### Comparing Across the Entire Lifetime of the S&Co Library

Create bipartite networks covering the entire time period of the S&Co library, once from all of the borrow data and once from only the unexceptional data.

In [33]:
# Attribute mappings handed to the graph builder.
# NOTE(review): these look like {graph attribute: source column} — confirm in create_networks.
member_attrs = dict(uri='member_id')
book_attrs = dict(uri='item_uri')
edge_attrs = dict(weight='counts')

# One row per (member, book) pair; `counts` is how many borrow events the pair has.
pair_keys = ['member_id', 'item_uri']
all_borrows_grouped = (
    all_borrows.groupby(pair_keys).size().reset_index(name='counts')
)
unexceptional_borrows_grouped = (
    unexceptional_borrows.groupby(pair_keys).size().reset_index(name='counts')
)

# Flags and metric lists passed through to the builder unchanged.
should_process = True
write_to_file = True
sk_metrics = ['katz', 'louvain']
link_metrics = ['HITS', 'CoHITS', 'BiRank', 'BGRM']

# Graph built from every borrow.
(
    all_borrows_bipartite_graph,
    all_borrows_bipartite_nodelist,
    all_borrows_bipartite_edgelist,
    all_borrows_members,
    all_borrows_books,
) = check_reload_build_bipartite_graphs(
    all_borrows_grouped,
    member_attrs,
    book_attrs,
    edge_attrs,
    should_process,
    write_to_file,
    './data/borrow_events_bipartite',
    sk_metrics,
    link_metrics,
    members_df,
    books_df,
)

# Same construction, restricted to borrows without an exceptional type.
(
    unexceptional_borrows_bipartite_graph,
    unexceptional_borrows_bipartite_nodelist,
    unexceptional_borrows_bipartite_edgelist,
    unexceptional_borrows_members,
    unexceptional_borrows_books,
) = check_reload_build_bipartite_graphs(
    unexceptional_borrows_grouped,
    member_attrs,
    book_attrs,
    edge_attrs,
    should_process,
    write_to_file,
    './data/unexceptional_borrow_events_bipartite',
    sk_metrics,
    link_metrics,
    members_df,
    books_df,
)


reloading saved graph: ./data/borrow_events_bipartite
reloading saved graph: ./data/unexceptional_borrow_events_bipartite


#### Correlations Between Bipartite Graph Metrics

In [37]:
# Sort the member-frame columns into "local" and "global" groups, based purely
# on which of those substrings appears in the column name.
member_column_names = all_borrows_members.columns
local_cols = [name for name in member_column_names if 'local' in name]
global_cols = [name for name in member_column_names if 'global' in name]

In [38]:
# Top row: member frames with the local-metric columns dropped.
global_members_borrows = all_borrows_members.loc[
    :, ~all_borrows_members.columns.isin(local_cols)]
unexceptional_global_members_borrows = unexceptional_borrows_members.loc[
    :, ~unexceptional_borrows_members.columns.isin(local_cols)]

all_borrows_corr_members_chart = generate_corr_chart(
    global_members_borrows,
    'member correlations for all borrows data',
)
unexceptional_corr_members_chart = generate_corr_chart(
    unexceptional_global_members_borrows,
    'member correlations for unexceptional data',
)

# Bottom row: member frames with the global-metric columns dropped, restricted
# to component 0 (the chart titles describe this as "without disconnected nodes").
local_members_borrows = all_borrows_members.loc[
    :, ~all_borrows_members.columns.isin(global_cols)]
unexceptional_local_members_borrows = unexceptional_borrows_members.loc[
    :, ~unexceptional_borrows_members.columns.isin(global_cols)]

all_borrows_corr_members_chart_subset = generate_corr_chart(
    local_members_borrows[local_members_borrows['component'] == 0],
    'member correlations for all borrows data without disconnected nodes',
)
unexceptional_corr_members_chart_subset = generate_corr_chart(
    unexceptional_local_members_borrows[unexceptional_local_members_borrows['component'] == 0],
    'member correlations for unexceptional data without disconnected nodes',
)

# 2x2 grid: all borrows on the left, unexceptional borrows on the right.
alt.vconcat(
    alt.hconcat(all_borrows_corr_members_chart, unexceptional_corr_members_chart),
    alt.hconcat(all_borrows_corr_members_chart_subset, unexceptional_corr_members_chart_subset),
)


In [39]:
# Labels that identify each frame in the comparison charts.
df_type, df_type2 = 'all_borrows_members', 'unexceptional_members'
df_type_subset, df_type_subset2 = 'all_borrows_members_subset', 'unexceptional_members_subset'

# Melted correlations for the frames without the local-metric columns.
melted_all_borrows_members = get_melted_corr(global_members_borrows, df_type)
melted_unex_borrows_members = get_melted_corr(unexceptional_global_members_borrows, df_type2)

# Melted correlations for the frames without the global-metric columns, component 0 only.
melted_all_borrows_members_subset = get_melted_corr(
    local_members_borrows[local_members_borrows['component'] == 0],
    df_type_subset,
)
melted_unex_borrows_members_subset = get_melted_corr(
    unexceptional_local_members_borrows[unexceptional_local_members_borrows['component'] == 0],
    df_type_subset2,
)

# All vs. unexceptional, first on the full frames and then on the component-0 subsets.
chart = compare_corr_chart(
    melted_all_borrows_members, melted_unex_borrows_members, df_type, df_type2)
chart1 = compare_corr_chart(
    melted_all_borrows_members_subset, melted_unex_borrows_members_subset,
    df_type_subset, df_type_subset2)

# Optional extra comparisons (full frame vs. its own subset), currently disabled:
# chart2 = compare_corr_chart(melted_all_borrows_members, melted_all_borrows_members_subset, df_type, df_type_subset)
# chart3 = compare_corr_chart(melted_unex_borrows_members,melted_unex_borrows_members_subset, df_type2, df_type_subset2)

alt.hconcat(chart, chart1)


In [40]:
# Book-side correlation charts, laid out as a 2x2 grid:
#   top row    -> book frames with the local-metric columns dropped
#   bottom row -> book frames with the global-metric columns dropped, component 0 only
#   left       -> all borrows, right -> unexceptional borrows
# NOTE(review): `local_cols` / `global_cols` are whatever an earlier cell defined;
# this cell only uses them to decide which columns to drop.

# Top row inputs.
global_books_borrows = all_borrows_books[
    all_borrows_books.columns[~all_borrows_books.columns.isin(local_cols)]]
unexceptional_global_books_borrows = unexceptional_borrows_books[
    unexceptional_borrows_books.columns[~unexceptional_borrows_books.columns.isin(local_cols)]]

all_borrows_corr_books_chart = generate_corr_chart(
    global_books_borrows, 'book correlations for all borrows data')
unexceptional_corr_books_chart = generate_corr_chart(
    unexceptional_global_books_borrows, 'book correlations for unexceptional data')

# Bottom row inputs.
local_books_borrows = all_borrows_books[
    all_borrows_books.columns[~all_borrows_books.columns.isin(global_cols)]]
# Bug fix: this was a chained assignment
# (`unexceptional_local_books_borrows = local_books_borrows = ...`) that overwrote
# `local_books_borrows` with the unexceptional frame, so the "all borrows" subset
# chart was actually built from unexceptional data. The two frames are now kept separate.
unexceptional_local_books_borrows = unexceptional_borrows_books[
    unexceptional_borrows_books.columns[~unexceptional_borrows_books.columns.isin(global_cols)]]

all_borrows_corr_books_chart_subset = generate_corr_chart(
    local_books_borrows.loc[local_books_borrows.component == 0],
    'book correlations for all borrows data without disconnected nodes')
unexceptional_corr_books_chart_subset = generate_corr_chart(
    unexceptional_local_books_borrows.loc[unexceptional_local_books_borrows.component == 0],
    'book correlations for unexceptional data without disconnected nodes')

alt.vconcat(
    alt.hconcat(all_borrows_corr_books_chart, unexceptional_corr_books_chart),
    alt.hconcat(all_borrows_corr_books_chart_subset, unexceptional_corr_books_chart_subset),
)


In [41]:
# Labels that identify each book frame in the comparison charts.
df_type, df_type2 = 'all_borrows_books', 'unexceptional_books'
df_type_subset, df_type_subset2 = 'all_borrows_books_subset', 'unexceptional_borrows_books_subset'

# Melted correlations for the book frames without the local-metric columns.
melted_all_borrows_books = get_melted_corr(global_books_borrows, df_type)
melted_unex_borrows_books = get_melted_corr(unexceptional_global_books_borrows, df_type2)

# Melted correlations for the book frames without the global-metric columns, component 0 only.
melted_all_borrows_books_subset = get_melted_corr(
    local_books_borrows[local_books_borrows['component'] == 0],
    df_type_subset,
)
melted_unex_borrows_books_subset = get_melted_corr(
    unexceptional_local_books_borrows[unexceptional_local_books_borrows['component'] == 0],
    df_type_subset2,
)

# All vs. unexceptional, first on the full frames and then on the component-0 subsets.
chart = compare_corr_chart(
    melted_all_borrows_books, melted_unex_borrows_books, df_type, df_type2)
chart1 = compare_corr_chart(
    melted_all_borrows_books_subset, melted_unex_borrows_books_subset,
    df_type_subset, df_type_subset2)

# Optional extra comparisons (full frame vs. its own subset), currently disabled:
# chart2 = compare_corr_chart(melted_all_borrows_books,
#                             melted_all_borrows_books_subset, df_type, df_type_subset)
# chart3 = compare_corr_chart(melted_unex_borrows_books,
#                             melted_unex_borrows_books_subset, df_type2, df_type_subset2)

alt.hconcat(chart, chart1)


In [42]:

# List every member and book that is not in component 0, showing only the
# identifying columns.
summary_columns = ['uri', 'component', 'group', 'borrow_count']
disconnected_items = pd.concat([
    all_borrows_members.loc[all_borrows_members['component'] != 0, summary_columns],
    all_borrows_books.loc[all_borrows_books['component'] != 0, summary_columns],
])

# Display ordered by component, with `group` descending inside each component.
disconnected_items.sort_values(by=['component', 'group'], ascending=[True, False])



Unnamed: 0,uri,component,group,borrow_count
131,du-bos,1,members,4.0
603,rossetti-pre-raphaelite-diaries,1,books,1.0
1664,dupre-italien-dangleterre-poe,1,books,1.0
5130,recollections-dante-gabriel,1,books,1.0
27,bruno-jean,2,members,1.0
5263,wordsworth-prelude,2,books,1.0
123,dent,3,members,1.0
620,seton-lives-hunted-containing,3,books,1.0
156,fitzherbert,4,members,2.0
1363,waley-translations-chinese,4,books,1.0


In [14]:
# Compare how member nodes vary across the metric columns, component 0 only.
local_cols = [col for col in all_borrows_members.columns if 'local' in col]
global_cols = [col for col in all_borrows_members.columns if 'global' in col]

# Metric columns to compare, with the two louvain columns left out.
# NOTE(review): louvain values are presumably community labels rather than scores — confirm.
cols = all_borrows_members[global_cols + local_cols].columns.tolist()
cols.remove('global_louvain')
cols.remove('local_louvain')

# Bug fix: `cols + 'uri'` concatenated a list with a str and raised TypeError;
# wrapping 'uri' in a list selects the metric columns plus the identifier column.
in_main_component = all_borrows_members.component == 0
comparison_df = all_borrows_members.loc[in_main_component, cols + ['uri']]

ranked_exploded, chart = compare_node_variability(comparison_df, cols)
chart


In [15]:
# Compare how book nodes vary across the metric columns, component 0 only.
local_cols = [col for col in all_borrows_books.columns if 'local' in col]
global_cols = [col for col in all_borrows_books.columns if 'global' in col]

# Metric columns to compare, with the two louvain columns left out.
# NOTE(review): louvain values are presumably community labels rather than scores — confirm.
cols = all_borrows_books[global_cols + local_cols].columns.tolist()
cols.remove('global_louvain')
cols.remove('local_louvain')

# Bug fix: `cols + 'uri'` concatenated a list with a str and raised TypeError;
# wrapping 'uri' in a list selects the metric columns plus the identifier column.
in_main_component = all_borrows_books.component == 0
comparison_df = all_borrows_books.loc[in_main_component, cols + ['uri']]

ranked_exploded, chart = compare_node_variability(comparison_df, cols)
chart


Overall, it seems that exceptional data does not radically alter the shape of the library. If we remove it, we are left with the following numbers:

In [16]:

# ---- Events ----
total_events = len(events_df)
total_borrows = len(all_borrows)
total_unexceptional_borrows = len(unexceptional_borrows)
print(f'Total events → {total_events} | total borrows → {total_borrows} | subset that is exceptional → {total_borrows - total_unexceptional_borrows}')

# Row mask: borrow events that have an exceptional type recorded.
# NOTE(review): the member/book counts below are taken from `borrow_events`, while the
# borrow totals above come from `all_borrows` — confirm the different bases are intended.
has_exceptional_type = borrow_events['exceptional_types'].notna()

# ---- Members ----
total_members = members_df['member_id'].nunique()
total_borrowers = members_df.loc[members_df['borrow_count'] > 0, 'member_id'].nunique()
# Members with at least one exceptional borrow ...
members_exceptional = borrow_events.loc[has_exceptional_type, 'member_id'].unique().tolist()
# ... and, among those, the members who also have at least one unexceptional borrow.
also_borrowed_normally = borrow_events['member_id'].isin(members_exceptional) & ~has_exceptional_type
members_unexceptional = borrow_events.loc[also_borrowed_normally, 'member_id'].unique().tolist()
# Despite the name, this is the number of members whose borrows are ALL exceptional
# (the second list is a subset of the first, so the difference is a plain count).
total_unexceptional_borrowers = len(members_exceptional) - len(members_unexceptional)
print(f'Total unique members → {total_members} | total unique borrowers → {total_borrowers} | subset that is only exceptional → {total_unexceptional_borrowers}')

# ---- Books ---- (same logic as members, keyed on item_uri)
total_books = books_df['id'].nunique()
total_borrowed_books = books_df.loc[books_df['borrow_count'] > 0, 'id'].nunique()
books_exceptional = borrow_events.loc[has_exceptional_type, 'item_uri'].unique().tolist()
also_borrowed_normally_books = borrow_events['item_uri'].isin(books_exceptional) & ~has_exceptional_type
books_unexceptional = borrow_events.loc[also_borrowed_normally_books, 'item_uri'].unique().tolist()
# Despite the name, this is the number of books that appear only in exceptional borrows.
total_unexceptional_borrowed_books = len(books_exceptional) - len(books_unexceptional)
print(f'Total unique books → {total_books} | total borrowed books → {total_borrowed_books} | subset that is only exceptional → {total_unexceptional_borrowed_books}')


Total events → 34357 | total borrows → 19332 | subset that is exceptional → 5957
Total unique members → 5601 | total unique borrowers → 598 | subset that is only exceptional → 32
Total unique books → 6018 | total borrowed books → 5681 | subset that is only exceptional → 777
