In [8]:
import pandas as pd
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)
import networkx as nx
from networkx.algorithms import bipartite
# import community
from networkx.readwrite import json_graph
# import nx_altair as nxa
from networkx.algorithms.community import greedy_modularity_communities
from pyvis import network as net
# from node2vec import Node2Vec
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import itertools
import collections
from tqdm.notebook import trange, tqdm

import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append("..")
from network_analysis.load_datasets import get_updated_shxco_data
from network_analysis.generate_network_metrics import *
from network_analysis.create_networks import *
from network_analysis.read_write_networks import * 

Exploring the stability of graphs in these groupings:
- compare overall graph density
- compare node metrics
1. All Events **vs** All Events minus Exceptional Ones **vs** All Events minus those of Killen and Raphael France
   1. Bipartite
      1. All time
      2. Seasonal
      3. 1920s vs 1930s
   2. Unipartite
      1. All time
      2. Seasonal
      3. 1920s vs 1930s

Goals:
- identify communities
- identify shape of network and how we want to divide it for analysis
- find out which metrics are most useful for modeling subscribers and book reading habits

#### Baseline datasets

In [9]:
# Load the four project tables returned by the loader. The names reflect the unpacking
# order: members, books, borrow events, and the full events table (events_df is not used
# again in the cells visible here).
members_df, books_df, borrow_events, events_df = get_updated_shxco_data()

In [10]:
# Baseline 1: every borrow event, copied so later edits never touch `borrow_events`.
all_events = borrow_events.copy()

# Baseline 2: only the borrow events whose `exceptional_types` value is missing.
lacks_exception_flag = borrow_events.exceptional_types.isna()
unexceptional_events = borrow_events[lacks_exception_flag]

# Baseline 3: drop the members 'killen' and 'raphael-france' from both the
# borrow events and the member table.
excluded_member_ids = ['killen', 'raphael-france']
no_rk_borrow_events = borrow_events[~borrow_events.member_id.isin(excluded_member_ids)]
no_rk_members_df = members_df[~members_df.member_id.isin(excluded_member_ids)]


### Bipartite Comparisons

#### Comparing Across the Entire Lifetime of the Shakespeare and Company (S&Co) Library

In [11]:
# Settings handed to the graph builder.
# NOTE(review): the *_attrs dicts presumably map node/edge attribute names to source
# columns -- confirm in network_analysis.create_networks.
member_attrs = dict(uri='member_id')
book_attrs = dict(uri='item_uri')
edge_attrs = dict(weight='counts')

# One row per (member, item) pair, with the number of borrow events for that pair.
pair_keys = ['member_id', 'item_uri']
all_events_grouped = all_events.groupby(pair_keys).size().reset_index(name='counts')
unexceptional_events_grouped = unexceptional_events.groupby(pair_keys).size().reset_index(name='counts')
no_rk_events_grouped = no_rk_borrow_events.groupby(pair_keys).size().reset_index(name='counts')

should_process = True
write_to_file = True
sk_metrics = ['katz', 'louvain']
link_metrics = ['HITS', 'CoHITS', 'BiRank', 'BGRM']

# All borrow events.
(
    all_events_bipartite_graph,
    all_events_bipartite_nodelist,
    all_events_bipartite_edgelist,
    all_events_members,
    all_events_books,
) = check_reload_build_bipartite_graphs(
    all_events_grouped, member_attrs, book_attrs, edge_attrs,
    should_process, write_to_file, './data/all_events_bipartite',
    sk_metrics, link_metrics, members_df, books_df,
)

# Borrow events without an exceptional flag.
(
    unexceptional_bipartite_graph,
    unexceptional_bipartite_nodelist,
    unexceptional_bipartite_edgelist,
    unexceptional_members,
    unexceptional_books,
) = check_reload_build_bipartite_graphs(
    unexceptional_events_grouped, member_attrs, book_attrs, edge_attrs,
    should_process, write_to_file, './data/unexceptional_bipartite',
    sk_metrics, link_metrics, members_df, books_df,
)

# Borrow events without 'killen' and 'raphael-france'.
# NOTE(review): the full `members_df` is passed here rather than `no_rk_members_df` -- confirm.
(
    no_rk_bipartite_graph,
    no_rk_bipartite_nodelist,
    no_rk_bipartite_edgelist,
    no_rk_members,
    no_rk_books,
) = check_reload_build_bipartite_graphs(
    no_rk_events_grouped, member_attrs, book_attrs, edge_attrs,
    should_process, write_to_file, './data/no_rk_bipartite',
    sk_metrics, link_metrics, members_df, books_df,
)


reloading saved graph: ./data/all_events_bipartite
reloading saved graph: ./data/unexceptional_bipartite
reloading saved graph: ./data/no_rk_bipartite


In [12]:
# Metric columns are those whose name mentions 'local' or 'global'. The list is taken
# from the no-rk books frame and reused for every other frame below.
columns = [
    name for name in no_rk_books.columns.to_list()
    if 'local' in name or 'global' in name
]

# Pairwise correlations between the metric columns -- books first ...
no_rk_books_corr = no_rk_books[columns].corr()
all_events_books_corr = all_events_books[columns].corr()
unexceptional_books_corr = unexceptional_books[columns].corr()

# ... then members.
no_rk_members_corr = no_rk_members[columns].corr()
all_events_members_corr = all_events_members[columns].corr()
unexceptional_members_corr = unexceptional_members[columns].corr()


def generate_corr_chart(corr_df, title):
    """Build a layered Altair heatmap (coloured cells plus text labels) for a correlation frame.

    Side effect: a 'cat' column holding the index values is written into
    ``corr_df`` itself, so the caller's frame is modified.

    Args:
        corr_df: DataFrame whose columns are folded into key/value pairs.
        title: Title shown above the chart.

    Returns:
        The rect layer and the text layer combined with ``+``.
    """
    # Capture the column names before 'cat' is added so it is not folded too.
    folded_columns = list(corr_df.columns)
    # Expose the index as a regular column so it can be encoded on the x axis.
    corr_df['cat'] = corr_df.index

    folded = alt.Chart(corr_df).transform_fold(folded_columns)
    base = folded.encode(x="cat:N", y='key:N').properties(
        height=300, width=300, title=title)

    colour_scale = alt.Scale(scheme="redyellowblue")
    heat_cells = base.mark_rect().encode(
        color=alt.Color("value:Q", scale=colour_scale))
    value_text = base.mark_text(size=5, color="grey").encode(
        text=alt.Text("value:Q", format="0.1f"))
    return heat_cells + value_text

In [13]:
# One member-metric heatmap per dataset variant. Each call also adds a 'cat'
# column to the correlation frame it is given.
unexceptional_corr_members_chart = generate_corr_chart(unexceptional_members_corr, 'member correlations for unexceptional data')
no_rk_corr_members_chart = generate_corr_chart(no_rk_members_corr, 'member correlations for no rk data')
all_events_corr_members_chart = generate_corr_chart(all_events_members_corr, 'member correlations for all events data')

# Side by side: all events | unexceptional | no rk.
alt.hconcat(
    all_events_corr_members_chart,
    unexceptional_corr_members_chart,
    no_rk_corr_members_chart,
)


In [14]:
# Long-form view of the member correlation matrix: one row per (cat, variable) pair.
# generate_corr_chart() adds a 'cat' column to the frame as a side effect. Rebuild that
# column from the index here so the table is the same whether or not the chart cell ran.
cor_cols = [c for c in all_events_members_corr.columns.to_list() if c != 'cat']
pd.melt(
    all_events_members_corr[cor_cols].rename_axis('cat').reset_index(),
    id_vars=['cat'],
    value_vars=cor_cols,
)


Unnamed: 0,cat,variable,value
0,global_degree,global_degree,1.000000
1,local_degree,global_degree,0.049005
2,global_clustering,global_degree,-0.181214
3,local_clustering,global_degree,-0.181214
4,global_closeness,global_degree,0.129270
...,...,...,...
395,global_louvain,global_BGRM,0.474028
396,global_HITS,global_BGRM,-0.011875
397,global_CoHITS,global_BGRM,-0.045232
398,global_BiRank,global_BGRM,0.025200


In [15]:
# One book-metric heatmap per dataset variant. Each call also adds a 'cat'
# column to the correlation frame it is given.
unexceptional_corr_books_chart = generate_corr_chart(unexceptional_books_corr, 'book correlations for unexceptional data')
no_rk_corr_books_chart = generate_corr_chart(no_rk_books_corr, 'book correlations for no rk data')
all_events_corr_books_chart = generate_corr_chart(all_events_books_corr, 'book correlations for all events data')

# Side by side: all events | unexceptional | no rk.
alt.hconcat(
    all_events_corr_books_chart,
    unexceptional_corr_books_chart,
    no_rk_corr_books_chart,
)


In [16]:
# Row counts of the member frames: (all events, unexceptional, no rk).
tuple(len(frame) for frame in (all_events_members, unexceptional_members, no_rk_members))

(536, 508, 534)

In [17]:
def _prefix_metric_columns(frame, metric_columns, prefix):
    """Return a renamed copy of ``frame`` with its metric columns tagged by dataset.

    In every name listed in ``metric_columns``, 'local' becomes '<prefix>_local'
    and 'global' becomes '<prefix>_global'. All names are renamed in a single
    ``rename`` call instead of one full-frame rename per column.
    """
    renames = {
        name: name.replace('local', prefix + '_local').replace('global', prefix + '_global')
        for name in metric_columns
    }
    return frame.rename(columns=renames)


# Tag each variant's metric columns so the three frames can be merged side by side.
members = _prefix_metric_columns(all_events_members, columns, 'all_events')
members_unex = _prefix_metric_columns(unexceptional_members, columns, 'unexceptional')
members_no_rk = _prefix_metric_columns(no_rk_members, columns, 'no_rk')

In [18]:
# Join keys are the columns that are NOT per-dataset metrics, i.e. everything except
# the 'local'/'global' metric columns and 'component'.
# Fix: the original test `'component' != c` was true for almost every column, so the
# set difference left at most 'component' as the only join key. Merging on that single
# low-cardinality column is a many-to-many join -- likely the cause of the dead kernel
# recorded further down.
metric_columns = {
    c for c in members.columns
    if ('local' in c) or ('global' in c) or (c == 'component')
}
# A list comprehension (rather than a set difference) keeps the frame's column order.
joined_columns = [c for c in members.columns if c not in metric_columns]

In [None]:
# Outer-join the three member frames on the shared (non-metric) columns.
# Fix: the second merge used to be stored under the typo `merged_membs`, so
# `merged_members` never received the no_rk metric columns that later cells read.
merged_members = (
    members
    .merge(members_unex, on=joined_columns, how='outer')
    .merge(members_no_rk, on=joined_columns, how='outer')
)


In [None]:
# Spot check: merged rows for the member whose uri is 'fitzherbert'.
merged_members.loc[merged_members.uri == 'fitzherbert']


Error: Kernel is dead

In [None]:
# Degree columns (plus 'uri') for the members that have an `exceptional_types` value.
subset_cols = [name for name in merged_members.columns if 'degree' in name]
merged_members.loc[merged_members.exceptional_types.notna(), subset_cols + ['uri']]


In [None]:
# Spot check: the same member in the all-events frame, matched on `member_id` here.
all_events_members.loc[all_events_members.member_id == 'fitzherbert']


In [None]:
# Which member uris appear in only one of the two variants?
members_all = all_events_members.uri.unique().tolist()

# Fix: this list used to be stored in `members_unex`, overwriting the DataFrame of the
# same name built earlier and breaking any re-run of the merge cells.
unexceptional_member_uris = unexceptional_members.uri.unique().tolist()
print(set(members_all) - set(unexceptional_member_uris), set(unexceptional_member_uris) - set(members_all))

In [None]:
# Append '_all_events' to every column name of the all-events member frame.
# A single add_suffix() replaces the per-column rename loop, which copied the frame once per column.
# NOTE: this rebinds `all_events_members`, so re-running the cell appends the suffix
# again, and earlier cells that read the unsuffixed columns need a fresh frame.
cols = all_events_members.columns.to_list()
df_type = 'all_events'
all_events_members = all_events_members.add_suffix('_' + df_type)

In [None]:
# Display the all-events member frame. Its columns carry the '_all_events' suffix once
# the preceding renaming cell has run.
all_events_members

### Unipartite Comparisons

#### Comparing Across the Entire Lifetime of the Shakespeare and Company (S&Co) Library

In [None]:
# Settings handed to the unipartite graph builder.
# NOTE(review): the *_attrs dicts presumably map node/edge attribute names to source
# columns -- confirm in network_analysis.create_networks.
member_attrs = dict(uri='member_id')
book_attrs = dict(uri='item_uri')
edge_attrs = dict(weight='counts')
node_attrs = dict()

# One row per (member, item) pair, with the number of borrow events for that pair.
pair_keys = ['member_id', 'item_uri']
all_events_grouped = all_events.groupby(pair_keys).size().reset_index(name='counts')
unexceptional_events_grouped = unexceptional_events.groupby(pair_keys).size().reset_index(name='counts')
no_rk_events_grouped = no_rk_borrow_events.groupby(pair_keys).size().reset_index(name='counts')

should_process = True
write_to_file = True
sk_metrics = ['katz', 'louvain']
link_metrics = ['pagerank', 'hubs', 'auth']
is_projected = True

# Projected graphs built from all borrow events.
(
    projected_members_graph,
    projected_members_nodelist,
    projected_members_edgelist,
    projected_books_graph,
    projected_books_nodelist,
    projected_books_edgelist,
    projected_members,
    projected_books,
) = check_reload_build_unipartite_graphs(
    all_events_grouped, all_events, member_attrs, book_attrs, edge_attrs, node_attrs,
    should_process, write_to_file, './data/all_events_unipartite_projected',
    sk_metrics, link_metrics, members_df, books_df, is_projected,
)

# Projected graphs built from the unexceptional borrow events.
(
    unexceptional_projected_members_graph,
    unexceptional_projected_members_nodelist,
    unexceptional_projected_members_edgelist,
    unexceptional_projected_books_graph,
    unexceptional_projected_books_nodelist,
    unexceptional_projected_books_edgelist,
    unexceptional_projected_members,
    unexceptional_projected_books,
) = check_reload_build_unipartite_graphs(
    unexceptional_events_grouped, unexceptional_events, member_attrs, book_attrs, edge_attrs, node_attrs,
    should_process, write_to_file, './data/unexceptional_events_unipartite_projected',
    sk_metrics, link_metrics, members_df, books_df, is_projected,
)


In [None]:
# Metric columns to correlate for the unipartite node lists. The order is kept
# because it fixes the row/column layout of each correlation matrix.
columns = [
    'global_pagerank', 'global_hubs', 'global_auth',
    'local_pagerank', 'local_hubs', 'local_auth',
    'global_degree', 'local_degree',
    'global_eigenvector', 'local_eigenvector',
    'global_closeness', 'local_closeness',
    'global_betweenness', 'local_betweenness',
    'global_clustering', 'local_clustering',
    'global_graph_radius', 'global_diameter',
    'local_graph_radius', 'local_diameter',
    'local_katz', 'local_louvain',
    'global_katz', 'global_louvain',
]

# Graphs built from all events.
members_nodelist_corr = projected_members_nodelist[columns].corr()
books_nodelist_corr = projected_books_nodelist[columns].corr()

# Graphs built from the unexceptional events.
unexceptional_projected_members_nodelist_corr = unexceptional_projected_members_nodelist[columns].corr()
unexceptional_projected_books_nodelist_corr = unexceptional_projected_books_nodelist[columns].corr()

def generate_corr_chart(corr_df, title):
    """Build a layered Altair heatmap (coloured cells plus text labels) for a correlation frame.

    This re-declares the function of the same name defined earlier in the
    notebook; the later definition is the one in effect from here on.

    Side effect: a 'cat' column holding the index values is written into
    ``corr_df`` itself, so the caller's frame is modified.

    Args:
        corr_df: DataFrame whose columns are folded into key/value pairs.
        title: Title shown above the chart.

    Returns:
        The rect layer and the text layer combined with ``+``.
    """
    # Capture the column names before 'cat' is added so it is not folded too.
    folded_columns = list(corr_df.columns)
    # Expose the index as a regular column so it can be encoded on the x axis.
    corr_df['cat'] = corr_df.index

    folded = alt.Chart(corr_df).transform_fold(folded_columns)
    base = folded.encode(x="cat:N", y='key:N').properties(
        height=300, width=300, title=title)

    colour_scale = alt.Scale(scheme="redyellowblue")
    heat_cells = base.mark_rect().encode(
        color=alt.Color("value:Q", scale=colour_scale))
    value_text = base.mark_text(size=5, color="grey").encode(
        text=alt.Text("value:Q", format="0.1f"))
    return heat_cells + value_text


In [None]:

# Heatmaps for the projected unipartite graphs. Each call also adds a 'cat'
# column to the correlation frame it is given.
members_nodelist_chart = generate_corr_chart(
    members_nodelist_corr, 'unipartite members all data correlations')
books_nodelist_chart = generate_corr_chart(
    books_nodelist_corr, 'unipartite books all data correlations')
unexceptional_projected_members_nodelist_chart = generate_corr_chart(
    unexceptional_projected_members_nodelist_corr,
    'unipartite unexceptional_projected_members all data correlations')
unexceptional_projected_books_nodelist_chart = generate_corr_chart(
    unexceptional_projected_books_nodelist_corr,
    'unipartite unexceptional_projected_books all data correlations')

# Members (all, unexceptional) followed by books (all, unexceptional).
alt.hconcat(
    members_nodelist_chart,
    unexceptional_projected_members_nodelist_chart,
    books_nodelist_chart,
    unexceptional_projected_books_nodelist_chart,
)


In [None]:
# Unprojected unipartite graphs for ALL borrow events.
# Fix: the original call passed the unexceptional subset, while the result names and the
# cache path ('all_events_unipartite_unprojected') both refer to the full event set.
# NOTE(review): a file already written to this path may have been built from the
# unexceptional subset -- remove it before relying on reloaded results.
is_projected = False
(
    unprojected_members_graph,
    unprojected_members_nodelist,
    unprojected_members_edgelist,
    unprojected_books_graph,
    unprojected_books_nodelist,
    unprojected_books_edgelist,
    unprojected_members,
    unprojected_books,
) = check_reload_build_unipartite_graphs(
    all_events_grouped, all_events, member_attrs, book_attrs, edge_attrs, node_attrs,
    should_process, write_to_file, './data/all_events_unipartite_unprojected',
    sk_metrics, link_metrics, members_df, books_df, is_projected,
)


In [None]:
# Display the member node list returned by the unprojected unipartite build.
unprojected_members_nodelist


### Temporal Network Comparisons

#### Bipartite Comparisons

In [None]:
# Build one bipartite graph per (year, season) and collect the resulting member and
# book metric frames, each tagged with its season name.
# NOTE(review): seasons are filtered within a single calendar year, so 'Winter' groups
# December with January/February of the SAME year -- confirm this is intended.
# NOTE(review): only the season is recorded on each frame; the year is not.
# NOTE(review): every iteration passes the path './data/all_events_bipartite'; if the
# builder reloads an existing file at that path, the seasonal subset would be ignored --
# confirm against network_analysis.read_write_networks.
years = all_events.year.unique().tolist()
books_dfs = []
members_dfs = []
month_ranges = [{'Winter': [12, 1, 2]}, {'Spring': [3, 4, 5]},
                {'Summer': [6, 7, 8]}, {'Fall': [9, 10, 11]}]

# Builder settings do not change between iterations, so set them once.
member_attrs = {'uri': 'member_id'}
book_attrs = {'uri': 'item_uri'}
edge_attrs = {'weight': 'counts'}
should_process = True
write_to_file = False
sk_metrics = ['katz', 'louvain']
link_metrics = ['HITS', 'CoHITS', 'BiRank', 'BGRM']

for year in tqdm(years):
    for month in month_ranges:
        # Each entry is a single-item dict: {season name: [month numbers]}.
        season_name, season_months = list(month.items())[0]

        # Borrow counts per (member, item) pair for this year and season only.
        in_season = (all_events.year == year) & (all_events.month.isin(season_months))
        all_events_grouped = all_events[in_season].groupby(['member_id', 'item_uri']).size().reset_index(name='counts')

        all_events_bipartite_graph, all_events_bipartite_nodelist, all_events_bipartite_edgelist, all_events_members, all_events_books = check_reload_build_bipartite_graphs(all_events_grouped, member_attrs, book_attrs, edge_attrs, should_process, write_to_file, './data/all_events_bipartite', sk_metrics, link_metrics, members_df, books_df)

        # Fix: the books frame was referenced as `all_event_books`, an undefined name
        # that raised NameError on the first iteration.
        all_events_books['seasons'] = season_name
        all_events_members['seasons'] = season_name
        books_dfs.append(all_events_books)
        members_dfs.append(all_events_members)
        
