In [None]:
from network_analysis.generate_baseline_networks import *
from network_analysis.load_datasets import get_updated_shxco_data
import sys
import warnings
from tqdm.notebook import trange, tqdm
import collections
import itertools
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
from pyvis import network as net
from networkx.algorithms.community import greedy_modularity_communities
from networkx.readwrite import json_graph
from networkx.algorithms import bipartite
import networkx as nx
import pandas as pd
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)
# import community
# import nx_altair as nxa
# from node2vec import Node2Vec

warnings.filterwarnings("ignore")

sys.path.append("..")


Exploring the stability of graphs in these groupings:
- compare overall graph density
- compare node metrics
1. All Events **vs** All Events minus Exceptional Ones **vs** All Events minus those of Killen and Raphael France
   1. Bipartite
      1. All time
      2. Seasonal
      3. 1920s vs 1930s
   2. Unipartite
      1. All time
      2. Seasonal
      3. 1920s vs 1930s

Goals:
- identify communities
- identify shape of network and how we want to divide it for analysis
- find out which metrics are most useful for modeling subscribers and book reading habits

#### Baseline datasets

In [None]:
# Baseline 1: every borrow event, copied so that later edits to `all_events`
# never touch the shared `borrow_events` frame.
all_events = borrow_events.copy()

# Baseline 2: only the events whose `exceptional_types` value is missing.
_has_no_exceptional_type = borrow_events.exceptional_types.isna()
unexceptional_events = borrow_events[_has_no_exceptional_type]

# Baseline 3: everything except the members 'killen' and 'raphael-france',
# removed from both the event table and the member table.
_is_rk_event = (borrow_events.member_id == 'killen') | (
    borrow_events.member_id == 'raphael-france')
_is_rk_member = (members_df.member_id == 'killen') | (
    members_df.member_id == 'raphael-france')
no_rk_borrow_events = borrow_events[~_is_rk_event]
no_rk_members_df = members_df[~_is_rk_member]


### Bipartite Comparisons

#### Comparing Across the Entire Lifetime of the Shakespeare and Company Library

In [None]:
# Column mappings handed to get_bipartite_graph: each maps an attribute name
# to the column of the grouped frame that supplies it.
member_attrs = dict(uri='member_id')
book_attrs = dict(uri='item_uri')
edge_attrs = dict(weight='counts')

# For each baseline, count the borrow events per (member, item) pair; the
# count is stored in a `counts` column.
_pair_keys = ['member_id', 'item_uri']
all_events_grouped = (
    all_events.groupby(_pair_keys).size().reset_index(name='counts')
)
unexceptional_events_grouped = (
    unexceptional_events.groupby(_pair_keys).size().reset_index(name='counts')
)
no_rk_events_grouped = (
    no_rk_borrow_events.groupby(_pair_keys).size().reset_index(name='counts')
)


In [None]:
def build_bipartite_graphs(grouped_events_df, member_attrs, book_attrs, edge_attrs, should_process, write_to_file, file_name, sk_metrics, link_metrics, members_df, books_df):
    """Build a bipartite graph from grouped events and tabulate its node metrics.

    Args:
        grouped_events_df: Frame passed to ``get_bipartite_graph`` as the
            source of nodes and edges.
        member_attrs, book_attrs, edge_attrs: Attribute mappings forwarded
            unchanged to ``get_bipartite_graph``.
        should_process: When truthy, the graph is passed through
            ``get_network_metrics`` before being tabulated.
        write_to_file: When truthy, ``write_dataframe`` is called with
            ``file_name`` and the frames it returns are used from then on.
        file_name: Target handed to ``write_dataframe``.
        sk_metrics, link_metrics: Lists of metric names. They are concatenated
            with ``+``, and the matching nodelist columns are renamed to
            ``global_<metric>``.
        members_df: Member table with ``uri`` and ``member_id`` columns.
            NOTE: it is modified in place (see below).
        books_df: Book table joined to the book nodes on ``uri``.

    Returns:
        Tuple ``(graph, nodelist, edgelist, joined_members, joined_books)``.

    Side effects:
        Prints the graph density and progress messages. On the first call for
        a given ``members_df`` the original ``uri`` column is saved as
        ``old_uri`` and ``uri`` is replaced by ``member_id``. Later calls leave
        the frame alone, so ``old_uri`` keeps the original URIs.
    """
    bipartite_graph = get_bipartite_graph(grouped_events_df, member_attrs, book_attrs, edge_attrs)
    # Nodes tagged bipartite == 0 form one node set; everything else is the other.
    top_nodes = {n for n, d in bipartite_graph.nodes(data=True) if d["bipartite"] == 0}
    bottom_nodes = set(bipartite_graph) - top_nodes
    print('graph density: ', bipartite.density(bipartite_graph, bottom_nodes))

    processed_bipartite_graph = (
        get_network_metrics(bipartite_graph, True) if should_process else bipartite_graph
    )

    processed_bipartite_nodelist, processed_bipartite_edgelist = generate_dataframes(processed_bipartite_graph, True)
    print(f"calculating local skmetrics: {' '.join(sk_metrics + link_metrics)}")
    local_nodelist = generate_local_metrics(processed_bipartite_graph,processed_bipartite_nodelist, sk_metrics, link_metrics, True)
    print(f"calculating global skmetrics: {' '.join(sk_metrics)}")
    # Fix: the original passed the undefined name `metrics`; the message above
    # shows the sk metrics are what is computed at this step.
    updated_bipartite_nodelist = generate_sknetwork_metrics(processed_bipartite_edgelist, local_nodelist, sk_metrics)

    print(F"calculating global link metrics: : {' '.join(link_metrics)}")
    # Fix: the original passed the undefined name `is_bipartite`; every other
    # helper call in this function passes True for its trailing flag.
    bipartite_nodelist = generate_link_metrics(processed_bipartite_graph, processed_bipartite_edgelist, updated_bipartite_nodelist, link_metrics, True)

    # Prefix the whole-graph metric columns so they sit next to the local_* ones.
    all_metrics = sk_metrics + link_metrics
    bipartite_nodelist = bipartite_nodelist.rename(
        columns={metric: f'global_{metric}' for metric in all_metrics})

    if write_to_file:
        bipartite_nodelist, bipartite_edgelist = write_dataframe(file_name, processed_bipartite_edgelist, bipartite_nodelist)
    else:
        bipartite_edgelist = processed_bipartite_edgelist

    bipartite_members = bipartite_nodelist[bipartite_nodelist.group == 'members']
    bipartite_books = bipartite_nodelist[bipartite_nodelist.group == 'books']

    # Re-key members by member_id so they join against the node `uri` values.
    # Guarded so that a repeated call on the same frame does not overwrite the
    # saved original URIs with member ids.
    if 'old_uri' not in members_df.columns:
        members_df['old_uri'] = members_df.uri
        members_df.uri = members_df.member_id

    joined_members = combine_dataframes(bipartite_members, members_df, bipartite_members.columns.tolist(), 'uri', 'inner')
    joined_books = combine_dataframes(bipartite_books, books_df, bipartite_books.columns.tolist(), 'uri', 'inner')

    return (processed_bipartite_graph, bipartite_nodelist, bipartite_edgelist, joined_members, joined_books)



In [None]:
## Reload Saved Graphs
def reload_saved_graphs(file_path, members_df, books_df):
    """Load a previously saved graph with its node and edge tables.

    Args:
        file_path: Path prefix; ``_graph.gml``, ``_nodelist.csv`` and
            ``_edgelist.csv`` are appended to locate the three files.
        members_df: Member table with ``uri`` and ``member_id`` columns.
            NOTE: it is modified in place (see below).
        books_df: Book table joined to the book nodes on ``uri``.

    Returns:
        Tuple ``(graph, nodelist, edgelist, joined_members, joined_books)``.

    Side effects:
        On the first call for a given ``members_df`` the original ``uri``
        column is saved as ``old_uri`` and ``uri`` is replaced by
        ``member_id``. Later calls leave the frame alone. Without that guard,
        reloading several graphs against the same member table overwrote
        ``old_uri`` with member ids and lost the original URIs.
    """
    bipartite_graph = nx.read_gml(f'{file_path}_graph.gml')
    bipartite_nodelist = pd.read_csv(f'{file_path}_nodelist.csv')
    bipartite_edgelist = pd.read_csv(f'{file_path}_edgelist.csv')

    bipartite_members = bipartite_nodelist[bipartite_nodelist.group == 'members']
    bipartite_books = bipartite_nodelist[bipartite_nodelist.group == 'books']

    # Re-key members by member_id so they join against the node `uri` values.
    # Do it only once per frame so the saved original URIs survive.
    if 'old_uri' not in members_df.columns:
        members_df['old_uri'] = members_df.uri
        members_df.uri = members_df.member_id

    joined_members = combine_dataframes(
        bipartite_members, members_df, bipartite_members.columns.tolist(), 'uri', 'inner')
    joined_books = combine_dataframes(
        bipartite_books, books_df, bipartite_books.columns.tolist(), 'uri', 'inner')

    return (bipartite_graph, bipartite_nodelist, bipartite_edgelist, joined_members, joined_books)


# Reload the saved bipartite graph, node/edge tables and joined member/book
# tables for each of the three baselines.
(
    all_events_bipartite_graph,
    all_events_bipartite_nodelist,
    all_events_bipartite_edgelist,
    all_events_members,
    all_events_books,
) = reload_saved_graphs('./data/all_events_bipartite', members_df, books_df)

(
    unexceptional_bipartite_graph,
    unexceptional_bipartite_nodelist,
    unexceptional_bipartite_edgelist,
    unexceptional_members,
    unexceptional_books,
) = reload_saved_graphs('./data/unexceptional_bipartite', members_df, books_df)

(
    no_rk_bipartite_graph,
    no_rk_bipartite_nodelist,
    no_rk_bipartite_edgelist,
    no_rk_members,
    no_rk_books,
) = reload_saved_graphs('./data/no_rk_bipartite', members_df, books_df)


In [None]:
# Metric columns to correlate; the order here fixes the row/column order of
# every correlation matrix below.
columns = [
    'global_degree', 'local_degree',
    'global_clustering', 'local_clustering',
    'global_closeness', 'local_closeness',
    'global_betweenness', 'local_betweenness',
    'local_katz', 'local_hits', 'local_cohits',
    'global_katz', 'global_hits', 'global_cohits',
]

# Pairwise correlations between the metrics, per baseline, for book nodes...
all_events_books_corr = all_events_books[columns].corr()
unexceptional_books_corr = unexceptional_books[columns].corr()
no_rk_books_corr = no_rk_books[columns].corr()

# ...and for member nodes.
all_events_members_corr = all_events_members[columns].corr()
unexceptional_members_corr = unexceptional_members[columns].corr()
no_rk_members_corr = no_rk_members[columns].corr()


def generate_corr_chart(corr_df, title):
    """Render a correlation matrix as an annotated Altair heatmap.

    Args:
        corr_df: Square correlation frame (as produced by ``DataFrame.corr``)
            whose index labels name the rows. It is not modified.
        title: Chart title.

    Returns:
        A layered chart: coloured rectangles with the value printed on top.
    """
    # Fold only the metric columns. A leftover 'cat' label column is skipped so
    # that row labels are never plotted as if they were a metric.
    pivot_cols = [col for col in corr_df.columns if col != 'cat']
    # Work on a copy: adding 'cat' to the caller's frame made a second call on
    # the same frame fold 'cat' in as an extra metric.
    chart_data = corr_df.assign(cat=corr_df.index)

    base = alt.Chart(chart_data).transform_fold(pivot_cols).encode(
        x="cat:N",  y='key:N').properties(height=300, width=300, title=title)
    boxes = base.mark_rect().encode(color=alt.Color(
        "value:Q", scale=alt.Scale(scheme="redyellowblue")))
    labels = base.mark_text(size=5, color="grey").encode(
        text=alt.Text("value:Q", format="0.1f"))
    return boxes + labels

In [None]:
# One member-metric correlation heatmap per baseline, shown side by side.
all_events_corr_members_chart = generate_corr_chart(
    all_events_members_corr, 'member correlations for all events data')
unexceptional_corr_members_chart = generate_corr_chart(
    unexceptional_members_corr, 'member correlations for unexceptional data')
no_rk_corr_members_chart = generate_corr_chart(
    no_rk_members_corr, 'member correlations for no rk data')

alt.hconcat(
    all_events_corr_members_chart,
    unexceptional_corr_members_chart,
    no_rk_corr_members_chart,
)


In [None]:
# One book-metric correlation heatmap per baseline, shown side by side.
all_events_corr_books_chart = generate_corr_chart(
    all_events_books_corr, 'book correlations for all events data')
unexceptional_corr_books_chart = generate_corr_chart(
    unexceptional_books_corr, 'book correlations for unexceptional data')
no_rk_corr_books_chart = generate_corr_chart(
    no_rk_books_corr, 'book correlations for no rk data')

alt.hconcat(
    all_events_corr_books_chart,
    unexceptional_corr_books_chart,
    no_rk_corr_books_chart,
)


### Unipartite Comparisons

#### Comparing Across the Entire Lifetime of the Shakespeare and Company Library

In [None]:
# Rebuild the all-events bipartite graph from the grouped counts.
bipartite_graph = get_bipartite_graph(
    all_events_grouped, member_attrs, book_attrs, edge_attrs)

# Split the nodes by their `group` attribute.
member_nodes = [
    node for node, attrs in bipartite_graph.nodes(data=True)
    if attrs['group'] == 'members'
]
book_nodes = [
    node for node, attrs in bipartite_graph.nodes(data=True)
    if attrs['group'] == 'books'
]

# Collapse the bipartite graph onto each node set as a weighted projection.
projected_members_graph = bipartite.weighted_projected_graph(
    bipartite_graph, member_nodes)
projected_books_graph = bipartite.weighted_projected_graph(
    bipartite_graph, book_nodes)


In [None]:
# Tabulate the bipartite graph as a node table and an edge table.
# NOTE(review): the second argument is presumably an "is bipartite" flag, as in
# the calls inside build_bipartite_graphs; confirm against generate_dataframes.
nodelist, edgelist = generate_dataframes(bipartite_graph, True)


In [None]:
# Compute PageRank over the bipartite graph. The result is not stored: it is
# left as the cell's last expression so the notebook displays it.
nx.pagerank(bipartite_graph)


In [None]:
# Build a member-only graph from the raw borrow events.
# NOTE(review): the return value is discarded, so create_unipartite_network
# presumably fills `members_graph` in place; confirm against the helper.
members_graph = nx.Graph()
node_col = 'member_id'
edge_col = 'item_uri'
node_attrs = dict()
# NOTE: this rebinds the notebook-level `edge_attrs` used earlier for the
# bipartite graphs.
edge_attrs = dict(item_uri='item_uri', weight='counts')
create_unipartite_network(
    borrow_events, members_graph, node_attrs, edge_attrs, node_col, edge_col)
