In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)
import networkx as nx
from networkx.algorithms import bipartite
# import community
from networkx.readwrite import json_graph
# import nx_altair as nxa
from networkx.algorithms.community import greedy_modularity_communities
from pyvis import network as net
# from node2vec import Node2Vec
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import itertools
import collections
from tqdm.notebook import trange, tqdm
tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")
from IPython.display import display, Markdown, HTML
import sys
sys.path.append("..")
from bigraph.predict import pa_predict, jc_predict, cn_predict,aa_predict, katz_predict
from bigraph.evaluation import evaluation
from network_analysis.birankpy import BipartiteNetwork
from network_analysis.load_datasets import get_updated_shxco_data
from network_analysis.generate_network_metrics import *
from network_analysis.create_networks import *
from network_analysis.read_write_networks import * 
from network_analysis.link_prediction import * 
# Load the four project data frames used throughout the notebook.
# get_updated_shxco_data comes from network_analysis.load_datasets; it is called
# here with get_subscription=False (the effect of that flag is not visible in this notebook).
members_df, books_df, borrow_events, events_df = get_updated_shxco_data(get_subscription=False)


In [2]:
from dateutil.relativedelta import relativedelta

# Six-month calendar offset. In the visible cells it is referenced only from
# commented-out code (circulation_start / relative_date=sixmonths).
sixmonths = relativedelta(months=6)
sixmonths


relativedelta(months=+6)

In [3]:
# Load the partial-borrowers table, mirror its row index into a regular
# column, and preview the first record.
partial_df = pd.read_csv(
    '../dataset_generator/data/partial_borrowers.csv'
).assign(index_col=lambda frame: frame.index)
partial_df.iloc[:1]



Unnamed: 0,member_id,subscription_start,subscription_end,known_borrows,index_col
0,martin-maud,1923-10-17,1923-11-17,36,0


In [4]:
# Distinct member ids present in the partial-borrowers table.
partial_members = pd.unique(partial_df['member_id']).tolist()

# Parse the subscription dates so they can be compared against event dates;
# values that cannot be parsed become NaT (errors='coerce').
for raw_column, parsed_column in (
        ('subscription_start', 'subscription_starttime'),
        ('subscription_end', 'subscription_endtime')):
    partial_df[parsed_column] = pd.to_datetime(
        partial_df[raw_column], errors='coerce')

# all_events = events_df[events_df.item_uri.isna() == False].copy()

# Keep only borrow events that carry both a start and an end date, then take
# the subset that started before 1942.
has_both_dates = (
    borrow_events.start_datetime.notna() & borrow_events.end_datetime.notna())
borrow_events = borrow_events[has_both_dates]
all_borrows = borrow_events.loc[
    borrow_events.start_datetime < '1942-01-01'].copy()


### Bipartite Link Predictions for All Event Types

In [5]:
# Column mappings handed to the graph builder: which column identifies a
# member node, a book node, and the edge weight.
member_attrs = dict(uri='member_id')
book_attrs = dict(uri='item_uri')
edge_attrs = dict(weight='counts')

# Builder options and the metric names to compute.
should_process = True
write_to_file = True
sk_metrics = ['katz', 'louvain']
link_metrics = ['HITS', 'CoHITS', 'BiRank', 'BGRM']

# One row per (member, book) pair with the number of borrow events as `counts`.
all_borrows_grouped = (
    all_borrows
    .groupby(['member_id', 'item_uri'])
    .size()
    .reset_index(name='counts')
)

(all_borrows_bipartite_graph,
 all_borrows_bipartite_nodelist,
 all_borrows_bipartite_edgelist,
 all_borrows_members,
 all_borrows_books) = check_reload_build_bipartite_graphs(
    all_borrows_grouped, member_attrs, book_attrs, edge_attrs,
    should_process, write_to_file, './data/borrow_events_bipartite',
    sk_metrics, link_metrics, members_df, books_df)


reloading saved graph: ./data/borrow_events_bipartite


In [6]:
# Drop every node whose `component` label is not 0 from the graph
# (presumably component 0 is the main connected component -- TODO confirm).
outside_component_zero = all_borrows_bipartite_nodelist.component != 0
remove = all_borrows_bipartite_nodelist.loc[
    outside_component_zero, 'uri'].tolist()
all_borrows_bipartite_graph.remove_nodes_from(remove)

In [7]:
# all_preds = get_bipartite_link_predictions(all_borrows_bipartite_graph)


In [8]:
# Names of the four link-prediction score columns (jc, pa, cn, aa).
metrics = [f'{method}_prediction' for method in ('jc', 'pa', 'cn', 'aa')]
# all_preds[metrics].corr()

In [9]:
# Map each book URI to its title.
title_lookup = dict(zip(books_df['uri'], books_df['title']))

In [10]:
# output_path = './data/partial_members_bipartite_network_all_events_link_predictions.csv'
# if os.path.exists(output_path):
#     os.remove(output_path)
# partial_df.progress_apply(get_full_predictions, axis=1, number_of_results=10, limit_to_circulation=True, predictions_df=all_preds, events_df=borrow_events, relative_date=sixmonths, predict_group='books', metrics=metrics, output_path=output_path)


In [11]:
# processed_predictions = pd.read_csv(output_path)


### Circulation Specific Bipartite Link Predictions

In [13]:
# Start from a clean output file: delete any CSV left over from an earlier run
# (presumably get_specific_predictions appends to this path -- TODO confirm).
output_path = './data/partial_members_bipartite_circulation_events_predictions.csv'
if os.path.exists(output_path):
    os.remove(output_path)

# Earliest borrow start date in the data, used as the reference date.
start_library = borrow_events.sort_values(
    by=['start_datetime']).start_datetime.values[0]

prediction_options = dict(
    number_of_results=10,
    limit_to_circulation=True,
    events_df=events_df,
    borrow_events=borrow_events,
    members_df=members_df,
    books_df=books_df,
    relative_date=start_library,
    predict_group='books',
    output_path=output_path,
)
# Only the first partial member is processed here.
partial_df[0:1].apply(get_specific_predictions, axis=1, **prediction_options)


Processing martin-maud with subscription 1923-10-17
building graph: test2
connected? False
bipartite? True
[940, 3]
graph density:  0.023019996850889624
calculating global degree
calculating top global clustering
calculating bottom global clustering
calculating global closeness
calculating global closeness


  0%|          | 0/2 [00:00<?, ?it/s]

calculating local skmetrics: katz louvain HITS CoHITS BiRank BGRM
component 1 - size 940 - graph density  0.023361495135688685
calculating katz
component 2 - size 3 - graph density  1.0
calculating katz
calculating global skmetrics: katz louvain
calculating katz
calculating global link metrics: : HITS CoHITS BiRank BGRM
Running jaccard link prediction
Jaccard prediction starting...
Jaccard Executed in 0.3634817600250244 seconds 

Running preferential attachment link prediction
Preferential_attachment prediction starting...
Preferential attachment Executed in 0.356295108795166 seconds 

Running common neighbors link prediction
Common neighbor prediction starting...
Common neighbours Executed in 0.27409791946411133 seconds 

Running adamic adar link prediction
Adamic_adar prediction starting...
Adamic-adar Executed in 0.4678828716278076 seconds 



0    None
dtype: object

In [14]:
# Read back the CSV at output_path (the path that was cleared and then handed to
# get_specific_predictions) and display it.
predictions = pd.read_csv(output_path)
predictions

Unnamed: 0,jc_prediction,jc_prediction_scores,pa_prediction,pa_prediction_scores,cn_prediction,cn_prediction_scores,aa_prediction,aa_prediction_scores,member_id,subscription_starttime,subscription_endtime,known_borrows
0,joyce-portrait-artist-young,0.294118,joyce-exiles,320,joyce-portrait-artist-young,10.0,joyce-portrait-artist-young,2.991369,martin-maud,1923-10-17,1923-11-17,36
1,joyce-exiles,0.257143,joyce-portrait-artist-young,320,joyce-exiles,9.0,joyce-exiles,2.3912,martin-maud,1923-10-17,1923-11-17,36
2,lewis-main-street-story,0.194444,lewis-main-street-story,288,lewis-main-street-story,7.0,lewis-main-street-story,1.853903,martin-maud,1923-10-17,1923-11-17,36
3,squire-london-mercury,0.055556,squire-london-mercury,128,squire-london-mercury,2.0,squire-london-mercury,0.503488,martin-maud,1923-10-17,1923-11-17,36


In [12]:
# Step-by-step run of the circulation-specific pipeline for the first partial
# member. Every intermediate object is kept in the notebook namespace so the
# inspection cells can look at the graph, node list and predictions.
member_attrs = {'uri': 'member_id'}
book_attrs = {'uri': 'item_uri'}
edge_attrs = {'weight': 'counts'}
should_process = True
write_to_file = False
sk_metrics = ['katz', 'louvain']
link_metrics = ['HITS', 'CoHITS', 'BiRank', 'BGRM']
predictions_df = None
# row = partial_df[0:1]
# start = pd.to_datetime(row.subscription_start.values[0], errors='coerce')
# end = pd.to_datetime(row.subscription_end.values[0], errors='coerce')
# circulation_start = start - sixmonths
circulation_events_bipartite_graph = None
circulation_events_bipartite_nodelist = None
circulation_events_books = None
circulation_events_members = None
popular_books = None
number_of_results = 5
# Earliest borrow start date in the data; the circulation window opens here.
start_library = borrow_events.sort_values(
    by=['start_datetime'])[0:1].start_datetime.values[0]
for index, row in partial_df[0:1].iterrows():
    print(f'Processing {row.member_id} with subscription {row.subscription_start}')
    # Everything this member is known to have interacted with (any event type
    # that names an item).
    seed_data = events_df[(events_df.member_id == row.member_id) & (events_df.item_uri.isna() == False)]

    # Borrow events that start or end between the library's first borrow and
    # the end of this member's subscription.
    in_window = borrow_events.start_datetime.between(
        start_library, row.subscription_endtime) | borrow_events.end_datetime.between(start_library, row.subscription_endtime)
    window_events = borrow_events[in_window]

    # Count the member's own borrows BEFORE they are removed from the window;
    # counting afterwards (as the cell previously did) always yields 0.
    member_borrows = len(window_events[(window_events.member_id == row.member_id) & (window_events.item_uri.isna() == False)])

    # Circulation = what everybody else borrowed in the window.
    circulation_events = window_events[window_events.member_id != row.member_id]

    # Most-borrowed titles in the window, as a popularity baseline.
    circulation_counts = circulation_events.groupby(['item_uri']).size().reset_index(
        name='counts').sort_values(['counts'], ascending=False)[0:number_of_results]
    popular_books = circulation_counts.item_uri.tolist()

    # Books that circulated in the window. This name was previously undefined
    # in the notebook (NameError on a fresh kernel).
    # NOTE(review): assumes get_predictions_by_metric expects a list of item
    # URIs here -- confirm against network_analysis.link_prediction.
    circulation_books = circulation_events.item_uri.unique().tolist()

    graph_data = pd.concat([seed_data, circulation_events], axis=0)

    circulation_events_grouped = graph_data.groupby(['member_id', 'item_uri']).size().reset_index(name='counts')

    circulation_events_bipartite_graph, circulation_events_bipartite_nodelist, circulation_events_bipartite_edgelist, circulation_events_members, circulation_events_books = check_reload_build_bipartite_graphs(
        circulation_events_grouped, member_attrs, book_attrs, edge_attrs, should_process, write_to_file, 'test2', sk_metrics, link_metrics, members_df, books_df)

    predictions_df = get_bipartite_link_predictions(circulation_events_bipartite_graph)
    identified_top_predictions = {}

    member_preds = predictions_df[predictions_df.member_id == row.member_id]
    # `metric` was previously undefined as well; run every score column listed
    # in `metrics` (defined in the metrics cell) and keep each result instead
    # of discarding it.
    for metric in metrics:
        identified_top_predictions[metric] = get_predictions_by_metric(
            row, metric, predictions_df, circulation_books, limit_to_circulation=True)


Processing martin-maud with subscription 1923-10-17
building graph: test2
connected? False
bipartite? True
[940, 3]
graph density:  0.023019996850889624
calculating global degree
calculating top global clustering
calculating bottom global clustering
calculating global closeness
calculating global closeness


  0%|          | 0/2 [00:00<?, ?it/s]

calculating local skmetrics: katz louvain HITS CoHITS BiRank BGRM
component 1 - size 940 - graph density  0.023361495135688685
calculating katz
component 2 - size 3 - graph density  1.0
calculating katz
calculating global skmetrics: katz louvain
calculating katz
calculating global link metrics: : HITS CoHITS BiRank BGRM
Running jaccard link prediction
Jaccard prediction starting...
Jaccard Executed in 0.34180498123168945 seconds 

Running preferential attachment link prediction
Preferential_attachment prediction starting...
Preferential attachment Executed in 0.27380871772766113 seconds 

Running common neighbors link prediction
Common neighbor prediction starting...
Common neighbours Executed in 0.2635178565979004 seconds 

Running adamic adar link prediction
Adamic_adar prediction starting...
Adamic-adar Executed in 0.4570448398590088 seconds 

        member_id                     item_uri  jc_prediction  pa_prediction  \
3662  martin-maud                 joyce-exiles       0.257143

In [15]:
# Nodes labelled as component 1 in the circulation graph's node list.
circulation_events_bipartite_nodelist.loc[
    lambda nodes: nodes.component == 1]


Unnamed: 0,uri,group,bipartite,global_degree,local_degree,global_clustering,local_clustering,global_closeness,local_closeness,global_betweenness,local_betweenness,node_title,component,local_katz,local_louvain,local_HITS,local_CoHITS,local_BiRank,local_BGRM,global_katz,global_louvain,global_HITS,global_CoHITS,global_BiRank,global_BGRM
940,meredith-diana-crossways,books,1,0.013699,1.0,1.0,1.0,1.28167,1.0,0.0,0.0,meredith-diana-crossways,1,2.25,0.0,0.5,0.5,0.595137,0.217251,1.5,21.0,1.6e-05,0.003761,0.005072,0.001664
941,wilde-profundis,books,1,0.013699,1.0,1.0,1.0,1.28167,1.0,0.0,0.0,wilde-profundis,1,,,0.5,0.5,0.595137,0.217251,1.5,21.0,1.6e-05,0.003761,0.005072,0.001664
942,ulmann,members,0,0.002299,1.0,0.0,0.0,1.076433,1.0,2e-06,0.0,ulmann,1,2.25,0.0,1.0,1.0,0.865456,0.334663,2.25,21.0,0.000816,0.008476,0.008151,0.003444


In [13]:
# First 20 predictions for martin-maud, highest Jaccard score first.
(
    predictions_df
    .loc[lambda preds: preds.member_id == 'martin-maud']
    .sort_values(by=['jc_prediction'], ascending=False)
    .head(20)
)


Unnamed: 0,member_id,item_uri,jc_prediction,pa_prediction,cn_prediction,aa_prediction
3910,martin-maud,joyce-portrait-artist-young,0.294118,320,10.0,2.991369
3662,martin-maud,joyce-exiles,0.257143,320,9.0,2.3912
3666,martin-maud,lewis-babbitt,0.205882,224,7.0,2.011791
3667,martin-maud,lewis-main-street-story,0.194444,288,7.0,1.853903
4044,martin-maud,strachey-queen-victoria,0.176471,192,6.0,1.867372
3733,martin-maud,asquith-autobiography-margot-asquith,0.176471,192,6.0,1.659852
3871,martin-maud,smith-trivia,0.176471,192,6.0,1.57782
3928,martin-maud,dreiser-twelve-men,0.147059,160,5.0,1.574827
3637,martin-maud,bennett-lillian,0.142857,192,5.0,1.244263
3674,martin-maud,richardson-pointed-roofs,0.142857,192,5.0,1.283904


In [None]:
# All predictions for member 'colens', highest Jaccard score first.
top_books = (
    predictions_df
    .loc[lambda preds: preds.member_id == 'colens']
    .sort_values(by='jc_prediction', ascending=False)
)
top_books

In [None]:
# Rows of top_books that actually have a Jaccard score.
top_books.dropna(subset=['jc_prediction'])

In [None]:
# Ten book nodes with the highest global degree in the circulation graph.
(
    circulation_events_bipartite_nodelist
    .loc[lambda nodes: nodes.group == 'books']
    .sort_values(by='global_degree', ascending=False)
    .head(10)
)


In [None]:
# Item URIs with the most borrow events among circulation_events, limited to
# number_of_results entries (set by the step-by-step cell).
popular_books

In [None]:
# Node-list rows (graph metrics) for every book that appears in top_books.
book_list = top_books['item_uri'].tolist()
is_top_book = circulation_events_bipartite_nodelist['uri'].isin(book_list)
top_books_nodes = circulation_events_bipartite_nodelist.loc[is_top_book]
# top_books_nodes[['global_degree', 'local_degree',
#                  'global_clustering', 'local_clustering', 'global_closeness',
#                  'local_closeness', 'global_betweenness', 'local_betweenness',
#                  'local_katz', 'local_HITS',
#                  'local_CoHITS', 'local_BiRank', 'local_BGRM', 'global_katz',
#                  'global_HITS', 'global_CoHITS', 'global_BiRank',
#                  'global_BGRM']].corr()
top_books_nodes

In [None]:
# not_top_books = circulation_events_bipartite_nodelist[circulation_events_bipartite_nodelist.uri.isin(book_list) == False]
# not_top_books[['global_degree', 'local_degree',
#                'global_clustering', 'local_clustering', 'global_closeness',
#                'local_closeness', 'global_betweenness', 'local_betweenness',
#                'local_katz', 'local_HITS',
#                'local_CoHITS', 'local_BiRank', 'local_BGRM', 'global_katz',
#                'global_HITS', 'global_CoHITS', 'global_BiRank',
#                'global_BGRM']].corr()


In [None]:
# Print the node-list row(s) for each predicted book, in top_books order.
for book_uri in top_books.item_uri:
    # print(events_df[(events_df.member_id == 'hemingway') & (events_df.item_uri == book_uri)])
    matching_nodes = circulation_events_bipartite_nodelist[
        circulation_events_bipartite_nodelist.uri == book_uri]
    print(matching_nodes)


In [None]:
# Titles of the predicted books, in top_books order (KeyError if a URI has no title entry).
list(map(title_lookup.__getitem__, top_books.item_uri.values))

In [None]:
# Print the stored node attributes for martin-maud, if that node is in the graph.
member_of_interest = 'martin-maud'
for node_id, node_data in circulation_events_bipartite_graph.nodes(data=True):
    if node_id != member_of_interest:
        continue
    print(node_id, node_data)


In [None]:
# Every prediction row for martin-maud.
predictions_df.loc[lambda preds: preds.member_id == 'martin-maud']


### Unipartite Link Predictions

In [None]:
# Build (or reload) the projected member and book graphs.
member_attrs = {'uri': 'member_id'}
book_attrs = {'uri': 'item_uri'}
edge_attrs = {'weight': 'counts'}
node_attrs = {}
should_process = True
write_to_file = True
sk_metrics = ['katz', 'louvain']
link_metrics = ['pagerank', 'hubs', 'auth']
is_projected = True

# `all_events` is not defined by any visible earlier cell (its only definition
# is commented out), so this cell raised NameError on a fresh kernel. Rebuild
# it here exactly as that commented-out line did: every event that names an item.
all_events = events_df[events_df.item_uri.notna()].copy()

# One row per (member, book) pair with the number of events as `counts`.
all_events_grouped = all_events.groupby(
    ['member_id', 'item_uri']).size().reset_index(name='counts')


projected_members_graph, projected_members_nodelist, projected_members_edgelist, projected_members, projected_books_graph, projected_books_nodelist, projected_books_edgelist, projected_books = check_reload_build_unipartite_graphs(
    all_events_grouped, all_events, member_attrs, book_attrs, edge_attrs, node_attrs, should_process, write_to_file, './data/all_events_unipartite_projected', sk_metrics, link_metrics, members_df, books_df, is_projected)


In [30]:

def get_link_predictions(graph, nodelist):
    """Score every non-adjacent node pair of ``graph`` with two link predictors.

    Parameters
    ----------
    graph : networkx graph
        Passed unchanged to ``nx.jaccard_coefficient`` and
        ``nx.preferential_attachment``, which score all of its non-edges.
    nodelist : pandas.DataFrame
        Must have a ``uri`` column containing every node that appears in a
        scored pair.

    Returns
    -------
    pandas.DataFrame
        One row per scored pair with the columns ``source``, ``target``,
        ``jaccard_coef_prediction``, ``updated_source``, ``updated_target``
        and ``preferential_attachment_prediction``. ``updated_source`` and
        ``updated_target`` hold the node's URI as found in ``nodelist``.

    Raises
    ------
    IndexError
        If a scored node is absent from ``nodelist.uri``.
    """
    # nodelist, edgelist = generate_dataframes(graph, False, True)
    jaccard_df = pd.DataFrame(
        list(nx.jaccard_coefficient(graph)),
        columns=['source', 'target', 'jaccard_coef_prediction'])
    pref_attach_df = pd.DataFrame(
        list(nx.preferential_attachment(graph)),
        columns=['source', 'target', 'preferential_attachment_prediction'])

    # The previous implementation filtered `nodelist` once per row just to read
    # back the same URI (a full scan per pair, four times over). A single set
    # membership check gives the same values and the same failure mode.
    known_uris = set(nodelist.uri)
    for scores in (jaccard_df, pref_attach_df):
        endpoints = set(scores['source']) | set(scores['target'])
        missing = endpoints - known_uris
        if missing:
            raise IndexError(
                f'nodes missing from nodelist.uri: {sorted(missing, key=str)}')
        scores['updated_source'] = scores['source']
        scores['updated_target'] = scores['target']

    pred_edges = pd.merge(
        jaccard_df, pref_attach_df,
        on=['updated_source', 'updated_target', 'source', 'target'])
    return pred_edges



In [21]:
from networkx.algorithms import bipartite
# Project the bipartite circulation graph onto its member nodes.
member_nodes = [
    node_id
    for node_id, node_data in circulation_events_bipartite_graph.nodes(data=True)
    if node_data['group'] == 'members'
]
members_graph = bipartite.weighted_projected_graph(
    circulation_events_bipartite_graph, member_nodes)


In [22]:
# Node and edge data frames for the projected members graph.
# NOTE(review): generate_dataframes arrives via the star imports; the meaning of
# the positional False/True flags is not visible in this notebook -- confirm.
nodelist ,edgelist = generate_dataframes(members_graph, False, True)

In [31]:
# Score every unconnected member pair in the projected members graph.
pred_edges = get_link_predictions(graph=members_graph, nodelist=nodelist)


  0%|          | 0/572 [00:00<?, ?it/s]

  0%|          | 0/572 [00:00<?, ?it/s]

  0%|          | 0/572 [00:00<?, ?it/s]

  0%|          | 0/572 [00:00<?, ?it/s]

In [32]:
# Rank candidate member pairs by Jaccard score, breaking ties by
# preferential attachment (both descending).
pred_edges.sort_values(
    by=['jaccard_coef_prediction', 'preferential_attachment_prediction'],
    ascending=[False, False],
)


Unnamed: 0,source,target,jaccard_coef_prediction,updated_source,updated_target,preferential_attachment_prediction
547,milhaud,wilder-thornton,0.666667,milhaud,wilder-thornton,6
329,raphael-france,ottocar,0.500000,raphael-france,ottocar,35
168,treirse,ottensooser,0.500000,treirse,ottensooser,2
319,raphael-france,lucas-mrs,0.428571,raphael-france,lucas-mrs,21
37,rolland-madeleine,wyck,0.400000,rolland-madeleine,wyck,48
...,...,...,...,...,...,...
559,martin-simone,summerell,0.000000,martin-simone,summerell,0
563,violette,summerell,0.000000,violette,summerell,0
567,summerell,marsland,0.000000,summerell,marsland,0
568,summerell,ottensooser,0.000000,summerell,ottensooser,0


In [34]:
# Candidate pairs whose target node is jolas-maria.
pred_edges.loc[lambda pairs: pairs.target == 'jolas-maria']

Unnamed: 0,source,target,jaccard_coef_prediction,updated_source,updated_target,preferential_attachment_prediction
5,renoir,jolas-maria,0.0,renoir,jolas-maria,0
35,rolland-madeleine,jolas-maria,0.0,rolland-madeleine,jolas-maria,0
62,linossier-raymonde,jolas-maria,0.0,linossier-raymonde,jolas-maria,0
92,rice-matilda,jolas-maria,0.0,rice-matilda,jolas-maria,0
118,stein-gertrude,jolas-maria,0.0,stein-gertrude,jolas-maria,0
150,treirse,jolas-maria,0.0,treirse,jolas-maria,0
171,varney,jolas-maria,0.0,varney,jolas-maria,0


In [None]:
# Correlation between the two link-prediction scores.
score_columns = ['preferential_attachment_prediction', 'jaccard_coef_prediction']
pred_edges.loc[:, score_columns].corr()


In [None]:
# Node and edge data frames for the projected books graph.
# NOTE(review): this rebinds `nodelist` / `edgelist`, which earlier held the
# members-graph frames passed to get_link_predictions; re-running that call after
# this cell would pair members_graph with the books node list.
nodelist, edgelist = generate_dataframes(projected_books_graph, False, True)
