In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)
import networkx as nx
from networkx.algorithms import bipartite
# import community
from networkx.readwrite import json_graph
# import nx_altair as nxa
from networkx.algorithms.community import greedy_modularity_communities
from pyvis import network as net
# from node2vec import Node2Vec
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import itertools
import collections
from tqdm.notebook import trange, tqdm
tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")
from IPython.display import display, Markdown, HTML
import sys
sys.path.append("..")
from bigraph.predict import pa_predict, jc_predict, cn_predict,aa_predict
from network_analysis.birankpy import BipartiteNetwork
from network_analysis.load_datasets import get_updated_shxco_data
from network_analysis.generate_network_metrics import *
from network_analysis.create_networks import *
from network_analysis.read_write_networks import * 
# Load the members, books, borrow-event and full event dataframes through the
# project loader (called here with get_subscription=False).
members_df, books_df, borrow_events, events_df = get_updated_shxco_data(get_subscription=False)


In [2]:
from dateutil.relativedelta import relativedelta

# Six-month calendar offset; further down it is subtracted from each
# subscription start to widen the window of "circulating" books.
sixmonths = relativedelta(months=6)
sixmonths


relativedelta(months=+6)

In [3]:
# Keep only the events that reference an item (item_uri present). The explicit
# copy makes all_events independent of events_df.
all_events = events_df[events_df.item_uri.notna()].copy()

# Members whose borrowing history is only partially known.
partial_df = pd.read_csv('../dataset_generator/data/partial_borrowers.csv')
partial_df.head(1)


Unnamed: 0,member_id,subscription_start,subscription_end,known_borrows
0,martin-maud,1923-10-17,1923-11-17,36


In [4]:
partial_members = partial_df.member_id.unique().tolist()

# Parse subscription dates so we can use them to identify circulating books.
# Unparseable values become NaT (errors='coerce').
for raw_column, parsed_column in (('subscription_start', 'subscription_starttime'),
                                  ('subscription_end', 'subscription_endtime')):
    partial_df[parsed_column] = pd.to_datetime(
        partial_df[raw_column], errors='coerce')

# Event boundaries are converted in place on events_df.
for date_column in ('start_datetime', 'end_datetime'):
    events_df[date_column] = pd.to_datetime(
        events_df[date_column], errors='coerce')


### Bipartite Link Predictions

In [5]:
# Column mappings handed to the graph builder.
# presumably: which dataframe column identifies member nodes, book nodes and
# the edge weight — confirm against network_analysis.create_networks.
member_attrs = {'uri': 'member_id'}
book_attrs = {'uri': 'item_uri'}
edge_attrs = {'weight': 'counts'}
# One row per (member_id, item_uri) pair with the number of events in `counts`.
all_events_grouped = all_events.groupby(
    ['member_id', 'item_uri']).size().reset_index(name='counts')

# Flags and metric lists are passed straight through to the project helper.
# NOTE(review): the recorded output says "reloading saved graph" although
# should_process is True — confirm how the helper interprets these flags.
should_process = True
write_to_file = True
sk_metrics = ['katz', 'louvain']
link_metrics = ['HITS', 'CoHITS', 'BiRank', 'BGRM']


# Returns the graph, its node list and edge list, and the member/book frames.
all_events_bipartite_graph, all_events_bipartite_nodelist, all_events_bipartite_edgelist, all_events_members, all_events_books = check_reload_build_bipartite_graphs(
    all_events_grouped, member_attrs, book_attrs, edge_attrs, should_process, write_to_file, './data/full_events_bipartite', sk_metrics, link_metrics, members_df, books_df)


reloading saved graph: ./data/full_events_bipartite


In [6]:
def get_bipartite_link_predictions(graph):
    """Score candidate member-book links with four bipartite predictors.

    The Jaccard, preferential attachment, common neighbours and Adamic-Adar
    predictors are run in that order. Each one is expected to return a mapping
    keyed by a (member, book) pair whose value is the score.

    Parameters
    ----------
    graph
        Bipartite graph accepted by the imported ``*_predict`` functions.

    Returns
    -------
    pandas.DataFrame
        Columns ``member_id``, ``item_uri``, ``jc_prediction``,
        ``pa_prediction``, ``cn_prediction`` and ``aa_prediction``. The four
        score tables are outer-merged, so a pair missing from one predictor
        has NaN in that column.
    """
    key_columns = ['member_id', 'item_uri']
    # (label used in the progress message, predictor, output score column)
    predictors = (
        ('jaccard', jc_predict, 'jc_prediction'),
        ('preferential attachment', pa_predict, 'pa_prediction'),
        ('common neighbors', cn_predict, 'cn_prediction'),
        ('adamic adar', aa_predict, 'aa_prediction'),
    )

    score_frames = []
    for label, predictor, score_column in predictors:
        print(f'Running {label} link prediction')
        scores = predictor(graph)
        # The pair keys become the index; reset_index spreads them into two columns.
        frame = pd.DataFrame(
            data=list(scores.values()), index=scores.keys()).reset_index()
        frame.columns = key_columns + [score_column]
        score_frames.append(frame)

    all_preds = score_frames[0]
    for frame in score_frames[1:]:
        all_preds = pd.merge(all_preds, frame, on=key_columns, how='outer')
    return all_preds


In [7]:
# Slow cell: the recorded run took roughly four minutes in total across the
# four predictors (see the timings printed below).
all_preds = get_bipartite_link_predictions(all_events_bipartite_graph)

Running jaccard link prediction
Jaccard Executed in 53.186678886413574 seconds 

Running preferential attachment link prediction
Preferential attachment Executed in 38.66534900665283 seconds 

Running common neighbors link prediction
Common neighbours Executed in 32.77828574180603 seconds 

Running adamic adar link prediction
Adamic_adar prediction starting...
Adamic-adar Executed in 125.47807121276855 seconds 



In [8]:
# Pairwise correlation between the four prediction scores (DataFrame.corr
# defaults to Pearson).
all_preds[['jc_prediction', 'pa_prediction',
           'cn_prediction', 'aa_prediction']].corr()


Unnamed: 0,jc_prediction,pa_prediction,cn_prediction,aa_prediction
jc_prediction,1.0,-0.018728,0.35268,0.312518
pa_prediction,-0.018728,1.0,0.531993,0.559736
cn_prediction,0.35268,0.531993,1.0,0.981303
aa_prediction,0.312518,0.559736,0.981303,1.0


In [9]:
# Book uri -> title lookup; for a duplicated uri the last title wins.
title_lookup = dict(zip(books_df.uri, books_df.title))
# Score columns produced by get_bipartite_link_predictions.
metrics = [
    'jc_prediction',
    'pa_prediction',
    'cn_prediction',
    'aa_prediction',
]


In [16]:

    
dfs = []


def generate_top_predictions_for_table(row, metrics, number_of_results, predictions_df, limit_to_circulation=True):
    """Build a table of the top predicted books for one partial borrower.

    Relies on two notebook globals: ``events_df`` (with parsed
    ``start_datetime`` / ``end_datetime``) and ``sixmonths``.

    Parameters
    ----------
    row
        Record exposing ``member_id``, ``subscription_starttime`` and
        ``subscription_endtime``.
    metrics
        Names of the score columns in ``predictions_df`` to rank by.
    number_of_results
        Maximum number of books kept per column.
    predictions_df
        Frame with ``member_id``, ``item_uri`` and one column per metric.
    limit_to_circulation
        When true, only books with an event inside the circulation window
        (six months before the subscription start up to its end) are eligible.

    Returns
    -------
    pandas.DataFrame
        One ``popular_current`` column plus one column per metric, each
        holding book uris, followed by the member id and subscription dates.
        Columns with fewer than ``number_of_results`` entries are padded with
        NaN instead of raising on unequal lengths.
    """
    circulation_start = row.subscription_starttime - sixmonths
    window_end = row.subscription_endtime

    # An event counts as "in circulation" when either end of it falls in the window.
    in_window = (
        events_df.start_datetime.between(circulation_start, window_end)
        | events_df.end_datetime.between(circulation_start, window_end)
    )
    circulation_events = events_df[in_window]

    popular_current = (
        circulation_events.groupby(['item_uri']).size()
        .reset_index(name='Count')
        .sort_values(['Count'], ascending=False)[0:number_of_results]
        .item_uri.tolist()
    )
    # get_predictions_by_metric needs the eligible books as its fourth argument.
    circulation_books = circulation_events.item_uri.unique().tolist()

    identified_top_predictions = {'popular_current': popular_current}
    for m in metrics:
        subset_predictions = get_predictions_by_metric(
            row, m, predictions_df, circulation_books, limit_to_circulation)
        identified_top_predictions[m] = subset_predictions[0:number_of_results].item_uri.tolist()

    # Wrapping each list in a Series aligns on position and pads short columns
    # with NaN; plain lists of unequal length would raise a ValueError.
    df_final = pd.DataFrame(
        {column: pd.Series(values) for column, values in identified_top_predictions.items()})
    df_final['member_id'] = row.member_id
    df_final['subscription_starttime'] = row.subscription_starttime
    df_final['subscription_endtime'] = row.subscription_endtime
    return df_final
    




In [21]:
def get_predictions_by_metric(row, metric, predictions_df, circulation_books, limit_to_circulation=True):
    """Return one member's predictions ranked by a single metric.

    Parameters
    ----------
    row
        Record exposing ``member_id``.
    metric
        Name of the score column to sort by (highest first).
    predictions_df
        Frame with ``member_id``, ``item_uri`` and the metric column.
    circulation_books
        Book uris considered eligible; only consulted when
        ``limit_to_circulation`` is true.
    limit_to_circulation
        Restrict the candidates to ``circulation_books``.

    Returns
    -------
    pandas.DataFrame
        Columns ``member_id``, ``item_uri`` and the metric, sorted descending.
    """
    score_column = f'{metric}'
    keep = predictions_df.member_id == row.member_id
    if limit_to_circulation:
        keep = keep & predictions_df.item_uri.isin(circulation_books)

    ranked = predictions_df[keep].sort_values(by=score_column, ascending=False)
    return ranked[['member_id', 'item_uri', score_column]]


In [27]:
# No explicit `import os` is visible in the notebook's import cell (it may only
# arrive through the star imports), so import it where it is needed.
import os

number_of_results = 10
limit_to_circulation = True
predictions_df = all_preds.copy()
output_path = './data/partial_bipartite_link_predictions.csv'
dfs = []

# Start from a clean file. Removing it only when it exists avoids a
# FileNotFoundError on the very first run, and no longer depends on the first
# row of partial_df carrying index 0.
if os.path.exists(output_path):
    os.remove(output_path)

for index, row in tqdm(partial_df.iterrows(), total=partial_df.shape[0]):
    # Circulation window: six months before the subscription start to its end.
    circulation_start = row.subscription_starttime - sixmonths
    in_window = (
        events_df.start_datetime.between(circulation_start, row.subscription_endtime)
        | events_df.end_datetime.between(circulation_start, row.subscription_endtime)
    )
    circulation_events = events_df[in_window]

    # Most frequently circulating books in the window (baseline column).
    popular_current = (
        circulation_events.groupby(['item_uri']).size()
        .reset_index(name='counts')
        .sort_values(['counts'], ascending=False)[0:number_of_results]
    )
    circulation_books = circulation_events.item_uri.unique().tolist()

    identified_top_predictions = {
        'popular_current_books': popular_current.item_uri.tolist(),
        'popular_current_counts': popular_current.counts.tolist(),
    }
    for m in metrics:
        top_predictions = get_predictions_by_metric(
            row, m, predictions_df, circulation_books, limit_to_circulation)[0:number_of_results]
        identified_top_predictions[m] = top_predictions.item_uri.tolist()
        identified_top_predictions[f'{m}_scores'] = top_predictions[m].tolist()

    # Series-wrapped columns are padded with NaN when a member has fewer than
    # number_of_results candidates; raw lists of unequal length would raise.
    df_final = pd.DataFrame(
        {column: pd.Series(values) for column, values in identified_top_predictions.items()})

    df_final['member_id'] = row.member_id
    df_final['subscription_starttime'] = row.subscription_starttime
    df_final['subscription_endtime'] = row.subscription_endtime
    df_final['known_borrows'] = row.known_borrows

    # Append every member to the same CSV; the header is written only by the
    # write that creates the file.
    df_final.to_csv(output_path, mode='a',
                    header=not os.path.exists(output_path), index=False)


  0%|          | 0/219 [00:00<?, ?it/s]

In [28]:
# Rebuild a bipartite graph object from the edge list rows and fit a Jaccard
# similarity model on its biadjacency matrix.
# NOTE(review): convert_edge_list and JaccardIndex are not imported by name in
# this notebook — presumably they arrive through the network_analysis star
# imports; confirm and import them explicitly.
tuples = [tuple(x) for x in all_events_bipartite_edgelist.values]
graph = convert_edge_list(tuples, bipartite=True)
biadjacency = graph.biadjacency
# `names` is indexed below to translate positions back into node names.
names = graph.names
ji = JaccardIndex()
ji.fit(biadjacency)



JaccardIndex()

In [31]:
# For every partial borrower, print the ten names with the highest Jaccard
# score. The recorded output shows the member itself in first place.
for member in partial_members:
    member_positions = np.where(names == member)[0]
    ji_scores = ji.predict(member_positions[0])

    # Negating the scores makes argsort rank from highest to lowest.
    ranking = np.argsort(-ji_scores)
    print('most similar members based on book history', names[ranking][0:10])


most similar members based on book history ['martin-maud' 'rolland-madeleine' 'martin-simone' 'wilson-romer'
 'rice-matilda' 'valerio' 'pottecher-therese' 'foret' 'somerville' 'savy']
most similar members based on book history ['reynolds-a-m' 'bernheim' 'ottocar' 'ris' 'james-t-m' 'lacroix-e'
 'potocki-de-montalk' 'antoine-may' 'sperry' 'monnier-j']
most similar members based on book history ['linossier-raymonde' 'sarraute' 'antoine-may' 'pfeffel' 'ottensooser'
 'oerthel' 'joyce-lucia' 'suter' 'reverchon' 'violette']
most similar members based on book history ['mcgrew-marie-carroll' 'edwards-thomas' 'ybarra-penny' 'bremond'
 'camerlynck-guernier' 'faulkner-norma' 'killen' 'raphael-france'
 'antoine-may' 'mespoulet']
most similar members based on book history ['richard-p' 'lucas-b' 'dyer-louise' 'venable' 'walker-natalie' 'imbs'
 'suter' 'lambert-jacqueline' 'gerbault-paul' 'edwards-thomas']
most similar members based on book history ['mcalmon-robert' 'samyn' 'lanux-pierre-de' 'metcalf-

### Unipartite Link Predictions

In [33]:
# Same column mappings as for the bipartite graph, plus an empty node_attrs.
# presumably these name the node-id and edge-weight columns — confirm against
# network_analysis.create_networks.
member_attrs = {'uri': 'member_id'}
book_attrs = {'uri': 'item_uri'}
edge_attrs = {'weight': 'counts'}
node_attrs = {}
# NOTE(review): the recorded output says "reloading saved graph" although
# should_process is True — confirm how the helper interprets these flags.
should_process = True
write_to_file = True
sk_metrics = ['katz', 'louvain']
link_metrics = ['pagerank', 'hubs', 'auth']
is_projected = True
# One row per (member_id, item_uri) pair with the number of events in `counts`.
all_events_grouped = all_events.groupby(
    ['member_id', 'item_uri']).size().reset_index(name='counts')


# Returns graph, node list, edge list and source frame for the members
# projection, then the same four objects for the books projection.
projected_members_graph, projected_members_nodelist, projected_members_edgelist, projected_members, projected_books_graph, projected_books_nodelist, projected_books_edgelist, projected_books = check_reload_build_unipartite_graphs(
    all_events_grouped, all_events, member_attrs, book_attrs, edge_attrs, node_attrs, should_process, write_to_file, './data/all_events_unipartite_projected', sk_metrics, link_metrics, members_df, books_df, is_projected)


reloading saved graph: ./data/all_events_unipartite_projected


In [36]:

def get_link_predictions(graph, nodelist):
    """Compute Jaccard and preferential-attachment scores for a graph.

    Both predictors are evaluated on ``graph`` itself, so the function works
    for any projected graph passed in rather than always scoring the members
    projection.

    Parameters
    ----------
    graph
        Graph accepted by ``nx.jaccard_coefficient`` and
        ``nx.preferential_attachment``.
    nodelist
        DataFrame with a ``label`` column (node ids used in ``graph``) and a
        ``uri`` column (readable identifier for the same node).

    Returns
    -------
    pandas.DataFrame
        Columns ``source``, ``target``, ``jaccard_coef_prediction``,
        ``updated_source``, ``updated_target`` and
        ``preferential_attachment_prediction``; one row per node pair scored
        by both predictors.

    Raises
    ------
    KeyError
        If a node label returned by a predictor is missing from ``nodelist``.
    """
    # Build the label -> uri lookup once instead of scanning nodelist for every
    # row. For duplicated labels the first occurrence wins, as with a
    # `.values[0]` lookup.
    unique_nodes = nodelist.drop_duplicates(subset='label')
    label_to_uri = dict(zip(unique_nodes.label, unique_nodes.uri))

    def scores_with_uris(scored_pairs, score_column):
        # Materialise the predictor output and attach the uri of each endpoint.
        scores_df = pd.DataFrame(
            list(scored_pairs), columns=['source', 'target', score_column])
        for endpoint in ('source', 'target'):
            scores_df[f'updated_{endpoint}'] = [
                label_to_uri[label] for label in scores_df[endpoint]]
        return scores_df

    jaccard_df = scores_with_uris(
        nx.jaccard_coefficient(graph), 'jaccard_coef_prediction')
    pref_attach_df = scores_with_uris(
        nx.preferential_attachment(graph), 'preferential_attachment_prediction')

    pred_edges = pd.merge(
        jaccard_df, pref_attach_df,
        on=['updated_source', 'updated_target', 'source', 'target'])
    return pred_edges



In [38]:
# Node and edge dataframes for the members projection; `nodelist` supplies the
# label -> uri lookup used by get_link_predictions.
nodelist ,edgelist = generate_dataframes(projected_members_graph, False, True)

In [39]:
# Score every candidate member-member pair; the recorded progress bars show
# 169282 pairs per pass.
pred_edges = get_link_predictions(
    projected_members_graph, nodelist)


  0%|          | 0/169282 [00:00<?, ?it/s]

  0%|          | 0/169282 [00:00<?, ?it/s]

  0%|          | 0/169282 [00:00<?, ?it/s]

  0%|          | 0/169282 [00:00<?, ?it/s]

In [40]:
# Highest Jaccard coefficient first; preferential attachment breaks ties.
pred_edges.sort_values(by=['jaccard_coef_prediction',
                       'preferential_attachment_prediction'], ascending=False)


Unnamed: 0,source,target,jaccard_coef_prediction,updated_source,updated_target,preferential_attachment_prediction
50785,n611,n228,0.666667,wright-julia,jordan-howard,6
73614,n31,n186,0.500000,bourassin-2,giedion-welcker,9
59210,n197,n521,0.500000,goyert,tabouis,2
83805,n448,n232,0.500000,roger-roubin,joyce-stanislaus,2
98379,n600,n42,0.500000,wilson-natalie,bruno-jean,2
...,...,...,...,...,...,...
169162,n134,n552,0.000000,eastman,varney,0
169163,n134,n469,0.000000,eastman,saur,0
169164,n134,n524,0.000000,eastman,teissier-jeanine-delpech,0
169165,n134,n351,0.000000,eastman,oldenburger,0


In [41]:
# Correlation between the two unipartite scores (DataFrame.corr defaults to Pearson).
pred_edges[['preferential_attachment_prediction', 'jaccard_coef_prediction']].corr()


Unnamed: 0,preferential_attachment_prediction,jaccard_coef_prediction
preferential_attachment_prediction,1.0,0.762473
jaccard_coef_prediction,0.762473,1.0


In [None]:
# NOTE(review): this rebinds `nodelist` / `edgelist`, which held the members
# projection until now; re-running earlier member cells after this one would
# silently use the books node list.
nodelist, edgelist = generate_dataframes(projected_books_graph, False, True)


In [None]:
# get_link_predictions returns a single DataFrame; unpacking it into two names
# would iterate over its column labels and fail.
pred_edges_books = get_link_predictions(projected_books_graph, nodelist)
# `nodelist` was rebuilt from projected_books_graph in the previous cell, so it
# is the books node list.
books_nodelist = nodelist


In [None]:
# Connected components of the members projection, largest first.
components = sorted(
    nx.connected_components(projected_members_graph), key=len, reverse=True)

# Score each component separately and keep one prediction frame per component.
# (presumably because the index needs a connected graph — confirm)
preds_dfs = []
for component_nodes in components:
    component_graph = projected_members_graph.subgraph(component_nodes)
    scored_pairs = list(nx.common_neighbor_centrality(component_graph))
    preds_dfs.append(pd.DataFrame(
        scored_pairs,
        columns=['source', 'target', 'common_neighbor_centrality_prediction']))


In [None]:
# Resource allocation index for the candidate pairs of the members projection,
# materialised as (source, target, score) triples.
preds = list(nx.resource_allocation_index(projected_members_graph))


pred_df = pd.DataFrame(preds, columns=['source', 'target', 'resource_allocation_prediction'])


In [None]:
def update_edge_labels(rows):
    """Translate a node label into that node's ``uri``.

    Looks the label up in the notebook-level ``nodes_df`` frame (columns
    ``label`` and ``uri``) and returns the uri of the first match. Returning
    the ``label`` column here would hand the input straight back.

    NOTE(review): ``nodes_df`` is not defined in any visible cell — confirm
    where it comes from.

    Raises
    ------
    IndexError
        If no row of ``nodes_df`` carries the given label.
    """
    matching_uris = nodes_df.loc[nodes_df.label == rows, 'uri']
    return matching_uris.values[0]
# Add an `updated_<endpoint>` column for each end of every predicted edge.
for endpoint in ('target', 'source'):
    pred_df[f'updated_{endpoint}'] = pred_df[endpoint].apply(update_edge_labels)


In [None]:
members = projected_members.copy()
partial_members = partial_df.member_id.unique().tolist()

# Boolean flags assigned in one step each. The previous four masked writes
# with `== True` / `== False` produced object-dtype columns.
members['is_exceptional'] = members.exceptional_types.notna()
members['is_partial'] = members.member_id.isin(partial_members)


In [None]:
# Members and books that appear in events starting before 1925; the filter is
# evaluated once and reused for both lists.
# NOTE(review): `members` held a DataFrame in the previous cell and is a list
# of ids from here on.
events_before_1925 = all_events[all_events.start_datetime < '1925-01-01']
members = events_before_1925.member_id.unique().tolist()
books = events_before_1925.item_uri.unique().tolist()


In [None]:
# Every possible (member, book) pair — len(members) * len(books) tuples are
# materialised in memory.
combos = list(itertools.product(members, books))
len(combos)

In [None]:
# Candidate edges as a frame: member in `source`, book in `target`.
combo_df = pd.DataFrame(data=combos, columns=['source', 'target'])


In [None]:
# Outer-merge the candidate pairs with the observed edges; pairs that were
# never observed get weight 0.
# NOTE(review): `edges_df` is not defined in any visible cell — confirm its origin.
edgelist = pd.merge(combo_df, edges_df, on=['source', 'target'], how='outer')
# Assign the result back: `edgelist.weight.fillna(0, inplace=True)` mutates a
# column accessor and does not update the frame under copy-on-write.
edgelist['weight'] = edgelist['weight'].fillna(0)

In [None]:
# Attach member attributes to the graph nodes by joining member_id on the
# node uri (inner join, so nodes without a matching member are dropped).
# NOTE(review): `nodes_df` is not defined in any visible cell — confirm its origin.
nodelist = pd.merge(members_df[['gender', 'is_organization', 'member_id', 'borrow_count', 'subscription_count', 'exceptional_types',
            'exceptional_counts']], nodes_df, left_on='member_id', right_on='uri')


In [None]:
import random
# Fixed seed so the sampled training set is the same on every run.
# NOTE(review): only rows with weight > 0 are sampled, so the `edge_exists`
# label derived from weight further down is 1 for every row — the model never
# sees a negative example. Confirm whether zero-weight pairs should be
# sampled as well.
training = edgelist[edgelist.weight > 0 ].sample(frac=0.5, random_state=42)


In [None]:
# Disabled features, kept for reference:
# Degree Centrality features
# out_degree_centrality = nx.out_degree_centrality(members_graph)
# in_degree_centrality = nx.in_degree_centrality(members_graph)
# training['source_out_centrality'] = training.apply(lambda row: degree_centrality[row.source],axis=1)
# training['target_in_centrality'] = training.apply(lambda row: in_degree_centrality[row.target],axis=1)
# Preferential Attachment
# For a directed graph, is equal to K_out_source * K_in_target with K the number of neighbors. Which is equivalent to multiply the available centralities.
# training['preferencial_attachment'] = training.apply(lambda row: row.source_out_centrality * row.target_in_centrality,axis=1)

# NOTE(review): `members_graph` is not defined in any visible cell — confirm its origin.

# Page rank. nx.pagerank_scipy was removed in NetworkX 3.0; nx.pagerank is the
# supported entry point.
page_rank = nx.pagerank(members_graph)
# Column-wise map instead of a row-wise apply; a missing node still raises KeyError.
training['target_pagerank'] = training['target'].map(lambda node: page_rank[node])

# HITS algorithm: hub score of the source, authority score of the target.
hub_score, authority_score = nx.hits(members_graph)
training['source_hub_score'] = training['source'].map(lambda node: hub_score[node])
training['target_authority_score'] = training['target'].map(lambda node: authority_score[node])


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlate numeric columns only: `training` also carries the string columns
# `source` and `target`, and DataFrame.corr raises on non-numeric data in
# pandas >= 2.0.
feature_correlations = training.select_dtypes(include='number').corr()

plt.figure(figsize=(14, 12))
sns.heatmap(feature_correlations,
            vmax=0.5,
            square=True,
            annot=True)


In [None]:
# Binary label: 1 where the pair has a positive weight, 0 where the weight is
# exactly zero. Rows matching neither mask keep NaN.
has_edge = training.weight > 0
has_no_edge = training.weight == 0
training.loc[has_edge, 'edge_exists'] = 1
training.loc[has_no_edge, 'edge_exists'] = 0

In [None]:
from sklearn.model_selection import train_test_split

# `edge_exists` is derived directly from `weight`, so `weight` must not stay
# among the features — it would leak the label. The identifier columns are
# dropped as before.
feature_frame = training.drop(
    ['source', 'target', 'edge_exists', 'weight'], axis=1)

# Fixed seed so the split (and every score below) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, training.edge_exists, test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded so repeated runs give the same forest; the regressor further down
# already uses random_state=42.
RF_classifer = RandomForestClassifier(n_estimators=1000, random_state=42)
RF_classifer.fit(X_train, y_train)


In [None]:
# rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#                        max_depth=None, max_features='auto', max_leaf_nodes=None,
#                        min_impurity_decrease=0.0,
#                        min_samples_leaf=1, min_samples_split=2,
#                        min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
#                        oob_score=False, random_state=None, verbose=0,
#                        warm_start=False)


In [None]:
# Mean accuracy of the classifier on the held-out split.
RF_classifer.score(X_test, y_test)


In [None]:
from sklearn.ensemble import RandomForestRegressor
# Regression forest fitted on the same features and the same `edge_exists`
# labels as the classifier. oob_score=True makes `rf.oob_score_` available
# after fitting (it needs bootstrap=True).
rf = RandomForestRegressor(n_estimators=100,
                           n_jobs=-1,
                           oob_score=True,
                           bootstrap=True,
                           random_state=42)
rf.fit(X_train, y_train)


In [None]:
# Training, out-of-bag and validation scores of the regressor, two decimals each.
score_report = 'R^2 Training Score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'
training_r2 = rf.score(X_train, y_train)
oob_r2 = rf.oob_score_
validation_r2 = rf.score(X_test, y_test)
print(score_report.format(training_r2, oob_r2, validation_r2))
