In [2]:
import sys
import warnings

import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
# `tqdm` was used below without ever being imported, which raises NameError on
# a fresh kernel.
from tqdm import tqdm

# Notebook-wide configuration.
pd.options.mode.chained_assignment = None  # later cells assign columns on filtered frames
pd.set_option('display.max_columns', None)
tqdm.pandas()  # enables the pandas `progress_apply` integration
warnings.filterwarnings("ignore")

# Put the parent directory on the import path; the `network_analysis` imports
# below are presumably resolved from there.
sys.path.append("..")

from network_analysis.load_datasets import get_updated_shxco_data
from network_analysis.create_networks import *
from network_analysis.read_write_networks import *
from network_analysis.link_prediction import *

# Load the member, book, borrow-event and event tables used by every later cell.
members_df, books_df, borrow_events, events_df = get_updated_shxco_data(get_subscription=False)


In [3]:
# Load the collapsed partial-borrower table and keep each row's index label in
# an explicit `index_col` column.
partial_df = pd.read_csv('../dataset_generator/data/partial_borrowers_collapsed.csv')
partial_df = partial_df.assign(index_col=partial_df.index)
partial_df.head(1)



Unnamed: 0,member_id,subscription_start,subscription_end,subscription_events,subscription_volumes,subscription_days,internal_gaps,known_borrows,index_col
0,martin-maud,1923-10-17,1923-12-17,Subscription;Renewal,2.0,61,0,36,0


In [4]:
partial_members = [
    'raphael-france',
    'hemingway',
    'colens',
    'kittredge-eleanor-hayden',
]
# Restrict to the selected members and parse their subscription dates so we can
# use them to identify circulating books. Unparseable dates become NaT.
partial_df = partial_df[partial_df.member_id.isin(partial_members)].assign(
    subscription_starttime=lambda frame: pd.to_datetime(
        frame['subscription_start'], errors='coerce'),
    subscription_endtime=lambda frame: pd.to_datetime(
        frame['subscription_end'], errors='coerce'),
)

# all_events = events_df[events_df.item_uri.isna() == False].copy()

# Keep only borrow events where both start and end are known, then take the
# ones that start before 1942.
borrow_events = borrow_events[
    borrow_events.start_datetime.notna() & borrow_events.end_datetime.notna()
]
all_borrows = borrow_events[borrow_events.start_datetime < '1942-01-01'].copy()


### Bipartite Link Predictions for All Event Types

In [5]:
# Attribute-name mappings handed to the graph builder
# (presumably graph attribute -> source column; confirm against create_networks).
member_attrs = dict(uri='member_id')
book_attrs = dict(uri='item_uri')
edge_attrs = dict(weight='counts')

# One row per (member, book) pair with the number of borrow events in `counts`.
all_borrows_grouped = (
    all_borrows
    .groupby(['member_id', 'item_uri'])
    .size()
    .reset_index(name='counts')
)

should_process = True
write_to_file = True
sk_metrics = ['katz', 'louvain']
link_metrics = ['HITS', 'CoHITS', 'BiRank', 'BGRM']

# Arguments stay positional because the helper's signature is defined outside
# this notebook.
(
    all_borrows_bipartite_graph,
    all_borrows_bipartite_nodelist,
    all_borrows_bipartite_edgelist,
    all_borrows_members,
    all_borrows_books,
) = check_reload_build_bipartite_graphs(
    all_borrows_grouped,
    member_attrs,
    book_attrs,
    edge_attrs,
    should_process,
    write_to_file,
    './data/borrow_events_bipartite',
    sk_metrics,
    link_metrics,
    members_df,
    books_df,
)


reloading saved graph: ./data/borrow_events_bipartite


In [6]:
# Remove disconnected components
remove = all_borrows_bipartite_nodelist[all_borrows_bipartite_nodelist.component != 0 ].uri.tolist()
all_borrows_bipartite_graph.remove_nodes_from(remove)
# all_borrows_bipartite_nodelist[all_borrows_bipartite_nodelist.component != 0 ].to_csv('./public_data/disconnected_nodes.csv')

In [34]:
## Run to get full bipartite link predictions from entire graph
# The recorded run below took roughly three minutes in total across the four
# predictors.
all_preds = get_bipartite_link_predictions(
    all_borrows_bipartite_graph,
    './data/all_preds.csv',
)


Running jaccard link prediction
Jaccard prediction starting...
Jaccard Executed in 36.33426904678345 seconds 

Running preferential attachment link prediction
Preferential_attachment prediction starting...
Preferential attachment Executed in 21.603660821914673 seconds 

Running common neighbors link prediction
Common neighbor prediction starting...
Common neighbours Executed in 22.481963872909546 seconds 

Running adamic adar link prediction
Adamic_adar prediction starting...
Adamic-adar Executed in 101.55140519142151 seconds 



In [8]:
# Score columns produced by the four link predictors; reused by later cells.
metrics = [
    'jc_prediction',
    'pa_prediction',
    'cn_prediction',
    'aa_prediction',
]
# Pairwise correlation between the four prediction scores.
all_preds.loc[:, metrics].corr()

Unnamed: 0,jc_prediction,pa_prediction,cn_prediction,aa_prediction
jc_prediction,1.0,-0.022912,0.354038,0.313452
pa_prediction,-0.022912,1.0,0.52468,0.550824
cn_prediction,0.354038,0.52468,1.0,0.980399
aa_prediction,0.313452,0.550824,0.980399,1.0


In [36]:
# NOTE(review): this repeats the prediction call on the same graph with a
# different path ('all_preds.csv' rather than './data/all_preds.csv'), and
# `all_preds1` is not used in the visible cells. Confirm whether this cell is
# still needed.
all_preds1 = get_bipartite_link_predictions(
    all_borrows_bipartite_graph,
    'all_preds.csv',
)

In [9]:
# Read the saved per-member prediction table instead of regenerating it.
output_path = './data/partial_members_bipartite_network_all_events_link_predictions.csv'
processed_predictions = pd.read_csv(filepath_or_buffer=output_path)
# Regeneration recipe, kept for reference.
# NOTE(review): `os` and `sixmonths` are not defined in the visible cells;
# define them before uncommenting.
# if os.path.exists(output_path):
#     os.remove(output_path)
# partial_df.progress_apply(get_full_predictions, axis=1, number_of_results=200, limit_to_circulation=True, predictions_df=all_preds, borrow_events=borrow_events, relative_date=sixmonths, predict_group='books', metrics=metrics, output_path=output_path)


In [10]:
# Later cells refer to the predicted book column as `item_id`.
processed_predictions = processed_predictions.rename(
    columns={'predicted_values': 'item_id'}
)

In [11]:
# Columns that identify one (member subscription period, candidate book) row.
index_cols = [
    'member_id',
    'item_id',
    'subscription_start',
    'subscription_end',
    'subscription_events',
    'subscription_volumes',
    'subscription_days',
    'internal_gaps',
    'known_borrows',
    'index_col',
    'subscription_starttime',
    'subscription_endtime',
]
# Long -> wide: one column per value of `metric`, holding that metric's score.
pivoted_predictions = processed_predictions.pivot(
    index=index_cols,
    columns='metric',
    values='score',
).reset_index()

In [12]:
from sklearn.preprocessing import MinMaxScaler

def scale_col(df, cols):
    """Add a min-max scaled copy of each listed column to ``df``.

    For every name in ``cols`` a fresh ``MinMaxScaler`` is fitted on that
    column alone and the result is stored as ``<name>_scaled``.

    Parameters
    ----------
    df : DataFrame containing every column named in ``cols``; modified in place.
    cols : iterable of column names to scale.

    Returns
    -------
    The same ``df`` object, with the ``*_scaled`` columns added.
    """
    for name in cols:
        # The scaler expects a 2-D (n_rows, 1) array rather than a flat vector.
        column_vector = df[name].values.reshape(-1, 1)
        scaler = MinMaxScaler()
        df[name + '_scaled'] = scaler.fit_transform(column_vector)
    return df

In [13]:
# Add a `<metric>_scaled` column for each prediction metric.
pivoted_predictions = scale_col(df=pivoted_predictions, cols=metrics)

In [14]:
# Wide -> long again, this time keeping only the scaled scores: one row per
# (index_cols, scaled metric) with the value in `score`.
preds_df = pivoted_predictions.melt(
    id_vars=index_cols,
    value_vars=[f'{metric_name}_scaled' for metric_name in metrics],
    var_name='metric',
    value_name='score',
)

In [15]:
import altair as alt
# Turn off Altair's cap on the number of rows a chart may embed
# (presumably because preds_df exceeds the default limit).
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [18]:
# Label for one member's subscription period, "<member_id>: <start>-<end>",
# used as the colour and legend field in the chart.
preds_df['member_period'] = (
    preds_df['member_id'].astype(str)
    + ': '
    + preds_df['subscription_start'].astype(str)
    + '-'
    + preds_df['subscription_end'].astype(str)
)

In [32]:
# Clicking a legend entry highlights that member period and dims the others.
# NOTE(review): `selection_multi` / `add_selection` are the Altair 4 names;
# confirm the installed version before upgrading.
selection = alt.selection_multi(fields=['member_period'], bind='legend')
tickplot = (
    alt.Chart(preds_df)
    .mark_tick(opacity=0.7)
    .encode(
        y=alt.Y('item_id', sort='-x'),
        x='score',
        color='member_period:N',
        opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
        # column='member_id:N'
    )
    .add_selection(selection)
)
tickplot

In [21]:
# Treat a missing score as 0 so the per-group statistics below see no NaNs.
preds_df['score'] = preds_df['score'].fillna(0)

In [22]:
import numpy as np
# Keys identifying one candidate book within one member's subscription period;
# each group holds that book's scores across the scaled metrics.
group_keys = ['member_id', 'subscription_start', 'subscription_end', 'item_id']
score_groups = preds_df.groupby(group_keys)['score']

# Named aggregation yields the flat column names (median, skew, std, var)
# directly, replacing the earlier join-and-split on the MultiIndex labels.
metrics_df = score_groups.agg(
    median='median',
    skew='skew',
    std='std',
    var='var',
).reset_index()

# Each group is a Series, so use Series.kurt. The original passed
# DataFrame.kurt, which only works while pandas lets that method accept a
# Series as `self`.
kurt_df = score_groups.apply(pd.Series.kurt).reset_index(name='kurtosis')

# Attach the group statistics back onto every per-metric row of preds_df.
final_df = metrics_df.merge(kurt_df, on=group_keys)
final_df = final_df.merge(preds_df, on=group_keys, how='left')

In [23]:
from scipy.stats import zscore, mode
# Standardise each score within its (member, period, book) group, using the
# sample standard deviation (ddof=1).
final_df['zscore'] = (
    final_df
    .groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score']
    .transform(lambda group_scores: zscore(group_scores, ddof=1))
)

In [24]:
# Highest score per (member, period, book) group.
top_scores = (
    final_df
    .groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score']
    .max()
    .reset_index()
)
# Join back on the keys plus the score itself to recover the full row(s) that
# carry the maximum.
# NOTE(review): if several metrics tie for a group's maximum, every tied row is
# kept, so a group can appear more than once downstream. Confirm this is
# intended.
top_scores = top_scores.merge(
    final_df,
    how='inner',
    on=['member_id', 'subscription_start', 'subscription_end', 'item_id', 'score'],
)

top_scores = top_scores.rename(
    columns={'score': 'top_score', 'zscore': 'top_zscore'}
)

In [25]:
# Mean score per (member, period, book) group, joined onto the top-score rows.
avg_scores = (
    final_df
    .groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])
    .agg(avg_score=('score', 'mean'))
    .reset_index()
)
scores_df = top_scores.merge(
    avg_scores,
    on=['member_id', 'subscription_start', 'subscription_end', 'item_id'],
)

In [26]:
# Most frequent score per (member, period, book) group.
# `mode(x)[0]` is a 1-element array on older SciPy and a scalar on newer
# releases (the `keepdims` default changed). Flattening and taking the first
# element gives a scalar either way, which the aggregation and the equality
# merge below both need.
# NOTE(review): when all of a group's scores are distinct there is no repeated
# value, and SciPy returns a single one of them. Confirm that this is the
# intended meaning of `mode_score`.
mode_scores = (
    preds_df
    .groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score']
    .agg(lambda x: np.ravel(mode(x)[0])[0])
    .reset_index()
)
# Join back on the keys plus the score to pick up the matching row(s) and
# their z-score.
mode_scores = pd.merge(mode_scores, final_df, on=mode_scores.columns.tolist(), how='inner')
mode_scores = mode_scores.rename(columns={'score': 'mode_score', 'zscore': 'mode_zscore'})

In [27]:
# Combine the mode-based columns with the top/average score table, one join on
# the group keys.
final_scores = mode_scores[
    ['member_id', 'subscription_start', 'subscription_end', 'item_id',
     'mode_score', 'mode_zscore']
].merge(
    scores_df,
    on=['member_id', 'subscription_start', 'subscription_end', 'item_id'],
)

In [28]:
# Distinct (member, subscription period) combinations present in the results.
member_subscriptions = final_scores.loc[
    :, ['member_id', 'subscription_start', 'subscription_end']
].drop_duplicates()

In [29]:
# For each member subscription period, print the five highest-ranked books
# under each of the three scoring schemes.
for index, group in member_subscriptions.iterrows():
    print(group.to_dict())
    same_member = final_scores.member_id == group.member_id
    same_start = final_scores.subscription_start == group.subscription_start
    same_end = final_scores.subscription_end == group.subscription_end
    rows = final_scores[same_member & same_start & same_end]
    for score_col in ('top_score', 'avg_score', 'mode_score'):
        best_five = rows.sort_values(by=[score_col], ascending=False).head(5)
        print(score_col + 's:', best_five[['item_id', score_col]].to_dict())

{'member_id': 'colens', 'subscription_start': '1920-04-01', 'subscription_end': '1920-07-07'}
top_scores: {'item_id': {97: 'joyce-dubliners', 99: 'joyce-portrait-artist-young', 98: 'joyce-exiles', 6: 'bronte-wuthering-heights', 138: 'stein-three-lives'}, 'top_score': {97: 0.923076923076923, 99: 0.8974358974358974, 98: 0.641025641025641, 6: 0.4256953262379782, 138: 0.41025641025641024}}
avg_scores: {'item_id': {97: 'joyce-dubliners', 99: 'joyce-portrait-artist-young', 98: 'joyce-exiles', 6: 'bronte-wuthering-heights', 138: 'stein-three-lives'}, 'avg_score': {97: 0.7095077186568883, 99: 0.6955164889117458, 98: 0.48223320969697514, 6: 0.33510678122811266, 138: 0.3112629109199063}}
mode_scores: {'item_id': {97: 'joyce-dubliners', 99: 'joyce-portrait-artist-young', 98: 'joyce-exiles', 44: 'dreiser-sister-carrie', 6: 'bronte-wuthering-heights'}, 'mode_score': {97: 0.5073298958649277, 99: 0.48417753513294914, 98: 0.3336871903750885, 44: 0.2410777474471742, 6: 0.2410777474471742}}
{'member_id'

In [33]:
# Write the combined scores to CSV, without the row index.
final_scores.to_csv(
    path_or_buf='./public_data/collapse_bipartite_predictions.csv',
    index=False,
)