In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)

import altair as alt
import matplotlib.pyplot as plt
import numpy as np

import warnings
warnings.filterwarnings("ignore")
import sys
sys.path.append("..")

from network_analysis.load_datasets import get_updated_shxco_data
from network_analysis.create_networks import *
from network_analysis.read_write_networks import * 
from network_analysis.link_prediction import * 
members_df, books_df, borrow_events, events_df = get_updated_shxco_data(get_subscription=False)


In [2]:
partial_df = pd.read_csv('../dataset_generator/data/partial_borrowers_collapsed.csv')
partial_df['index_col'] = partial_df.index
partial_df[0:1]



Unnamed: 0,member_id,subscription_start,subscription_end,subscription_events,subscription_volumes,subscription_days,internal_gaps,known_borrows,index_col
0,linossier-raymonde,1921-04-01,1921-07-01,Renewal,1.0,91,,57,0


In [3]:
partial_members = ['raphael-france', 'hemingway-ernest',
                       'colens-fernand', 'kittredge-eleanor-hayden']
partial_df = partial_df[partial_df.member_id.isin(partial_members)]
# parse subscription dates so we can use them to identify circulating books
partial_df['subscription_starttime'] = pd.to_datetime(
    partial_df['subscription_start'], errors='coerce')
partial_df['subscription_endtime'] = pd.to_datetime(
    partial_df['subscription_end'], errors='coerce')

# all_events = events_df[events_df.item_uri.isna() == False].copy()

borrow_events = borrow_events[(borrow_events.start_datetime.isna() == False) & (
    borrow_events.end_datetime.isna() == False)]
all_borrows = borrow_events[borrow_events.start_datetime < '1942-01-01'].copy()


### Bipartite Link Predictions for All Event Types

In [4]:
member_attrs = {'uri': 'member_id'}
book_attrs = {'uri': 'item_uri'}
edge_attrs = {'weight': 'counts'}
all_borrows_grouped = all_borrows.groupby(
    ['member_id', 'item_uri']).size().reset_index(name='counts')

should_process = True
write_to_file = True
sk_metrics = ['katz', 'louvain']
link_metrics = ['HITS', 'CoHITS', 'BiRank', 'BGRM']


all_borrows_bipartite_graph, all_borrows_bipartite_nodelist, all_borrows_bipartite_edgelist, all_borrows_members, all_borrows_books = check_reload_build_bipartite_graphs(
    all_borrows_grouped, member_attrs, book_attrs, edge_attrs, should_process, write_to_file, './public_data/borrow_events_bipartite', sk_metrics, link_metrics, members_df, books_df)


reloading saved graph: ./public_data/borrow_events_bipartite


In [5]:
# Remove disconnected components
remove = all_borrows_bipartite_nodelist[all_borrows_bipartite_nodelist.component != 0 ].uri.tolist()
all_borrows_bipartite_graph.remove_nodes_from(remove)
# all_borrows_bipartite_nodelist[all_borrows_bipartite_nodelist.component != 0 ].to_csv('./public_data/disconnected_nodes.csv')

In [6]:
## Run to get full bipartite link predictions from entire graph
all_preds = get_bipartite_link_predictions(all_borrows_bipartite_graph, './public_data/all_preds.csv')


In [7]:
metrics = ['jc_prediction', 'pa_prediction',
           'cn_prediction', 'aa_prediction']
all_preds[metrics].corr()

Unnamed: 0,jc_prediction,pa_prediction,cn_prediction,aa_prediction
jc_prediction,1.0,-0.023207,0.351671,0.31145
pa_prediction,-0.023207,1.0,0.526078,0.551955
cn_prediction,0.351671,0.526078,1.0,0.980548
aa_prediction,0.31145,0.551955,0.980548,1.0


In [8]:
def get_predictions(overwrite, output_path, metrics, all_preds, events_df):
    if overwrite:
        if os.path.exists(output_path):
            os.remove(output_path)

    if os.path.exists(output_path):
        processed_predictions = pd.read_csv(output_path)
    else:
        partial_df.progress_apply(get_full_predictions, axis=1, number_of_results=300, limit_to_circulation=True, predictions_df=all_preds, events_df=events_df,  predict_group='books', metrics=metrics, output_path=output_path)
        processed_predictions = pd.read_csv(output_path)
    processed_predictions.rename(columns={'predicted_values': 'item_id'}, inplace=True)
    return processed_predictions


In [9]:
events_df['item_id'] = events_df.item_uri

In [10]:
output_path = './public_data/partial_members_bipartite_network_all_events_link_predictions_full.csv'
overwrite = False
processed_predictions = get_predictions(overwrite, output_path, metrics, all_preds, events_df)

In [10]:
# subset_preds = processed_predictions[(processed_predictions.metric == 'aa_prediction') | (processed_predictions.metric == 'cn_prediction')]
subset_preds = processed_predictions[(processed_predictions.metric == 'jc_prediction')]


In [11]:
index_cols = ['member_id', 'item_id', 
       'subscription_start', 'subscription_end', 'subscription_events',
       'subscription_volumes', 'subscription_days', 'internal_gaps',
       'known_borrows', 'index_col', 'subscription_starttime',
       'subscription_endtime']
pivoted_predictions = pd.pivot(processed_predictions, index=index_cols, columns='metric', values='score').reset_index()
# pivoted_predictions = pd.pivot(subset_preds, index=index_cols, columns='metric', values='score').reset_index()


In [12]:
from sklearn.preprocessing import MinMaxScaler

def scale_col(df, cols):
    for col in cols:
        df[col + '_scaled'] = MinMaxScaler().fit_transform(df[col].values.reshape(-1, 1))
    return df

In [13]:
# metrics = ['aa_prediction', 'cn_prediction']
# metrics = ['jc_prediction']
pivoted_predictions = scale_col(pivoted_predictions, metrics)

In [14]:
preds_df = pd.melt(pivoted_predictions, id_vars=index_cols, value_vars=['jc_prediction_scaled', 'pa_prediction_scaled', 'cn_prediction_scaled',
       'aa_prediction_scaled'], var_name='metric', value_name='score')
# preds_df = pd.melt(pivoted_predictions, id_vars=index_cols, value_vars=['jc_prediction_scaled'], var_name='metric', value_name='score')

In [15]:
import altair as alt
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [16]:
preds_df['member_period'] = preds_df.member_id.astype(str) + ': ' + preds_df.subscription_start.astype(str) + '/' + preds_df.subscription_end.astype(str)

In [17]:
charts =[]
dfs = []
numb_of_books = len(preds_df[(preds_df.member_id == 'colens-fernand') & (preds_df.subscription_start == '1920-04-01')].item_id.unique())
numb_of_books = 50
for period in preds_df.member_period.unique().tolist():
  rows=preds_df[(preds_df.member_period == period)]
  books = rows.item_id.sample(n=numb_of_books).reset_index()
  books=books.item_id.unique().tolist()
  final_rows = rows[rows.item_id.isin(books)]
  print('initial rows:', len(rows), 'sampled rows:', len(final_rows))
  dfs.append(final_rows)

initial rows: 7496 sampled rows: 200
initial rows: 568 sampled rows: 172
initial rows: 2472 sampled rows: 200
initial rows: 4868 sampled rows: 196
initial rows: 4092 sampled rows: 196
initial rows: 5912 sampled rows: 196
initial rows: 8788 sampled rows: 196
initial rows: 9972 sampled rows: 200
initial rows: 1648 sampled rows: 196


In [18]:
# df = pd.concat(dfs)
# # df['member_time_period'] = df['member_id'] + ' - ' + df['subscription_start'].astype(str) + '/' + df['subscription_end'].astype(str)
# tickplot = alt.Chart(df).mark_tick(opacity=0.7).encode(
#     y=alt.Y('item_id', sort='-x'),
#     x='score',
#     color='member_period:N',
#     shape='metric:N',
# )
# tickplot

In [38]:
members = preds_df.member_id.unique().tolist()
charts = []
num_of_results = 20
for member in members:
    concat_rows = []
    periods = preds_df[preds_df.member_id == member].member_period.unique().tolist()
    for period in periods:
        print(period)
        rows = preds_df[preds_df.member_period == period]
        rows.sort_values(by='score', ascending=False, inplace=True)
        top_items = rows.item_id.unique().tolist()[0:num_of_results]
        concat_rows.append(rows[rows.item_id.isin(top_items)])
    final_rows = pd.concat(concat_rows)
    chart = alt.Chart(final_rows[final_rows.member_id == member]).mark_point(opacity=0.7).encode(
        y=alt.Y('item_id', sort='-x'),
        x='score',
        color='member_period:N',
        column='member_id:N',
        shape='metric:N',
    ).properties(
        width=150
    )
    charts.append(chart)

colens-fernand: 1920-10-30/1927-11-26
colens-fernand: 1920-04-01/1920-07-07
hemingway-ernest: 1921-12-28/1922-11-08
hemingway-ernest: 1924-03-28/1925-03-28
kittredge-eleanor-hayden: 1924-01-17/1924-05-17
kittredge-eleanor-hayden: 1924-11-24/1926-03-23
kittredge-eleanor-hayden: 1928-09-05/1928-12-05
kittredge-eleanor-hayden: 1929-09-10/1929-12-10
raphael-france: 1920-04-30/1921-11-17


In [39]:
def get_formatted_titles(row):

  item = books_df[books_df.id == row.item_id]
  if item.author.isna().any() == False:
    author = ' '.join(item.author.str.split(',').values[0][::-1])
    author = ', ' + author
  else: 
    author = ' (Periodical)'
  title = item.title.values[0]
  return title + author

In [40]:
concat_rows = []
periods = preds_df[preds_df.member_id == 'hemingway-ernest'].member_period.unique().tolist()
for period in periods:
    print(period)
    rows = preds_df[preds_df.member_period == period]
    rows.sort_values(by='score', ascending=False, inplace=True)
    top_items = rows.item_id.unique().tolist()[0:num_of_results]
    concat_rows.append(rows[rows.item_id.isin(top_items)])
final_rows = pd.concat(concat_rows)
final_rows['formatted_title'] = final_rows.apply(get_formatted_titles, axis=1)


hemingway-ernest: 1921-12-28/1922-11-08
hemingway-ernest: 1924-03-28/1925-03-28


In [41]:
final_rows['period'] = final_rows.member_period.str.split(':').str[1]

In [42]:
len(final_rows.item_id.unique())

26

In [49]:
final_rows.loc[final_rows.metric == 'jc_prediction_scaled', 'metric'] = 'Jaccard Coefficient'
final_rows.loc[final_rows.metric == 'aa_prediction_scaled', 'metric'] = 'Adamic-Adar Index'
final_rows.loc[final_rows.metric == 'cn_prediction_scaled', 'metric'] = 'Common Neighbors'
final_rows.loc[final_rows.metric == 'pa_prediction_scaled', 'metric'] = 'Preferential Attachment'

In [56]:
chart = alt.Chart(final_rows).mark_point(opacity=0.6, filled=True, thickness=3).encode(
    y=alt.Y('formatted_title', sort='-x', axis=alt.Axis(title='Predicted Book')),
    x='score',
    color='period:N',
    # column='member_id:N',
    shape=alt.Shape('metric:N'),
).properties(
    width=300,
    title='Top Predictions by Bipartite Link Prediction Methods'
)
chart.configure_axisY(
        titleAngle=0,
        titleAlign="left",
        titleY=-10,
        titleX=-100,
        labelLimit=1000
    )

In [20]:
alt.hconcat(*charts)

In [21]:
# selection = alt.selection_multi(fields=['member_period'], bind='legend')
# tickplot = alt.Chart(concat_rows).mark_tick(opacity=0.7).encode(
#     y=alt.Y('item_id', sort='-x'),
#     x='score',
#     color='member_period:N',
#     opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
#     column='member_id:N',
#     shape='metric:N',
# ).add_selection(
#     selection
# )
# tickplot

In [22]:
preds_df.score = preds_df.score.fillna(0)

In [23]:
import numpy as np
metrics_df = preds_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id']).agg({'score': [np.median, 'skew', 'std', 'var']}).reset_index()
metrics_df.columns = list(map(''.join, metrics_df.columns.values))
metrics_df.columns = [col if 'score' not in col else col.split('score')[1] for col in metrics_df.columns ]
kurt_df = preds_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].apply(pd.DataFrame.kurt).reset_index(name='kurtosis')
final_df = pd.merge(metrics_df, kurt_df, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])
final_df = pd.merge(final_df, preds_df, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'], how='left')

In [24]:
from scipy.stats import zscore, mode
final_df['zscore'] = final_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].transform(lambda x : zscore(x,ddof=1))

In [25]:
top_scores = final_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id']).agg({'score':'max'})[['score']].reset_index()
top_scores = pd.merge(top_scores, final_df, on=top_scores.columns.tolist(), how='inner')

top_scores = top_scores.rename(columns={'score': 'top_score', 'zscore' : 'top_zscore'})

In [26]:
avg_scores = final_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].mean().reset_index(name='avg_score')
scores_df = pd.merge(top_scores, avg_scores, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])

In [27]:
median_scores = final_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].median().reset_index(name='median_score')
scores_df = pd.merge(scores_df, median_scores, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])

In [28]:
std_scores = final_df.groupby(['member_id', 'subscription_start',
                               'subscription_end', 'item_id']).agg({'score': 'std'}).reset_index()
std_scores = std_scores.rename(columns={'score': 'std_score'})
scores_df = pd.merge(scores_df, std_scores, on=[
                     'member_id', 'subscription_start', 'subscription_end', 'item_id'])


In [29]:
mode_scores = preds_df.groupby(['member_id', 'subscription_start', 'subscription_end', 'item_id'])['score'].agg(lambda x: mode(x)[0]).reset_index()
mode_scores = pd.merge(mode_scores, final_df, on=mode_scores.columns.tolist(), how='inner')
mode_scores = mode_scores.rename(columns={'score': 'mode_score', 'zscore' : 'mode_zscore'})

In [30]:
final_scores = pd.merge(mode_scores[['member_id', 'subscription_start', 'subscription_end', 'item_id', 'mode_score', 'mode_zscore']], scores_df, on=['member_id', 'subscription_start', 'subscription_end', 'item_id'])

In [31]:
member_subscriptions = final_scores[['member_id', 'subscription_start', 'subscription_end']].drop_duplicates()

In [32]:
for index, group in member_subscriptions.iterrows():
  print(group.to_dict())
  rows = final_scores[(final_scores.member_id == group.member_id) & (final_scores.subscription_start == group.subscription_start) & (final_scores.subscription_end == group.subscription_end)]
  print('top_scores:', rows.sort_values(by=['top_score'], ascending=False)[0:5][['item_id', 'top_score']].to_dict())
  print('avg_scores:', rows.sort_values(by=['avg_score'], ascending=False)[0:5][['item_id', 'avg_score']].to_dict())
  print('mode_scores:', rows.sort_values(by=['mode_score'], ascending=False)[0:5][['item_id', 'mode_score']].to_dict())

{'member_id': 'colens-fernand', 'subscription_start': '1920-04-01', 'subscription_end': '1920-07-07'}
top_scores: {'item_id': {101: 'joyce-portrait-artist-young', 99: 'joyce-dubliners', 100: 'joyce-exiles', 6: 'bronte-wuthering-heights', 146: 'stein-three-lives'}, 'top_score': {101: 0.923076923076923, 99: 0.923076923076923, 100: 0.641025641025641, 6: 0.42527920615735987, 146: 0.41025641025641024}}
avg_scores: {'item_id': {101: 'joyce-portrait-artist-young', 99: 'joyce-dubliners', 100: 'joyce-exiles', 6: 'bronte-wuthering-heights', 146: 'stein-three-lives'}, 'avg_score': {101: 0.717102935043487, 99: 0.7122986293345589, 100: 0.48420610929400165, 6: 0.33627990199232694, 146: 0.31259883815020245}}
mode_scores: {'item_id': {99: 'joyce-dubliners', 101: 'joyce-portrait-artist-young', 100: 'joyce-exiles', 6: 'bronte-wuthering-heights', 45: 'dreiser-sister-carrie'}, 'mode_score': {99: 0.5084420179961582, 101: 0.49684056212718636, 100: 0.33442017996158124, 6: 0.2416085330098069, 45: 0.2416085330

In [33]:
final_scores_dedup = final_scores.drop_duplicates()

In [36]:
final_scores_dedup['coef_var'] = (final_scores_dedup.std_score/ final_scores_dedup.median_score)

In [37]:
for index, group in member_subscriptions.iterrows():
  print(group.to_dict())
  rows = final_scores_dedup[(final_scores_dedup.member_id == group.member_id) & (final_scores_dedup.subscription_start == group.subscription_start) & (final_scores_dedup.subscription_end == group.subscription_end)]
  print('coef_scores:', rows.sort_values(by=['coef_var'], ascending=True)[0:5][['item_id', 'coef_var', 'median_score']].to_dict())
  print('median_scores:', rows.sort_values(by=['median_score'], ascending=False)[0:5][['item_id', 'median_score', 'coef_var']].to_dict())
#   print('mode_scores:', rows.sort_values(by=['mode_score'], ascending=False)[0:5][['item_id', 'mode_score']].to_dict())

{'member_id': 'colens-fernand', 'subscription_start': '1920-04-01', 'subscription_end': '1920-07-07'}
coef_scores: {'item_id': {71: 'hardy-desperate-remedies', 47: 'drinkwater-prose-papers', 102: 'keynes-economic-consequences-peace', 59: 'galsworthy-mob', 76: 'hardy-return-native'}, 'coef_var': {71: 0.13745879812099412, 47: 0.13768601083100104, 102: 0.13837445156392672, 59: 0.1493577668341456, 76: 0.15287349184072513}, 'median_score': {71: 0.02202532404437354, 47: 0.022107685378124783, 102: 0.021502085303820445, 59: 0.045598835285345564, 76: 0.043440367223484745}}
median_scores: {'item_id': {101: 'joyce-portrait-artist-young', 99: 'joyce-dubliners', 100: 'joyce-exiles', 6: 'bronte-wuthering-heights', 146: 'stein-three-lives'}, 'median_score': {101: 0.7242471274849192, 99: 0.7088377881325771, 100: 0.48068930809439214, 6: 0.3391159344010705, 146: 0.3108666605362681}, 'coef_var': {101: 0.27962079057948974, 99: 0.27191251953165835, 100: 0.28071702607743687, 6: 0.2798636055918552, 146: 0.27

In [38]:
len(final_scores), len(final_scores_dedup)

(17998, 12021)

In [39]:
final_scores_dedup.to_csv('./public_data/collapsed_bipartite_predictions.csv', index=False)