In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)
import networkx as nx
from networkx.algorithms import bipartite
# import community
from networkx.readwrite import json_graph
# import nx_altair as nxa
from networkx.algorithms.community import greedy_modularity_communities
from pyvis import network as net
# from node2vec import Node2Vec
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import itertools
import collections
from tqdm.notebook import trange, tqdm
tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")
from IPython.display import display, Markdown, HTML
import sys
sys.path.append("..")
from bigraph.predict import pa_predict, jc_predict, cn_predict,aa_predict, katz_predict
from bigraph.evaluation import evaluation
from network_analysis.birankpy import BipartiteNetwork
from network_analysis.load_datasets import get_updated_shxco_data
from network_analysis.generate_network_metrics import *
from network_analysis.create_networks import *
from network_analysis.read_write_networks import * 
from network_analysis.link_prediction import * 
# Load the four project DataFrames used throughout the notebook. The helper comes from
# network_analysis.load_datasets; the meaning of get_subscription=False is defined there.
members_df, books_df, borrow_events, events_df = get_updated_shxco_data(get_subscription=False)


In [2]:
from dateutil.relativedelta import relativedelta

# Six-month calendar offset; later cells subtract it from a subscription start date
# to open the "circulation" window six months before the subscription begins.
sixmonths = relativedelta(months=6)
sixmonths


relativedelta(months=+6)

In [4]:
# Read the partial-borrowers table and keep a copy of its row index as a regular
# column (index_col), then preview the first row.
partial_df = (
    pd.read_csv('../dataset_generator/data/partial_borrowers.csv')
    .assign(index_col=lambda frame: frame.index)
)
partial_df.iloc[0:1]



Unnamed: 0,member_id,subscription_start,subscription_end,known_borrows,index_col
0,martin-maud,1923-10-17,1923-11-17,36,0


In [5]:
# Distinct member ids that appear in the partial-borrowers table.
partial_members = partial_df['member_id'].unique().tolist()

# Parse the subscription boundaries into datetimes (unparseable values become NaT)
# so they can be compared against event dates when selecting circulating books.
for boundary in ('start', 'end'):
    partial_df[f'subscription_{boundary}time'] = pd.to_datetime(
        partial_df[f'subscription_{boundary}'], errors='coerce')

# Work on a copy of the borrow events so the loaded frame is left untouched.
# (An earlier variant used every events_df row that has an item_uri instead.)
all_events = borrow_events.copy()


### Bipartite Link Predictions for All Event Types

In [6]:
# Column mappings handed to the graph builder: which column identifies each node
# type and which column carries the edge weight.
member_attrs = {'uri': 'member_id'}
book_attrs = {'uri': 'item_uri'}
edge_attrs = {'weight': 'counts'}

# One row per (member, item) pair with the number of events between them.
pair_sizes = all_events.groupby(['member_id', 'item_uri']).size()
all_events_grouped = pair_sizes.reset_index(name='counts')

# Builder switches and the metric families it should compute.
should_process = True
write_to_file = True
sk_metrics = ['katz', 'louvain']
link_metrics = ['HITS', 'CoHITS', 'BiRank', 'BGRM']

(
    all_events_bipartite_graph,
    all_events_bipartite_nodelist,
    all_events_bipartite_edgelist,
    all_events_members,
    all_events_books,
) = check_reload_build_bipartite_graphs(
    all_events_grouped,
    member_attrs,
    book_attrs,
    edge_attrs,
    should_process,
    write_to_file,
    './data/all_events_bipartite',
    sk_metrics,
    link_metrics,
    members_df,
    books_df,
)


reloading saved graph: ./data/all_events_bipartite


In [7]:
# Score candidate links on the full bipartite graph. The returned frame carries the
# jc/pa/cn/aa prediction columns that the following cells correlate and rank by.
all_preds = get_bipartite_link_predictions(all_events_bipartite_graph)


Running jaccard link prediction
Jaccard prediction starting...
Jaccard Executed in 49.35101890563965 seconds 

Running preferential attachment link prediction
Preferential_attachment prediction starting...
Preferential attachment Executed in 31.891121864318848 seconds 

Running common neighbors link prediction
Common neighbor prediction starting...
Common neighbours Executed in 31.474580764770508 seconds 

Running adamic adar link prediction
Adamic_adar prediction starting...
Adamic-adar Executed in 135.7455222606659 seconds 



In [8]:
# Prediction score columns (Jaccard, preferential attachment, common neighbours,
# Adamic-Adar per the log output above); `metrics` is reused by the per-member loop below.
metrics = ['jc_prediction', 'pa_prediction',
           'cn_prediction', 'aa_prediction']
# Pairwise correlation between the four scores.
all_preds[metrics].corr()

Unnamed: 0,jc_prediction,pa_prediction,cn_prediction,aa_prediction
jc_prediction,1.0,-0.018728,0.35268,0.312518
pa_prediction,-0.018728,1.0,0.531993,0.559736
cn_prediction,0.35268,0.531993,1.0,0.981303
aa_prediction,0.312518,0.559736,0.981303,1.0


In [9]:
# Map each book uri to its title (for duplicate uris the last title wins).
title_lookup = dict(zip(books_df.uri, books_df.title))

In [10]:
# Remove any previous export so the run starts from an empty file, then apply
# get_full_predictions to every partial-borrower row (output_path is passed to it).
# NOTE(review): `os` is not imported explicitly in this notebook — presumably it arrives
# through one of the `from network_analysis... import *` lines; confirm.
# NOTE(review): the recorded run failed with "NameError: name 'events_df' is not defined"
# after the progress bar started, which suggests the error is raised inside
# get_full_predictions rather than in this cell — verify in network_analysis.link_prediction.
output_path = './data/partial_members_bipartite_network_all_events_link_predictions.csv'
if os.path.exists(output_path):
    os.remove(output_path)
partial_df.progress_apply(get_full_predictions, axis=1, number_of_results=10, limit_to_circulation=True, predictions_df=all_preds, events_df=events_df, relative_date=sixmonths, predict_group='books', output_path=output_path)


  0%|          | 0/219 [00:00<?, ?it/s]

NameError: name 'events_df' is not defined

In [None]:
# Read back the CSV at output_path (set in an earlier cell).
processed_predictions = pd.read_csv(output_path)


In [None]:
# For every partial borrower, collect the most frequent items and the top-ranked link
# predictions (per metric) for two candidate pools, and append them to one CSV:
#   * "all"       - events that ended on or before the subscription end
#   * "sixmonths" - events overlapping [subscription start - 6 months, subscription end]
number_of_results = 10
limit_to_circulation = True
predictions_df = all_preds.copy()
output_path = './data/partial_members_bipartite_network_all_events_link_predictions.csv'


def _top_items_by_event_count(events, limit):
    """Return the `limit` most frequent item_uri values in `events` with their counts."""
    return events.groupby(['item_uri']).size().reset_index(
        name='counts').sort_values(['counts'], ascending=False)[0:limit]


# Start from a clean file once, before the loop. The previous check
# (`index == 0` inside the loop) silently kept stale rows whenever the first index
# label was not 0.
if os.path.exists(output_path):
    os.remove(output_path)

for index, row in tqdm(partial_df.iterrows(), total=partial_df.shape[0]):
    identified_top_predictions = {}

    circulation_start = row.subscription_starttime - sixmonths

    all_possible_circulations = events_df[(
        row.subscription_endtime >= events_df.end_datetime)]
    started_in_window = events_df.start_datetime.between(
        circulation_start, row.subscription_endtime)
    ended_in_window = events_df.end_datetime.between(
        circulation_start, row.subscription_endtime)
    circulation_events = events_df[started_in_window | ended_in_window]

    popular_all = _top_items_by_event_count(
        all_possible_circulations, number_of_results)
    popular_current = _top_items_by_event_count(
        circulation_events, number_of_results)

    circulation_all = all_possible_circulations.item_uri.unique().tolist()
    circulation_books = circulation_events.item_uri.unique().tolist()

    identified_top_predictions['popular_all_books'] = popular_all.item_uri.tolist()
    identified_top_predictions['popular_all_counts'] = popular_all.counts.tolist()
    identified_top_predictions['popular_current_books'] = popular_current.item_uri.tolist()
    identified_top_predictions['popular_current_counts'] = popular_current.counts.tolist()

    for metric_name in metrics:
        subset_all_predictions = get_predictions_by_metric(
            row, metric_name, predictions_df, circulation_all, limit_to_circulation)
        subset_predictions = get_predictions_by_metric(
            row, metric_name, predictions_df, circulation_books, limit_to_circulation)

        top_all = subset_all_predictions[0:number_of_results]
        top_sixmonths = subset_predictions[0:number_of_results]

        identified_top_predictions[f'{metric_name}_all'] = top_all.item_uri.tolist()
        identified_top_predictions[f'{metric_name}_sixmonths'] = top_sixmonths.item_uri.tolist()
        # Fix: the "_all_scores" column used to be filled from the six-month subset,
        # so its scores did not belong to the books listed in "<metric>_all".
        identified_top_predictions[f'{metric_name}_all_scores'] = top_all[metric_name].tolist()
        identified_top_predictions[f'{metric_name}_six_months_scores'] = top_sixmonths[metric_name].tolist()

    # Each column is an independent ranking and may hold fewer than
    # number_of_results entries; wrapping in Series pads short columns with NaN
    # instead of raising on unequal lengths.
    df_final = pd.DataFrame(
        {column: pd.Series(values) for column, values in identified_top_predictions.items()})

    df_final['member_id'] = row.member_id
    df_final['subscription_starttime'] = row.subscription_starttime
    df_final['subscription_endtime'] = row.subscription_endtime
    df_final['known_borrows'] = row.known_borrows

    # Append; write the header only when this is the first write to the file.
    df_final.to_csv(output_path, mode='a', index=False,
                    header=not os.path.exists(output_path))


In [None]:
# Turn the bipartite edgelist rows into tuples, build a bipartite structure from them
# and fit a Jaccard-index model on its biadjacency matrix.
# NOTE(review): convert_edge_list and JaccardIndex are not imported explicitly in this
# notebook — presumably they come from a star import (they look like scikit-network
# names); confirm before relying on a fresh-kernel run.
tuples = [tuple(x) for x in all_events_bipartite_edgelist.values]
graph = convert_edge_list(tuples, bipartite=True)
biadjacency = graph.biadjacency
names = graph.names
ji = JaccardIndex()
ji.fit(biadjacency)


In [None]:
# For each partial borrower, print the ten names with the highest Jaccard score
# relative to that member's row.
for member in partial_members:
    # Position of the first entry in `names` equal to this member id.
    member_position = np.flatnonzero(names == member)[0]

    ji_scores = ji.predict(member_position)

    # Negating the scores makes argsort order them from highest to lowest.
    ten_best = np.argsort(-ji_scores)[0:10]
    print(f'most similar members based on book history of {member}', names[ten_best])


In [None]:
# Keep only the largest connected component of `graph` as G.
# NOTE(review): `graph` was last assigned from convert_edge_list(...) in an earlier cell;
# these networkx calls need a networkx graph — confirm which object is intended here.
largest_component = max(nx.connected_components(graph), key=len)
G = graph.subgraph(largest_component)


In [None]:
# `bipartite` is already imported in the first cell; this re-import is redundant.
from networkx.algorithms import bipartite

# Average bipartite clustering coefficient of `graph`.
print(bipartite.average_clustering(graph))
# NOTE(review): the disabled line below spells the attribute `transitivy`; fix the
# name before re-enabling it.
# print(bipartite.transitivy(G))


In [None]:
# Edge and node counts of `graph`, shown as a tuple.
graph.number_of_edges(), graph.number_of_nodes()

In [None]:
# Run the bigraph evaluation helper on `graph`; the next cell reads the result as a
# mapping whose values are indexable (positions 2 and 3 are used as TPR and FPR).
# NOTE(review): the meaning of k=2 and method='all' is defined in bigraph.evaluation — confirm.
results = evaluation.evaluate(graph, k=2, method='all')


In [None]:


# Plot the ROC curve for the first entry of `results`: the key is the metric name,
# positions 2 and 3 of its value are used as TPR and FPR.
metric, first_result = next(iter(results.items()))
tpr = first_result[2]
fpr = first_result[3]

evaluation._plot_ROC(fpr, tpr, metric)


In [None]:
# Empty containers for a hand-rolled similarity computation, plus the two node sets
# of G split by their 'bipartite' attribute (0 = left, 1 = right).
hop2s = dict()
neighbors = dict()
# Use the explicitly imported `collections` module: a bare `defaultdict` is never
# imported by name in this notebook and only works if a star import happens to leak it.
katz_sim = collections.defaultdict(dict)
sortDic = {}
left_set = [n for n, d in G.nodes(data=True) if d['bipartite'] == 0]
right_set = [n for n, d in G.nodes(data=True) if d['bipartite'] == 1]


In [None]:
# 1 if 'squire-london-mercury' is a neighbour of 'colens' in G, otherwise 0.
int('squire-london-mercury' in G['colens'])



### Circulation Specific Bipartite Link Predictions

In [12]:
# Build a bipartite graph restricted to what was circulating around one member's
# subscription, then score candidate links on that graph.
member_attrs = {'uri': 'member_id'}
book_attrs = {'uri': 'item_uri'}
edge_attrs = {'weight': 'counts'}
should_process = True
write_to_file = False
sk_metrics = ['katz', 'louvain']
link_metrics = ['HITS', 'CoHITS', 'BiRank', 'BGRM']
predictions_df = None
# Pre-declare the loop outputs so later cells can reference them even if the loop
# body never runs.
circulation_events_bipartite_graph = None
circulation_events_bipartite_nodelist = None
circulation_events_books = None
circulation_events_members = None
popular_books = None
number_of_results = 5
# Only the row at position 9 is processed here (a single-member trial run).
for index, row in partial_df[9:10].iterrows():
    print(f'Processing {row.member_id} with subscription {row.subscription_start}')
    is_member = events_df.member_id == row.member_id
    # Every event of this member that names an item, regardless of date.
    seed_data = events_df[is_member & events_df.item_uri.notna()]

    circulation_start = row.subscription_starttime - sixmonths
    # Events that start or end between six months before the subscription start
    # and the subscription end.
    window_events = events_df[events_df.start_datetime.between(
        circulation_start, row.subscription_endtime) | events_df.end_datetime.between(circulation_start, row.subscription_endtime)]
    # Events that ended on or before the subscription end.
    prior_events = events_df[(row.subscription_endtime >= events_df.end_datetime)]

    # Fix: count the member's own item events BEFORE their rows are dropped below.
    # Previously both counts were taken from the already-filtered frames and were
    # therefore always 0.
    member_borrows = len(window_events[(window_events.member_id == row.member_id) & window_events.item_uri.notna()])
    all_member_borrows = len(prior_events[(prior_events.member_id == row.member_id) & prior_events.item_uri.notna()])
    # print(row.member_id, member_borrows, len(circulation_events), len(seed_data))

    # Candidate pools exclude the member's own events.
    circulation_events = window_events[window_events.member_id != row.member_id]
    all_possible_circulations = prior_events[prior_events.member_id != row.member_id]

    popular_current = circulation_events.groupby(['item_uri']).size().reset_index(
        name='counts').sort_values(['counts'], ascending=False)[0:number_of_results]
    popular_books = popular_current.item_uri.tolist()

    popular_all = all_possible_circulations.groupby(['item_uri']).size().reset_index(
        name='counts').sort_values(['counts'], ascending=False)[0:number_of_results]
    popular_books_all = popular_all.item_uri.tolist()

    # Graph input = the member's own item events + the other members' events of each pool.
    sixmonths_graph_data = pd.concat([seed_data, circulation_events], axis=0)
    all_graph_data = pd.concat([seed_data, all_possible_circulations], axis=0)

    # NOTE(review): all_circulation_events_grouped is computed but only the six-month
    # grouping is passed to the graph builder below.
    all_circulation_events_grouped = all_graph_data.groupby(['member_id', 'item_uri']).size().reset_index(name='counts')
    circulation_events_grouped = sixmonths_graph_data.groupby(
        ['member_id', 'item_uri']).size().reset_index(name='counts')

    circulation_events_bipartite_graph, circulation_events_bipartite_nodelist, circulation_events_bipartite_edgelist, circulation_events_members, circulation_events_books = check_reload_build_bipartite_graphs(
        circulation_events_grouped, member_attrs, book_attrs, edge_attrs, should_process, write_to_file, 'test2', sk_metrics, link_metrics, members_df, books_df)

    predictions_df = get_bipartite_link_predictions(circulation_events_bipartite_graph)

    print(predictions_df[predictions_df.member_id == row.member_id].sort_values(['pa_prediction'], ascending=False))

Processing jolas-maria with subscription 1922-01-17
building graph: test2
connected? False
bipartite? True
[276, 7, 6, 6, 5, 4, 2, 2, 2]
graph density:  0.033860502838605026
calculating global degree
calculating top global clustering
calculating bottom global clustering
calculating global closeness
calculating global closeness


  0%|          | 0/9 [00:00<?, ?it/s]

calculating local skmetrics: katz louvain HITS CoHITS BiRank BGRM
component 1 - size 6 - graph density  1.0
calculating katz
component 2 - size 276 - graph density  0.04435483870967742
calculating katz
component 3 - size 6 - graph density  1.0
calculating katz
component 4 - size 5 - graph density  1.0
calculating katz
component 5 - size 2 - graph density  1.0
calculating katz
component 6 - size 2 - graph density  1.0
calculating katz
component 7 - size 7 - graph density  1.0
calculating katz
component 8 - size 2 - graph density  1.0
calculating katz
component 9 - size 4 - graph density  1.0
calculating katz
calculating global skmetrics: katz louvain
calculating katz
calculating global link metrics: : HITS CoHITS BiRank BGRM
Running jaccard link prediction
Jaccard prediction starting...
Jaccard Executed in 0.059008121490478516 seconds 

Running preferential attachment link prediction
Preferential_attachment prediction starting...
Preferential attachment Executed in 0.0442500114440918 se

In [13]:
# Show which member the [9:10] slice used in the previous cell refers to.
partial_df[9:10].member_id

9    jolas-maria
Name: member_id, dtype: object

In [19]:
# First 20 predictions for jolas-maria, ordered by Jaccard score (highest first).
# In the recorded output jc/cn/aa are all NaN for this member; only pa has values.
predictions_df[predictions_df.member_id == 'jolas-maria'].sort_values(
    by=['jc_prediction'], ascending=False)[0:20]


Unnamed: 0,member_id,item_uri,jc_prediction,pa_prediction,cn_prediction,aa_prediction
1505,jolas-maria,clouston-lunatic-large-novel,,10,,
1506,jolas-maria,fielding-history-tom-jones,,5,,
1507,jolas-maria,strachey-eminent-victorians,,10,,
1508,jolas-maria,foster-coquette-history-eliza,,15,,
1509,jolas-maria,kaye-smith-green-apple-harvest,,10,,
1510,jolas-maria,lawrence-trespasser,,15,,
1511,jolas-maria,loving-fifty-contemporary-one,,10,,
1512,jolas-maria,man-king,,5,,
1513,jolas-maria,sayler-russian-theatre,,10,,
1514,jolas-maria,andersen-nexo-ditte-daughter-man,,5,,


In [None]:
# Predictions for member 'colens', ordered by Jaccard score (highest first).
# NOTE(review): the last recorded build of predictions_df was for partial_df[9:10]
# (jolas-maria); confirm that graph actually contains 'colens', otherwise this is empty.
top_books = predictions_df[predictions_df.member_id == 'colens'].sort_values(by='jc_prediction', ascending=False)
top_books

In [None]:
# Rows of top_books that actually have a Jaccard score.
top_books[top_books.jc_prediction.notna()]

In [None]:
# Ten nodes of group 'books' with the highest global_degree in the circulation graph's nodelist.
circulation_events_bipartite_nodelist[circulation_events_bipartite_nodelist.group == 'books'].sort_values(by='global_degree', ascending=False)[0:10]


In [None]:
# Most frequent item_uri values in the six-month window, as set by the circulation loop above.
popular_books

In [None]:
# Nodelist rows (node-level metrics) for the books that appear in top_books.
book_list = top_books.item_uri.tolist()
top_books_nodes = circulation_events_bipartite_nodelist[circulation_events_bipartite_nodelist.uri.isin(book_list)]
# Disabled: correlation matrix across the node metrics of these books.
# top_books_nodes[['global_degree', 'local_degree',
#                  'global_clustering', 'local_clustering', 'global_closeness',
#                  'local_closeness', 'global_betweenness', 'local_betweenness',
#                  'local_katz', 'local_HITS',
#                  'local_CoHITS', 'local_BiRank', 'local_BGRM', 'global_katz',
#                  'global_HITS', 'global_CoHITS', 'global_BiRank',
#                  'global_BGRM']].corr()
top_books_nodes

In [None]:
# not_top_books = circulation_events_bipartite_nodelist[circulation_events_bipartite_nodelist.uri.isin(book_list) == False]
# not_top_books[['global_degree', 'local_degree',
#                'global_clustering', 'local_clustering', 'global_closeness',
#                'local_closeness', 'global_betweenness', 'local_betweenness',
#                'local_katz', 'local_HITS',
#                'local_CoHITS', 'local_BiRank', 'local_BGRM', 'global_katz',
#                'global_HITS', 'global_CoHITS', 'global_BiRank',
#                'global_BGRM']].corr()


In [None]:
# Print the nodelist row of every book in top_books, one lookup per book.
for idx, book in top_books.iterrows():
    # print(events_df[(events_df.member_id == 'hemingway') & (events_df.item_uri == book.item_uri)])
    print(circulation_events_bipartite_nodelist[circulation_events_bipartite_nodelist.uri == book.item_uri])


In [None]:
# Titles of the books in top_books; raises KeyError if a uri is missing from title_lookup.
[title_lookup[x] for x in top_books.item_uri.values]

In [None]:
# Print the attribute dict of node 'martin-maud' if the circulation graph contains it.
if 'martin-maud' in circulation_events_bipartite_graph:
    print('martin-maud', circulation_events_bipartite_graph.nodes['martin-maud'])


In [None]:
# Predictions for member 'martin-maud' in the current predictions_df.
# NOTE(review): predictions_df depends on which member the circulation loop last processed — confirm.
predictions_df[predictions_df.member_id == 'martin-maud']


### Unipartite Link Predictions

In [None]:
# Column mappings and switches for the unipartite (projected) graph builder.
member_attrs = {'uri': 'member_id'}
book_attrs = {'uri': 'item_uri'}
edge_attrs = {'weight': 'counts'}
node_attrs = {}
should_process = True
write_to_file = True
sk_metrics = ['katz', 'louvain']
link_metrics = ['pagerank', 'hubs', 'auth']
is_projected = True

# One row per (member, item) pair with the number of events between them.
pair_sizes = all_events.groupby(['member_id', 'item_uri']).size()
all_events_grouped = pair_sizes.reset_index(name='counts')

# The builder returns graph / nodelist / edgelist / node frame for the members
# projection first, then the same four objects for the books projection.
(
    projected_members_graph,
    projected_members_nodelist,
    projected_members_edgelist,
    projected_members,
    projected_books_graph,
    projected_books_nodelist,
    projected_books_edgelist,
    projected_books,
) = check_reload_build_unipartite_graphs(
    all_events_grouped,
    all_events,
    member_attrs,
    book_attrs,
    edge_attrs,
    node_attrs,
    should_process,
    write_to_file,
    './data/all_events_unipartite_projected',
    sk_metrics,
    link_metrics,
    members_df,
    books_df,
    is_projected,
)


In [30]:

def get_link_predictions(graph, nodelist):
    """Score candidate links of ``graph`` with two networkx link predictors.

    Parameters
    ----------
    graph : networkx graph
        Passed unchanged to ``nx.jaccard_coefficient`` and
        ``nx.preferential_attachment``.
    nodelist : pandas.DataFrame
        Must have a ``uri`` column listing every node that can appear in a
        scored pair.

    Returns
    -------
    pandas.DataFrame
        One row per scored pair with the columns ``source``, ``target``,
        ``jaccard_coef_prediction``, ``updated_source``, ``updated_target`` and
        ``preferential_attachment_prediction``.

    Raises
    ------
    IndexError
        If a scored node is not present in ``nodelist.uri`` (the same exception
        type the former per-row lookup raised).
    """
    # The former implementation scanned `nodelist` once per row just to return the
    # value it searched for; a single set membership check gives the same columns
    # without the quadratic cost (and without the four progress bars).
    known_uris = set(nodelist.uri)

    jaccard_df = _scored_pairs_frame(
        nx.jaccard_coefficient(graph), 'jaccard_coef_prediction', known_uris)
    pref_attach_df = _scored_pairs_frame(
        nx.preferential_attachment(graph), 'preferential_attachment_prediction', known_uris)

    pred_edges = pd.merge(jaccard_df, pref_attach_df, on=['updated_source', 'updated_target', 'source', 'target'])
    return pred_edges


def _scored_pairs_frame(scored_pairs, score_column, known_uris):
    """Tabulate (source, target, score) triples and add validated ``updated_*`` columns."""
    frame = pd.DataFrame(list(scored_pairs), columns=['source', 'target', score_column])
    for endpoint in ('source', 'target'):
        unknown = set(frame[endpoint]) - known_uris
        if unknown:
            raise IndexError(
                f'{len(unknown)} {endpoint} node(s) missing from nodelist.uri, e.g. {next(iter(unknown))!r}')
        # The looked-up uri equals the node id itself, so the column is a straight copy.
        frame[f'updated_{endpoint}'] = frame[endpoint]
    return frame



In [21]:
from networkx.algorithms import bipartite

# Nodes of the circulation graph whose 'group' attribute is 'members'.
member_nodes = [
    node
    for node, attrs in circulation_events_bipartite_graph.nodes(data=True)
    if attrs['group'] == 'members'
]
# Weighted one-mode projection of the bipartite graph onto those member nodes.
members_graph = bipartite.weighted_projected_graph(
    circulation_events_bipartite_graph, member_nodes)


In [22]:
# Node and edge tables for the members projection.
# NOTE(review): the meaning of the two boolean flags is defined in the project helper — confirm.
nodelist ,edgelist = generate_dataframes(members_graph, False, True)

In [31]:
# Jaccard and preferential-attachment scores for candidate member pairs of the projection.
pred_edges = get_link_predictions(
    members_graph, nodelist)


  0%|          | 0/572 [00:00<?, ?it/s]

  0%|          | 0/572 [00:00<?, ?it/s]

  0%|          | 0/572 [00:00<?, ?it/s]

  0%|          | 0/572 [00:00<?, ?it/s]

In [32]:
# Rank candidate pairs by Jaccard score, breaking ties by preferential attachment
# (both descending).
pred_edges.sort_values(by=['jaccard_coef_prediction',
                       'preferential_attachment_prediction'], ascending=False)


Unnamed: 0,source,target,jaccard_coef_prediction,updated_source,updated_target,preferential_attachment_prediction
547,milhaud,wilder-thornton,0.666667,milhaud,wilder-thornton,6
329,raphael-france,ottocar,0.500000,raphael-france,ottocar,35
168,treirse,ottensooser,0.500000,treirse,ottensooser,2
319,raphael-france,lucas-mrs,0.428571,raphael-france,lucas-mrs,21
37,rolland-madeleine,wyck,0.400000,rolland-madeleine,wyck,48
...,...,...,...,...,...,...
559,martin-simone,summerell,0.000000,martin-simone,summerell,0
563,violette,summerell,0.000000,violette,summerell,0
567,summerell,marsland,0.000000,summerell,marsland,0
568,summerell,ottensooser,0.000000,summerell,ottensooser,0


In [34]:
# Candidate pairs whose target is jolas-maria; pairs where she is the source are not
# included by this filter.
pred_edges[pred_edges.target == 'jolas-maria']

Unnamed: 0,source,target,jaccard_coef_prediction,updated_source,updated_target,preferential_attachment_prediction
5,renoir,jolas-maria,0.0,renoir,jolas-maria,0
35,rolland-madeleine,jolas-maria,0.0,rolland-madeleine,jolas-maria,0
62,linossier-raymonde,jolas-maria,0.0,linossier-raymonde,jolas-maria,0
92,rice-matilda,jolas-maria,0.0,rice-matilda,jolas-maria,0
118,stein-gertrude,jolas-maria,0.0,stein-gertrude,jolas-maria,0
150,treirse,jolas-maria,0.0,treirse,jolas-maria,0
171,varney,jolas-maria,0.0,varney,jolas-maria,0


In [None]:
# Correlation between the two link-prediction scores.
pred_edges[['preferential_attachment_prediction', 'jaccard_coef_prediction']].corr()


In [None]:
# Node and edge tables for the projected books graph.
# NOTE(review): this overwrites the `nodelist`/`edgelist` built from the members
# projection in an earlier cell, so re-running earlier cells afterwards sees book data.
nodelist, edgelist = generate_dataframes(projected_books_graph, False, True)


In [None]:
# get_link_predictions returns a single DataFrame, so unpacking it into two names
# fails. Keep `books_nodelist` available by aliasing the nodelist that was passed in.
pred_edges_books = get_link_predictions(projected_books_graph, nodelist)
books_nodelist = nodelist


In [None]:
# Connected components of the members projection, largest first.
components = sorted(
    nx.connected_components(projected_members_graph), key=len, reverse=True)

# One frame of common-neighbour-centrality scores per component; each component is
# scored on its own subgraph.
preds_dfs = [
    pd.DataFrame(
        list(nx.common_neighbor_centrality(
            projected_members_graph.subgraph(component))),
        columns=['source', 'target', 'common_neighbor_centrality_prediction'])
    for component in components
]


In [None]:
# Resource-allocation index for candidate pairs of the members projection,
# materialised as a list and then tabulated.
preds = [*nx.resource_allocation_index(projected_members_graph)]

score_columns = ['source', 'target', 'resource_allocation_prediction']
pred_df = pd.DataFrame(preds, columns=score_columns)


In [None]:
def update_edge_labels(rows):
    # Look up the nodes_df row whose label equals the given value and return that label.
    # As written this returns its own input when a match exists and raises IndexError
    # when none does.
    # NOTE(review): nodes_df is not defined anywhere in the visible notebook — confirm
    # where it comes from, and whether another column (not `label`) was meant to be returned.

    return nodes_df[nodes_df.label == rows].label.values[0]
# Apply the lookup to both endpoints of every predicted edge.
pred_df['updated_target'] = pred_df.target.apply(update_edge_labels)
pred_df['updated_source'] = pred_df.source.apply(update_edge_labels)


In [None]:
# Flag each projected member as exceptional (has a non-null exceptional_types value)
# and/or partial (listed in the partial-borrowers table).
members = projected_members.copy()
partial_members = partial_df.member_id.unique().tolist()
# Direct boolean columns replace the four masked .loc assignments; the values are
# the same and the columns get a proper bool dtype.
members['is_exceptional'] = members.exceptional_types.notna()
members['is_partial'] = members.member_id.isin(partial_members)


In [None]:
# Distinct members and items among events that started before 1925-01-01.
# Note: `members` held a DataFrame in the previous cell and becomes a list here.
started_before_1925 = all_events.start_datetime < '1925-01-01'
early_events = all_events[started_before_1925]
members = early_events.member_id.unique().tolist()
books = early_events.item_uri.unique().tolist()


In [None]:
# Every possible (member, book) pair; the list has len(members) * len(books) entries
# and is fully materialised in memory.
combos = list(itertools.product(members, books))
len(combos)

In [None]:
# Tabulate the pairs with the member as `source` and the book as `target`.
combo_df = pd.DataFrame(data=combos, columns=['source', 'target'])


In [None]:
# Combine all candidate (member, book) pairs with the observed edges; pairs without an
# observed edge get weight 0.
# NOTE(review): edges_df is not defined anywhere in the visible notebook — confirm its source.
edgelist = pd.merge(combo_df, edges_df, on=['source', 'target'], how='outer')
# Assign the filled column back: `edgelist.weight.fillna(0, inplace=True)` operates on
# an intermediate Series and does not update the frame under pandas copy-on-write.
edgelist['weight'] = edgelist['weight'].fillna(0)

In [None]:
# Attach selected member attributes to the node table by matching member_id to uri
# (inner join, so nodes without a member record are dropped).
# NOTE(review): nodes_df is not defined anywhere in the visible notebook — confirm its source.
nodelist = pd.merge(members_df[['gender', 'is_organization', 'member_id', 'borrow_count', 'subscription_count', 'exceptional_types',
            'exceptional_counts']], nodes_df, left_on='member_id', right_on='uri')


In [None]:
import random

# Fixed seed so the sample (and everything trained on it) is reproducible.
TRAINING_SAMPLE_SEED = 42

# Half of the observed edges ...
observed_sample = edgelist[edgelist.weight > 0].sample(
    frac=0.5, random_state=TRAINING_SAMPLE_SEED)
# ... plus an equally sized sample of absent pairs. Previously only observed edges
# were sampled, so the `edge_exists` label derived from `weight` in a later cell was
# 1 for every row and a classifier had nothing to learn.
absent_pool = edgelist[edgelist.weight == 0]
absent_sample = absent_pool.sample(
    n=min(len(observed_sample), len(absent_pool)), random_state=TRAINING_SAMPLE_SEED)

training = pd.concat([observed_sample, absent_sample])


In [None]:
# Graph-derived features for each training pair.
# (Degree-centrality and preferential-attachment features were sketched here for a
# directed graph and are currently disabled.)
# NOTE(review): members_graph is the member projection built in an earlier cell, while
# training.target comes from the books list — confirm these lookups use the right graph,
# otherwise they raise KeyError.

# PageRank of the target node. nx.pagerank_scipy was removed in networkx 3.0;
# nx.pagerank is the supported entry point.
page_rank = nx.pagerank(members_graph)
training['target_pagerank'] = training.apply(lambda row: page_rank[row.target], axis=1)

# HITS: hub score of the source node, authority score of the target node.
hub_score, authority_score = nx.hits(members_graph)
training['source_hub_score'] = training.apply(lambda row: hub_score[row.source], axis=1)
training['target_authority_score'] = training.apply(lambda row: authority_score[row.target], axis=1)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation heatmap of the numeric training columns. The string columns
# (source/target) are excluded explicitly: pandas >= 2.0 no longer drops them
# silently and raises inside DataFrame.corr().
numeric_training = training.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(14, 12))
sns.heatmap(numeric_training.corr(),
            vmax=0.5,
            square=True,
            annot=True)


In [None]:
# Binary label: 1 where the pair has a positive weight, 0 where the weight is exactly 0.
# Rows matching neither condition keep whatever value the column had (NaN if new).
# NOTE(review): if `training` only contains rows with weight > 0 (as sampled in the
# earlier cell), the second line matches nothing and the label is constant.
training.loc[training.weight > 0, 'edge_exists'] = 1
training.loc[training.weight == 0, 'edge_exists'] = 0

In [None]:
from sklearn.model_selection import train_test_split

# Features exclude the identifiers, the label itself and `weight`: edge_exists is
# derived directly from weight, so leaving weight in leaks the label into the features.
feature_frame = training.drop(
    ['source', 'target', 'weight', 'edge_exists'], axis=1)

# 80/20 split with a fixed seed so scores are comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, training.edge_exists, test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# 1000-tree random forest on the edge-existence label. The seed matches the
# regressor cell below (random_state=42) so repeated runs give the same model.
RF_classifer = RandomForestClassifier(n_estimators=1000, random_state=42)
RF_classifer.fit(X_train, y_train)


In [None]:
# rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#                        max_depth=None, max_features='auto', max_leaf_nodes=None,
#                        min_impurity_decrease=0.0,
#                        min_samples_leaf=1, min_samples_split=2,
#                        min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
#                        oob_score=False, random_state=None, verbose=0,
#                        warm_start=False)


In [None]:
# Mean accuracy of the classifier on the held-out split.
RF_classifer.score(X_test, y_test)


In [None]:
from sklearn.ensemble import RandomForestRegressor
# Regression forest fitted on the same features and label as the classifier above;
# oob_score=True makes rf.oob_score_ available for the report in the next cell.
# NOTE(review): y_train is the 0/1 edge_exists label — confirm a regressor is intended.
rf = RandomForestRegressor(n_estimators=100,
                           n_jobs=-1,
                           oob_score=True,
                           bootstrap=True,
                           random_state=42)
rf.fit(X_train, y_train)


In [None]:
# Report the regressor's fit on the training split, its out-of-bag score and its
# fit on the held-out split.
training_score = rf.score(X_train, y_train)
out_of_bag_score = rf.oob_score_
validation_score = rf.score(X_test, y_test)
print('R^2 Training Score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'.format(
    training_score, out_of_bag_score, validation_score))
