# **This notebook's process**

1. Load in Crunchbase dataframes(4 merged CSVs created in `1_SS_EDA.ipynb`)
    - Organizations: `files/output/organizations_merged.csv`
    - Jobs: `files/output/p1_jobs.csv`
    - Investments: `files/output/p1_investments.csv`
    - Partner investments: `files/output/p1_investments_partner.csv`
2. Select date and filter the dataframes by date
3. Save filtered dataframes as separate CSVs, and then load in as SFrames.
    - Crunchbase network: `files/output/graph_temp/cb/{}_df.csv`
    - Pledge 1% network: `files/output/graph_temp/p1/{}_df.csv`
    - Model network: `files/output/graph_temp/model/{}_df.csv`
    - Not Pledge 1% network: `files/output/graph_temp/np1/{}_df.csv`
4. Load SFrames into graph and remove duplicate edges. Produce 8 graphs based on # of edges allowed & direction.
5. Reduce size of dataset by limiting degrees of freedom from Pledge 1% companies, and save the vertices list for a few different network sizes
6. Produce 100 samples of the Crunchbase graphs and save to CSV.
    - 5 Degrees from Pledge 1% Companies: `Model_DF_D5`
        - Baseline: `files/output/Model_DF_D5/B/{}.csv`
        - Baseline Reduced: `files/output/Model_DF_D5/BR/{}.csv`
        - Graph & Baseline: `files/output/Model_DF_D5/GB/{}.csv`
        - Graph & Baseline Reduced: `files/output/Model_DF_D5/GBR/{}.csv`
        - Graph: `files/output/Model_DF_D5/G/{}.csv`
    - 4 Degrees from Pledge 1% Companies: `Model_DF_D4`
        - Baseline: `files/output/Model_DF_D4/B/{}.csv`
        - Baseline Reduced: `files/output/Model_DF_D4/BR/{}.csv`
        - Graph & Baseline: `files/output/Model_DF_D4/GB/{}.csv`
        - Graph & Baseline Reduced: `files/output/Model_DF_D4/GBR/{}.csv`
        - Graph: `files/output/Model_DF_D4/G/{}.csv`

## **Model**
`p1_tag` ~ `rank` + `total_funding_usd` + `age_yr` + `employee_count` (ordinal) + `country` (nominal, 112 indicator columns) + `category_groups` (nominal, 46 indicator columns) + ((GRAPH FEATURES))

In [50]:
'''Importing basic data analysis packages'''
import numpy as np
import pandas as pd
import csv
import warnings
import os
import time
import math
from functools import reduce
from datetime import datetime
warnings.filterwarnings('ignore')

'''Graph'''
import networkx as nx
from pyvis.network import Network
import turicreate
from turicreate import pagerank, kcore, degree_counting, shortest_path, connected_components, triangle_counting
from turicreate import SFrame, SGraph, SArray, load_sgraph, aggregate 

'''Plotting packages'''
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', font_scale=1.3)

def reduce_mem_usage(df, verbose=True):   
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def network_by_date(date, df_input, jobs_input, invest_input, invest_prtnr_input, model_uuids=[], skip_not_p1=True):
    '''
    This function filters down Crunchbase dataframes by date 
    to ensure that the companies/people/investments being used in modeling exist at a given time.

    INPUT:
        - `date`: string w/ format 'YEAR-MO-DY' (e.g. '2020-09-08')
        - `df`: pandas dataframe of Crunchbase organizationss with necessary column fields:
            * `p1_date`, `founded_on`, `closed_on`
        - `jobs`: pandas dataframe of Crunchbase jobss with necessary column fields:
            * `p1_date`, `started_on`, `ended_on`
        - `invest`: pandas dataframe of Crunchbase investmentss with necessary column fields:
            * `p1_date`, `announced_on`
        - `invest_prtnr`: pandas dataframe of Crunchbase investments with necessary column fields:
            * `p1_date`, `announced_on`
        - `model_uuids`: list that contains the uuids of organizations that are used to construct the model graph
        - `skip_no_p1`: Boolean that defaults to excluding the opposite of the Pledge 1% neighborhood. Likely will delete option altogether later.
    
    OUTPUT:
        - List of dataframe lists, 2 lists of length 12: 
            * [Crunchbase neighborhood dataframes], [Pledge 1% neighborhood dataframes]
                                        OR
              [Crunchbase neighborhood dataframes], [Model neighborhood dataframes]
        - Each dataframe list contains 12 dataframes that will be saved & loaded as SFrames in the next processing step.
            0. Companies
            1. Investors
            2. Investments
            3. Partner investments
            4. Current Jobs
            5. Former jobs
            6. Former affiliated's new jobs
            7. Partner investor's affiliation (if not in jobs dataframes)
            8. Partner investor's coworkers at the investing firm
            9. Partner investor's coworkers' partner investments
            10. Current affiliated's old jobs
            11. Organization nodes from edges in 2,3,6,7,9,10 if not already in 0 or 1
    '''
    # Soft copy of dataframes
    df = df_input.copy()
    jobs = jobs_input.copy()
    invest = invest_input.copy()
    invest_prtnr = invest_prtnr_input.copy()
    
    #*******************************************************************************************************
    # DATE PROCESSING
    
    # Convert date columns to datetime
    df['p1_date'] = pd.to_datetime(df['p1_date'], errors='coerce')
    df['founded_on'] = pd.to_datetime(df['founded_on'], errors='coerce')
    df['closed_on'] = pd.to_datetime(df['closed_on'], errors='coerce')
    jobs['p1_date'] = pd.to_datetime(jobs['p1_date'], errors='coerce')
    jobs['started_on'] = pd.to_datetime(jobs['started_on'], errors='coerce')
    jobs['ended_on'] = pd.to_datetime(jobs['ended_on'], errors='coerce')
    invest['p1_date'] = pd.to_datetime(invest['p1_date'], errors='coerce')
    invest['announced_on'] = pd.to_datetime(invest['announced_on'], errors='coerce')
    invest_prtnr['p1_date'] = pd.to_datetime(invest_prtnr['p1_date'], errors='coerce')
    invest_prtnr['announced_on'] = pd.to_datetime(invest_prtnr['announced_on'], errors='coerce')
    
    # Convert input date to datetime object
    date = pd.Timestamp(date)
    print('\nAS OF {}:\n'.format(date.strftime('%B %d, %Y').upper()))
    
    #*******************************************************************************************************
    # Create new row for tagging model companies
    df['add_to_model'] = 0
    df['add_to_model'][df['uuid'].isin(model_uuids)] = 1
    jobs['add_to_model'] = 0
    jobs['add_to_model'][jobs['org_uuid'].isin(model_uuids)] = 1
    invest['add_to_model'] = 0
    invest['add_to_model'][invest['org_uuid'].isin(model_uuids)] = 1
    invest_prtnr['add_to_model'] = 0
    invest_prtnr['add_to_model'][invest_prtnr['org_uuid'].isin(model_uuids)] = 1
    
    #*******************************************************************************************************
    # COMPANY FILTER
    # Crunchbase company must be founded after DATE and closed before DATE (or DATE == NaT)
    CB_companies = df[(df['founded_on']<=date) & 
                      ((df['closed_on']>date) | (pd.isnull(df['closed_on']))) & 
                      (df['primary_role']=='company')].reset_index(drop=True)
    
    #*******************************************************************************************************
    # INVESTOR FILTER:
    # Crunchbase investor must be founded AFTER date and closed BEFORE date (or date == NaT)
    CB_investors = df[(df['founded_on']<=date) & 
                      ((df['closed_on']>date) | (pd.isnull(df['closed_on']))) & 
                      (df['primary_role']=='investor')].reset_index(drop=True)
    
    #*******************************************************************************************************
    # INVESTMENT FILTER
    # Crunchbase investment must have taken place BEFORE date
    CB_investments = invest[(invest['announced_on']<=date) & 
                            (invest['investor_type']=='organization')].reset_index(drop=True)
    
    #*******************************************************************************************************
    # PARTNER INVESTMENT FILTER
    # Crunchbase partner investment must have taken place BEFORE date
    CB_investment_partners = invest_prtnr[invest_prtnr['announced_on']<=date].reset_index(drop=True)
    
    #*******************************************************************************************************
    # CURRENT JOB FILTER
    # Crunchbase job must have started BEFORE date and ended AFTER date (or date == NaT)
    CB_jobs = jobs[(jobs['job_type'].isin(['executive','board_member','advisor','board_observer'])) & 
                      (jobs['started_on']<=date) & 
                      ((jobs['ended_on']>date) | (pd.isnull(jobs['ended_on'])))].reset_index(drop=True)
    
    #*******************************************************************************************************
    # FORMER JOB FILTER
    # Crunchbase job must have ended BEFORE date or started AFTER date
    CB_jobs_former = jobs[(jobs['job_type'].isin(['executive','board_member','advisor','board_observer'])) & 
                          ((jobs['ended_on']<=date) | (jobs['started_on']>date))].reset_index(drop=True)
    
    #*******************************************************************************************************
    # COMBINE THESE 6 (or 7) INTO LIST OF FRAMES
    lst_of_frames = []
    # Crunchbase frames
    CB_frames = [CB_companies,CB_investors,CB_investments,CB_investment_partners,CB_jobs,CB_jobs_former]
    # Add to list of frames
    lst_of_frames.append(CB_frames)
    # If model_uuids are not supplied, calculate Pledge 1% neighborhood
    if model_uuids == []:
        P1_frames = []
        for frame in CB_frames:
            # Pledge 1% frames must have Crunchbase assumptions in addition to an earlier pledge date
            new_frame = frame[frame['p1_date']<=date].reset_index(drop=True).drop('add_to_model',axis=1)
            P1_frames.append(new_frame)
        # Add to list of frames
        lst_of_frames.append(P1_frames)
    # If model_uuids are supplied, calculate model neighborhood
    if model_uuids != []:
        model_frames = []
        for frame in CB_frames:
            # Include model dataframe if condition satisfied: either are a Pledge 1% company or tagged by model_uuids
            new_frame=frame[(frame['p1_date']<=date) | (frame['add_to_model']==1)].reset_index(drop=True).drop('add_to_model',axis=1)
            model_frames.append(new_frame)
        # Add to list of frames
        lst_of_frames.append(model_frames)
    # If this boolean value is False, calculate ~Pledge 1% neighborhood
    if skip_not_p1 is False:
        not_P1_frames = []
        for frame in CB_frames:
            # Non-Pledge 1% frames must have Crunchbase assumptions in addition to NaT pledge date or later pledge date
            new_frame = frame[(pd.isnull(frame['p1_date']) | (frame['p1_date']>date))].reset_index(drop=True).drop('add_to_model',axis=1)
            not_P1_frames.append(new_frame)
        # Add to list of frames
        lst_of_frames.append(not_P1_frames) 
    # Remove extra column 'add_to_model'
    for idx,frame in enumerate(CB_frames):
        CB_frames[idx] = frame.drop('add_to_model',axis=1)

    #*******************************************************************************************************
    # FORMER NEW JOB FILTER
    print('CaLcUlAtInG... FORMER NEW JOB FILTER')
    
    for frame in lst_of_frames:
        # Where do the former affiliated work now?
        # Pull their uuids
        former_people = frame[5].person_uuid.unique()
        # Pull their current jobs from Crunchbase
        jobs_former_new = CB_frames[4][CB_frames[4].person_uuid.isin(former_people)] 
        # Check they're not already in the current jobs dataframe
        # Combine into one temp data frame
        combined_jobs = pd.concat([frame[4], jobs_former_new]).reset_index(drop=True) 
        df_gpby = combined_jobs.groupby(list(combined_jobs.columns))
        # Only count non-duplicated columns
        idx = [x[0] for x in df_gpby.groups.values() if len(x) == 1]
        # Reindex dataframe
        jobs_former_new = combined_jobs.reindex(idx)
        # Add to list of frames
        frame.append(jobs_former_new)
    
    #*******************************************************************************************************
    # PARTNER INVESTMENT JOB FILTER
    print('CaLcUlAtInG... PARTNER INVESTMENT JOB FILTER')
    
    for frame in lst_of_frames:
        # Are the partner investment jobs already in one of the jobs dataframes? If not, we should add them.
        # Create temporary dataframe and column to make checking the intersection between dataframes easier 
        # frame[4]: current jobs | frame[5]: former jobs | frame[6]: former new jobs
        jobs_combined = pd.concat([frame[4],frame[5],frame[6]])
        jobs_combined['person,company'] = jobs_combined['person_uuid'] + ',' + jobs_combined['org_uuid']
        # frame[3]: partner investments
        frame[3]['person,company'] = frame[3]['partner_uuid']+ ',' + frame[3]['investor_uuid']
        # Number of unique partner investments
        unique_PI = frame[3]['person,company'].unique()
        # Overlap between PI and combined J frames, create temporary jobs view
        # These PI are already found in J frames, so we do not need to include them
        jobs_already_in_J = jobs_combined[jobs_combined['person,company'].isin(unique_PI)] 
        # This will return non intersecting value of PI with temp J
        # These PI are not found in J, so we would like to include them
        PI_not_in_J = np.setdiff1d(unique_PI,jobs_already_in_J['person,company'].unique())
        # Need to create separate jobs dataframe for non intersecting PI/J person/company pairs
        grouped = frame[3][frame[3]['person,company'].isin(PI_not_in_J)].groupby(['partner_uuid','partner_name','investor_uuid','investor_name']).count()
        grouped_df = grouped.reset_index()[['partner_uuid','partner_name','investor_uuid','investor_name']]
        grouped_df['job_type'] = 'executive'
        # Add to list of frames
        frame.append(grouped_df)
    
    #*******************************************************************************************************
    # OTHER FIRM PARNTERS
    print('CaLcUlAtInG... OTHER FIRM PARTNER JOBS & INVESTMENTS FILTER')
    
    for frame in lst_of_frames:
        # OTHER FIRM PARNTERS - JOBS
        # Who are the other partners that work at the investment firms present in the neighborhood?
        # Get the unique investor uuids associated with the dataframes
        # frame[2]: from investments dataframe
        unique_investor_firm_A = list(frame[2]['investor_uuid'].unique())
        # frame[3]: from partner investments dataframe
        unique_investor_firm_B = list(frame[3]['investor_uuid'].unique())
        partners = list(frame[3]['partner_uuid'].unique())
        # Combine to get list of unique uuids of VC firms
        unique_firms = list(set(unique_investor_firm_A+unique_investor_firm_B))
        # Grab current jobs from Crunchbase for these investing firms
        # Exclude duplicate partner job (already represented by partners list calculated above)
        partner_jobs = CB_frames[4][(CB_frames[4]['org_uuid'].isin(unique_firms)) &  
                                    ~(CB_frames[4]['person_uuid'].isin(partners))].reset_index(drop=True)
        # Check they're not already in the current/former jobs dataframe
        # Combine into one temp data frame
        combined_jobs = pd.concat([frame[4], partner_jobs]).reset_index(drop=True) 
        df_gpby = combined_jobs.groupby(list(combined_jobs.columns))
        # Only count non-duplicated rows
        idx = [x[0] for x in df_gpby.groups.values() if len(x) == 1]
        # Reindex dataframe
        partner_jobs = combined_jobs.reindex(idx)
        # Add to list of frames
        frame.append(partner_jobs)
        # OTHER FIRM PARNTERS - PARTNER INVESTMENTS
        # For these new partners, what companies are they invested in?
        # Get the unique parnter uuids associated with the dataframes
        other_partners = partner_jobs['person_uuid'].unique()
        other_partner_investments = CB_frames[3][CB_frames[3]['partner_uuid'].isin(other_partners)]
        # Check they're not already in the partner investments dataframe
        # Combine into one temp data frame
        combined_jobs = pd.concat([frame[3], other_partner_investments]).reset_index(drop=True) 
        df_gpby = combined_jobs.groupby(list(combined_jobs.columns))
        # Only count non-duplicated rows
        idx = [x[0] for x in df_gpby.groups.values() if len(x) == 1]
        # Reindex dataframe
        other_partner_investments = combined_jobs.reindex(idx)
        # Add to list of frames
        frame.append(other_partner_investments)
    
    #*******************************************************************************************************
    # CURRENT OLD JOB FILTER
    print('CaLcUlAtInG... CURRENT OLD JOB FILTER')
    
    for frame in lst_of_frames:
        # Where did the current affiliated work previously?
        current_people = frame[4].person_uuid.unique() # Pull their IDs
        jobs_current_old = CB_frames[5][CB_frames[5].person_uuid.isin(current_people)] # Pull their current jobs from Crunchbase
        # Check they're not already in the current jobs dataframe
        # Combine into one temp data frame
        combined_jobs = pd.concat([frame[5], jobs_current_old]).reset_index(drop=True) 
        df_gpby = combined_jobs.groupby(list(combined_jobs.columns))
        # Only count non-duplicated columns
        idx = [x[0] for x in df_gpby.groups.values() if len(x) == 1]
        # Reindex dataframe
        jobs_current_old = combined_jobs.reindex(idx)
        # Add to list of frames
        frame.append(jobs_current_old)
        
    #*******************************************************************************************************
    # GET EXTRA ORG UUID ATTRIBUTES FROM INVESTMENTS & JOBS
    print('CaLcUlAtInG... EXTRA ORGANIZATION NODES')
    
    CB_orgs = pd.concat([CB_companies, CB_investors])
    for frame in lst_of_frames:
        unique_orgs = []
        # Investments
        unique_orgs.extend(list(frame[2]['investor_uuid'].unique()))
        # Partner investments
        unique_orgs.extend(list(frame[3]['investor_uuid'].unique()))
        # Former new jobs organizations
        unique_orgs.extend(list(frame[6]['org_uuid'].unique()))
        # Parter jobs organizations
        unique_orgs.extend(list(frame[7]['investor_uuid'].unique()))
        # Other parter investments organizations
        unique_orgs.extend(list(frame[9]['org_uuid'].unique()))
        # Current old jobs organizations
        unique_orgs.extend(list(frame[10]['org_uuid'].unique()))
        # Pull their organization information from Crunchbase
        new_org_nodes = CB_orgs[CB_orgs['uuid'].isin(list(set(unique_orgs)))]
        # Add to list of frames
        frame.append(new_org_nodes)
    
    #*******************************************************************************************************
    
    # Output print statements
    print('\nCrunchbase Neighborhood')
    print('NODES | OUTPUT FRAME 0/CB_companies {}'.format(CB_frames[0].shape))
    print('NODES | OUTPUT FRAME 1/CB_investors {}'.format(CB_frames[1].shape))
    print('NODES&EDGES | OUTPUT FRAME 2/CB_investments {}'.format(CB_frames[2].shape))
    print('NODES&EDGES | OUTPUT FRAME 3/CB_investment_partners {}'.format(CB_frames[3].shape))
    print('NODES&EDGES | OUTPUT FRAME 4/CB_jobs {}'.format(CB_frames[4].shape))
    print('NODES&EDGES | OUTPUT FRAME 5/CB_jobs_former {}'.format(CB_frames[5].shape))
    print('NODES&EDGES | OUTPUT FRAME 6/CB_jobs_former_new {}'.format(CB_frames[6].shape))
    print('NODES&EDGES | OUTPUT FRAME 7/CB_jobs_partner {}'.format(CB_frames[7].shape))
    print('NODES&EDGES | OUTPUT FRAME 8/CB_jobs_other_partners {}'.format(CB_frames[8].shape))
    print('NODES&EDGES | OUTPUT FRAME 9/CB_invest_other_partners {}'.format(CB_frames[9].shape))
    print('NODES&EDGES | OUTPUT FRAME 10/CB_jobs_current_old {}'.format(CB_frames[10].shape))
    print('NODES | OUTPUT FRAME 11/CB_extra_org_nodes {}'.format(CB_frames[11].shape))
    if model_uuids != []:
        print('\nModel Neighborhood')
        print('NODES | OUTPUT FRAME 0/model_companies {}'.format(model_frames[0].shape))
        print('NODES | OUTPUT FRAME 1/model_investors {}'.format(model_frames[1].shape))
        print('NODES&EDGES | OUTPUT FRAME 2/model_investments {}'.format(model_frames[2].shape))
        print('NODES&EDGES | OUTPUT FRAME 3/model_investment_partners {}'.format(model_frames[3].shape))
        print('NODES&EDGES | OUTPUT FRAME 4/model_jobs {}'.format(model_frames[4].shape))
        print('NODES&EDGES | OUTPUT FRAME 5/model_jobs_former {}'.format(model_frames[5].shape))
        print('NODES&EDGES | OUTPUT FRAME 6/model_jobs_former_new {}'.format(model_frames[6].shape))
        print('NODES&EDGES | OUTPUT FRAME 7/model_jobs_partner {}'.format(model_frames[7].shape))
        print('NODES&EDGES | OUTPUT FRAME 8/model_jobs_other_partners {}'.format(model_frames[8].shape))
        print('NODES&EDGES | OUTPUT FRAME 9/model_invest_other_partners {}'.format(model_frames[9].shape))
        print('NODES&EDGES | OUTPUT FRAME 10/model_jobs_current_old {}'.format(model_frames[10].shape))
        print('NODES | OUTPUT FRAME 11/model_extra_org_nodes {}'.format(model_frames[11].shape))
        return lst_of_frames
    print('\nPledge 1% Neighborhood')
    print('NODES | OUTPUT FRAME 0/P1_companies {}'.format(P1_frames[0].shape))
    print('NODES | OUTPUT FRAME 1/P1_investors {}'.format(P1_frames[1].shape))
    print('NODES&EDGES | OUTPUT FRAME 2/P1_investments {}'.format(P1_frames[2].shape))
    print('NODES&EDGES | OUTPUT FRAME 3/P1_investment_partners {}'.format(P1_frames[3].shape))
    print('NODES&EDGES | OUTPUT FRAME 4/P1_jobs {}'.format(P1_frames[4].shape))
    print('NODES&EDGES | OUTPUT FRAME 5/P1_jobs_former {}'.format(P1_frames[5].shape))
    print('NODES&EDGES | OUTPUT FRAME 6/P1_jobs_former_new {}'.format(P1_frames[6].shape))
    print('NODES&EDGES | OUTPUT FRAME 7/P1_jobs_partner {}'.format(P1_frames[7].shape))
    print('NODES&EDGES | OUTPUT FRAME 8/P1_jobs_other_partners {}'.format(P1_frames[8].shape))
    print('NODES&EDGES | OUTPUT FRAME 9/P1_invest_other_partners {}'.format(P1_frames[9].shape))
    print('NODES&EDGES | OUTPUT FRAME 10/P1_jobs_current_old {}'.format(P1_frames[10].shape))
    print('NODES | OUTPUT FRAME 11/P1_extra_org_nodes {}'.format(P1_frames[11].shape))
    # Skip Not P1 Calculations
    if skip_not_p1 is False:
        print('\n~Pledge 1% Neighborhood')
        print('NODES | OUTPUT FRAME 0/not_P1_companies {}'.format(not_P1_frames[0].shape))
        print('NODES | OUTPUT FRAME 1/not_P1_investors {}'.format(not_P1_frames[1].shape))
        print('NODES&EDGES | OUTPUT FRAME 2/not_P1_investments {}'.format(not_P1_frames[2].shape))
        print('NODES&EDGES | OUTPUT FRAME 3/not_P1_investment_partners {}'.format(not_P1_frames[3].shape))
        print('NODES&EDGES | OUTPUT FRAME 4/not_P1_jobs {}'.format(not_P1_frames[4].shape))
        print('NODES&EDGES | OUTPUT FRAME 5/not_P1_jobs_former {}'.format(not_P1_frames[5].shape))
        print('NODES&EDGES | OUTPUT FRAME 6/not_P1_jobs_former_new {}'.format(not_P1_frames[6].shape))
        print('NODES&EDGES | OUTPUT FRAME 7/not_P1_jobs_partner {}'.format(not_P1_frames[7].shape))
        print('NODES&EDGES | OUTPUT FRAME 8/not_P1_jobs_other_partners {}'.format(not_P1_frames[8].shape))
        print('NODES&EDGES | OUTPUT FRAME 9/not_P1_invest_other_partners {}'.format(not_P1_frames[9].shape))
        print('NODES&EDGES | OUTPUT FRAME 10/not_P1_jobs_current_old {}'.format(not_P1_frames[10].shape))
        print('NODES | OUTPUT FRAME 11/not_P1_extra_org_nodes {}'.format(not_P1_frames[11].shape))
    return lst_of_frames

def load_vertices(sframes, g):
    # For jobs dataframes
    for idx in [4,5,6,8,10]:
        # Keep relevant node attributes
        frame_temp = sframes[idx][['person_uuid', 'person_name']].rename({'person_uuid':'__id', 'person_name':'name'})
        frame_temp['__node_type'] = 'person'
        # Add p1_tag to the vertex
        frame_temp['p1_tag'] = 0
        g = g.add_vertices(vertices=frame_temp, vid_field='__id')
    # For jobs and partner investments dataframes
    for idx in [2,3,4,5,6,8,9,10]:
        # Keep relevant node attributes
        frame_temp = sframes[idx][['org_uuid', 'org_name', 'p1_tag']].rename({'org_uuid':'__id', 'org_name':'name'})
        frame_temp['__node_type'] = 'company'
        # Add p1_tag to the vertex
        frame_temp['p1_tag'] = frame_temp['p1_tag'].apply(lambda x: 0 if (x=="" or x==0) else 1)
        frame_temp['p1_tag'] = frame_temp['p1_tag'].astype(int)
        g = g.add_vertices(vertices=frame_temp, vid_field='__id')
    # For investments dataframes
    for idx in [2,3,7,9]:
        # Keep relevant node attributes
        frame_temp = sframes[idx][['investor_uuid', 'investor_name']].rename({'investor_uuid':'__id', 'investor_name':'name'})
        frame_temp['__node_type'] = 'investor'
        # Add p1_tag to the vertex
        frame_temp['p1_tag'] = 0
        g = g.add_vertices(vertices=frame_temp, vid_field='__id')
    # For partner investments dataframes
    for idx in [3,7,9]:
        # Keep relevant node attributes
        frame_temp = sframes[idx][['partner_uuid', 'partner_name']].rename({'partner_uuid':'__id', 'partner_name':'name'})
        frame_temp['__node_type'] = 'person'
        # Add p1_tag to the vertex
        frame_temp['p1_tag'] = 0
        g = g.add_vertices(vertices=frame_temp, vid_field='__id')
    # Organizations
    for idx in [0,1,11]:
        # Keep relevant node attributes
        frame_temp = sframes[idx][['uuid', 'name', 'primary_role', 'p1_tag']].rename({'uuid':'__id', 'primary_role':'__node_type'})
        # Add p1_tag to the vertex
        frame_temp['p1_tag'] = frame_temp['p1_tag'].apply(lambda x: 0 if (x=="" or x==0) else 1)
        frame_temp['p1_tag'] = frame_temp['p1_tag'].astype(int)
        # Load into graph
        g = g.add_vertices(vertices=frame_temp, vid_field='__id')
    # Return SGraph
    return g

def find_p1_affiliations(p1_sframes):
    frames = p1_sframes.copy()
    # Combine company and investor Pledge 1% dataframes, keeping only the uuid column
    p1_affiliations = frames[0][['uuid']].append(frames[1][['uuid']])
    # Add edge connecting to Pledge 1% uuid
    p1_affiliations['p1_uuid'] = 'fd9e2d10-a882-c6f4-737e-fd388d4ffd7c'
    # Create id, source, destination fields in SFrame
    p1_affiliations = p1_affiliations.rename({'uuid':'src','p1_uuid':'dst'})
    p1_affiliations['p1_tag'] = 1
    # Return SFrame
    return p1_affiliations

def load_edges(sframes, g, p1_affiliations=[], include_edges=[2,3], reverse=False, add_weights=False):
    w = {'status':{'primary':3,'secondary':2,'tertiary':1}, '__edge_type':{'job':1, 'investment':2}}
    # Since it is a directed graph, need to include option for reverse direction
    # Forward
    source = 'src'
    destination = 'dst'
    # Reverse
    if reverse:
        source = 'dst'
        destination = 'src'
    if type(p1_affiliations) == SFrame:
        # P1 Companies: Company/Investor --> Pledge 1%
        g = g.add_edges(edges=p1_affiliations, src_field=source, dst_field=destination)
        if add_weights:
            frame_temp['weight'] = 6
    # Investments: Investor --> Company
    # Create id, source, destination fields in SFrame
    frame_temp = sframes[2][['investment_uuid','investor_uuid','org_uuid','investment_type','raised_amount_usd','investor_count','is_lead_investor','lead_investor_count']].rename({'investment_uuid':'__id','investor_uuid':'src','org_uuid':'dst'})
    frame_temp['__edge_type'] = 'investment'
    frame_temp['status'] = 'primary'
    if add_weights:
        frame_temp['weight'] = w['__edge_type']['investment'] * w['status']['primary']
    g = g.add_edges(edges=frame_temp, src_field=source, dst_field=destination)
    # Partner Investments, Investments: Person --> Company
    # Create id, source, destination fields in SFrame
    frame_temp = sframes[3][['investment_uuid','partner_uuid','org_uuid','investment_type','raised_amount_usd','investor_count']].rename({'investment_uuid':'__id','partner_uuid':'src','org_uuid':'dst'})
    frame_temp['__edge_type'] = 'investment'
    frame_temp['status'] = 'primary'
    if add_weights:
        frame_temp['weight'] = w['__edge_type']['investment'] * w['status']['primary']
    g = g.add_edges(edges=frame_temp, src_field=source, dst_field=destination)
    # Partner Investments, Investments: Investor --> Company
    # Create id, source, destination fields in SFrame
    frame_temp = sframes[3][['investor_uuid','org_uuid','investment_type','investor_count']].rename({'investor_uuid':'src','org_uuid':'dst'})
    frame_temp['__edge_type'] = 'investment'
    frame_temp['status'] = 'secondary'
    if add_weights:
        frame_temp['weight'] = w['__edge_type']['investment'] * w['status']['secondary']
    # Secondary relationships, skip if not specified at input
    if 2 in include_edges:
        g = g.add_edges(edges=frame_temp, src_field=source, dst_field=destination)
    # Partner Investments, Jobs: Person --> Company
    # Create id, source, destination fields in SFrame
    frame_temp = sframes[7][['partner_uuid','investor_uuid']].rename({'partner_uuid':'src','investor_uuid':'dst'})
    frame_temp['__edge_type'] = 'job'
    frame_temp['status'] = 'secondary'
    if add_weights:
        frame_temp['weight'] = w['__edge_type']['job'] * w['status']['secondary']
    # Secondary relationships, skip if not specified at input
    if 2 in include_edges:
        g = g.add_edges(edges=frame_temp, src_field=source, dst_field=destination)    
    # Other Partner Investments, Investments: Person --> Company
    # Create id, source, destination fields in SFrame
    frame_temp = sframes[9][['investment_uuid','partner_uuid','org_uuid','investment_type','raised_amount_usd','investor_count']].rename({'investment_uuid':'__id','partner_uuid':'src','org_uuid':'dst'})
    frame_temp['__edge_type'] = 'investment'
    frame_temp['status'] = 'tertiary'
    if add_weights:
        frame_temp['weight'] = w['status']['tertiary'] * w['__edge_type']['investment']
    # Tertiary relationships, skip if not specified at input
    if 3 in include_edges:
        g = g.add_edges(edges=frame_temp, src_field=source, dst_field=destination)
    # Jobs: Person --> Company
    for idx in [4,5,6,8,10]:
        # Create id, source, destination fields in SFrame
        frame_temp = sframes[idx][['job_uuid','person_uuid','org_uuid','job_type','title']].rename({'job_uuid':'__id','person_uuid':'src','org_uuid':'dst'})
        frame_temp['__edge_type'] = 'job'
        # Current jobs
        if idx == 4:
            frame_temp['status'] = 'primary'
            if add_weights:
                frame_temp['weight'] = w['status']['primary'] * w['__edge_type']['job']
            g = g.add_edges(edges=frame_temp, src_field=source, dst_field=destination)
            continue
        # Secondary relationships, skip if not specified at input
        if 2 in include_edges:
            # Former jobs | Former new jobs | Current old jobs 
            if idx in [5,6,10]:
                frame_temp['status'] = 'secondary'
                if add_weights:
                    frame_temp['weight'] = w['status']['secondary'] * w['__edge_type']['job']
                g = g.add_edges(edges=frame_temp, src_field=source, dst_field=destination)
                continue  
        # Tertiary relationships, skip if not specified at input
        if 3 in include_edges:
            # Other partners at firm
            if idx == 8:
                frame_temp['status'] = 'tertiary'
                if add_weights:
                    frame_temp['weight'] = w['status']['tertiary'] * w['__edge_type']['job']
                g = g.add_edges(edges=frame_temp, src_field=source, dst_field=destination)
                continue
    # Return SGraph
    return g

def update_cb_weights(src, edge, dst):
    if src['__id'] != dst['__id']: # ignore self-links
        edge['weight'] = 0
        edge['weight_status'] = 0
        edge['weight_type'] = 0
        if edge['status'] == 'primary':
            edge['weight_status'] = 3
        if edge['status'] == 'secondary':
            edge['weight_status'] = 2
        if edge['status'] == 'tertiary':
            edge['weight_status'] = 1
        if edge['__edge_type'] == 'job':
            edge['weight_type'] = 1
        if edge['__edge_type'] == 'investment':
            edge['weight_type'] = 2
        edge['weight'] = edge['weight_status'] * edge['weight_type']
    return (src, edge, dst)

def update_pagerank_weight(src, edge, dst):
    if src['__id'] != dst['__id']: # ignore self-links
        dst['pagerank'] += src['prev_pagerank'] * edge['weight']
    return (src, edge, dst)

def update_pagerank_reset_prob(src, edge, dst):
    global reset
    if src['__id'] != dst['__id']: # ignore self-links
        dst['pagerank'] *= (1 - reset)
        dst['pagerank'] += reset
    return (src, edge, dst)

def update_pagerank_prev_to_current(src, edge, dst):
    if src['__id'] != dst['__id']: # ignore self-links
        src['prev_pagerank'] = src['pagerank']
    return (src, edge, dst)

def sum_weight(src, edge, dst):
    if src['__id'] != dst['__id']: # ignore self-links
        src['total_weight'] += edge['weight']
    return src, edge, dst

def make_pagerank_zero(src, edge, dst):
    if src['__id'] != dst['__id']: # ignore self-links
        dst['pagerank'] = 0
    return src, edge, dst

def update_l1_delta(src, edge, dst):
    if src['__id'] != dst['__id']: # ignore self-links
        dst['l1_delta'] = abs(dst['pagerank'] - dst['prev_pagerank'])
        src['l1_delta'] = abs(src['pagerank'] - src['prev_pagerank'])
    return src, edge, dst

def normalize_weight(src, edge, dst):
    if src['__id'] != dst['__id']: # ignore self-links
        edge['weight'] /= src['total_weight']
    return src, edge, dst

def pagerank_weighted(input_graph, reset_prob=0.15, threshold=0.01, max_iterations=3):
    g = SGraph(input_graph.vertices, input_graph.edges)
    global reset
    reset = reset_prob
    # compute normalized edge weight
    g.vertices['total_weight'] = 0.0
    g = g.triple_apply(sum_weight, ['total_weight'])
    g = g.triple_apply(normalize_weight, ['weight'])
    del g.vertices['total_weight']
    # initialize vertex field
    g.vertices['prev_pagerank'] = 1.0
    it = 0
    total_l1_delta = len(g.vertices)
    start = time.time()
    while(total_l1_delta > threshold and it < max_iterations):
        if 'pagerank' not in g.get_vertex_fields():
            g.vertices['pagerank'] = 0.0
        else:
            g = g.triple_apply(make_pagerank_zero, ['pagerank'])
        g = g.triple_apply(update_pagerank_weight, ['pagerank'])
        g = g.triple_apply(update_pagerank_reset_prob, ['pagerank'])
        if 'l1_delta' not in g.get_vertex_fields():
            g.vertices['l1_delta'] = (g.vertices['pagerank'] - g.vertices['prev_pagerank']).apply(lambda x: abs(x))
        else:
            g = g.triple_apply(update_l1_delta, ['l1_delta'])
        total_l1_delta = g.vertices['l1_delta'].sum()
        g = g.triple_apply(update_pagerank_prev_to_current, ['prev_pagerank'])
        print ("Iteration %d: total pagerank changed in L1 = %f" % (it, total_l1_delta))
        it = it + 1
    print ("Weighted pagerank finished in: %f secs" % (time.time() - start))
    del g.vertices['prev_pagerank']
    return g.vertices

# Visualization of Relationships
Companies are blue

People are red

#### Primary relationships

- VC Firm investing in a company
- Partner at VC Firm investing  in a company
- Person has current job at a company

In [None]:
# g = Network(notebook=True, directed=True, heading='Primary')
# g.force_atlas_2based()

# # Primary nodes
# g.add_node(0, label='0', color='#add8e6', size=9)
# g.add_node(1, label='1', color='blue', size=9)
# g.add_node(2, label='2', color='red', size=5)
# g.add_node(3, label='3', color='red', size=5)

# # Primary edges
# g.add_edge(1,0, label="investment", color="grey")
# g.add_edge(2,0, label="investment", color="grey")
# g.add_edge(3,0, label='job', color='grey')

# g.show('Primary.html')

#### Secondary relationships

What's added in green:
- Former jobs of current employees of companies
- Former employees of companies and their new jobs
- Partner investors' affiliation with their VC firm and the indirect link to the same invested company

In [None]:
# g = Network(notebook=True, directed=True, heading='Secondary')
# g.force_atlas_2based()

# # Primary nodes
# g.add_node(0, label='0', color='#add8e6', size=9)
# g.add_node(1, label='1', color='blue', size=9)
# g.add_node(2, label='2', color='red', size=5)
# g.add_node(3, label='3', color='red', size=5)
# g.add_node(4, label='4', color='blue', size=9)

# # Primary edges
# g.add_edge(1,0, label="investment", color="grey")
# g.add_edge(2,0, label="investment", color="grey")
# g.add_edge(3,0, label='job', color='grey')

# # Secondary nodes
# g.add_node(5, label='5', color='red', size=5)
# g.add_node(8, label='8', color='blue', size=9)
# g.add_node(11, label='11', color='blue', size=9)

# # Secondary edges
# g.add_edge(5,0, label='job(former)', color='green')
# g.add_edge(2,4, label='job', color='green')
# g.add_edge(4,0, label='investment(indirect)', color='green')
# g.add_edge(3,11, label='job(former)', color='green')
# g.add_edge(5,8, label='job', color='green')

# g.show('Secondary.html')

#### Teritiary relationships

What's added in orange:
- Coworkers of partner investors and their current investments

In [None]:
# g = Network(notebook=True, directed=True, heading='Teritiary')
# g.force_atlas_2based()

# # Primary nodes
# g.add_node(0, label='0', color='#add8e6', size=9)
# g.add_node(1, label='1', color='blue', size=9)
# g.add_node(2, label='2', color='red', size=5)
# g.add_node(3, label='3', color='red', size=5)
# g.add_node(4, label='4', color='blue', size=9)

# # Primary edges
# g.add_edge(1,0, label="investment", color="grey")
# g.add_edge(2,0, label="investment", color="grey")
# g.add_edge(3,0, label='job', color='grey')

# # Secondary nodes
# g.add_node(5, label='5', color='red', size=5)
# g.add_node(8, label='8', color='blue', size=9)
# g.add_node(11, label='11', color='blue', size=9)

# # Secondary edges
# g.add_edge(5,0, label='job(former)', color='green')
# g.add_edge(2,4, label='job', color='green')
# g.add_edge(4,0, label='investment(indirect)', color='green')
# g.add_edge(3,11, label='job(former)', color='green')
# g.add_edge(5,8, label='job', color='green')

# # Tertiary nodes
# g.add_node(6, label='6', color='red', size=5)
# g.add_node(7, label='7', color='red', size=5)
# g.add_node(9, label='9', color='blue', size=9)
# g.add_node(10, label='10', color='blue', size=9)

# # Tertiary edges
# g.add_edge(6,1, label='job', color='orange')
# g.add_edge(7,4, label='job', color='orange')
# g.add_edge(6,9, label='investment', color='orange')
# g.add_edge(7,10, label='investment', color='orange')

# g.show('Teritiary.html')

## 

# 1. Load in Crunchbase dataframes. Comment out once you've completed Step 3.

In [None]:
# # Import CSVs as Pandas DataFrames
# path = 'files/output/organizations_merged.csv'
# df = pd.read_csv(path).drop(['Unnamed: 0'],axis=1)
# print('INPUT df=p1+org FROM CSV: {}'.format(path))
# print('ORGANIZATION/df cols: {}\nSHAPE: {}'.format(df.columns.to_list(), df.shape))
# df = reduce_mem_usage(df, verbose=True)

# path = 'files/output/p1_jobs.csv'
# jobs = pd.read_csv(path)
# print('\nINPUT jobs FROM CSV: {}'.format(path))
# print('JOBS/jobs cols: {}\nSHAPE: {}'.format(jobs.columns.to_list(), jobs.shape))
# jobs = reduce_mem_usage(jobs, verbose=True)

# path = 'files/output/p1_investments.csv'
# invest = pd.read_csv(path)
# print('\nINPUT invest FROM CSV: {}'.format(path))
# print('INVESTMENTS/invest cols: {}\nSHAPE: {}'.format(invest.columns.to_list(), invest.shape))
# invest = reduce_mem_usage(invest, verbose=True)

# path = 'files/output/p1_investments_partner.csv'
# invest_prtnr = pd.read_csv(path)
# print('\nINPUT invest_prtnr FROM CSV: {}'.format(path))
# print('PARTNER INVESTMENTS/invest_prtnr cols: {}\nSHAPE: {}'.format(invest_prtnr.columns.to_list(), invest_prtnr.shape))
# invest_prtnr = reduce_mem_usage(invest_prtnr, verbose=True)

# print('\n\nPledge 1% UUID: {}'.format(df[df['name']=='Pledge 1%'].uuid.values[0]))

# 2. Create mutliple merged pandaframes based on relationships using `network_by_date` function, which filters the dataframes by date to ensure the job/investment/company existed at that time. Comment out once you've saved these as CSVs in Step 3.

In [None]:
# date = '2020-09-08'
# cb_frames,p1_frames = network_by_date(date, df, jobs, invest, invest_prtnr)

# 3. Save filtered dataframes as separate CSVs, and then load in as SFrames

### Save filtered dataframes as separate CSVs. Load in nodes and edges as SFrames.  Comment out once you've saved these.

In [9]:
# for idx, frame in enumerate(cb_frames):
#     path = 'files/output/graph_temp/cb/{}_df.csv'.format(idx)
#     print('SAVED TO CSV', path)
#     frame.to_csv(path, index=False)
# for idx, frame in enumerate(p1_frames):
#     path = 'files/output/graph_temp/p1/{}_df.csv'.format(idx)
#     print('SAVED TO CSV', path)
#     frame.to_csv(path, index=False)
    
# lst_of_frames = []
# for val in ['cb','p1']:
#     lst = []
#     for idx in range(12):
#         path = 'files/output/graph_temp/{}/{}_df.csv'.format(val, idx)
#         lst.append(SFrame(data=path))
#     lst_of_frames.append(lst)
# cb_sframes,p1_sframes = lst_of_frames

### ((((START FROM HERE)))) IF USING THE SAME DATE AS PREVIOUS RUNS: Load in nodes and edges as SFrames.


In [22]:
# lst_of_frames = []
# for val in ['cb','p1']:
#     lst = []
#     for idx in range(12):
#         path = 'files/output/graph_temp/{}/{}_df.csv'.format(val, idx)
#         lst.append(SFrame(data=path))
#     lst_of_frames.append(lst)
# cb_sframes,p1_sframes = lst_of_frames

# # List of Pledge 1% uuids
# global p1_companies_uuid
# p1_companies_uuid = []
# p1_companies_uuid.extend(list(p1_sframes[0]['uuid'].unique()))
# p1_companies_uuid.extend(list(p1_sframes[1]['uuid'].unique()))
# p1_companies_uuid = list(set(p1_companies_uuid))

# 4. Load SFrames into graph and remove duplicate edges. Comment out once you've created the 8 graphs below. 

### Use functions to format SFrames to load into SGraph, `load_vertices`, `p1_affiliations`, and `load_edges`. Remove duplicate edges.

#### Vertices: Person, Company, or Investor

Node attributes: `__id`, `__node_type`, `name`, `p1_tag`

#### Edges: Investment, Job

Edge attributes: `__src_id`, `__dst_id`, `__edge_type`, `status`, {`__id`}, {`investment_type`,`raised_amount_usd`, `investor_count`, `is_lead_investor`, `lead_investor_count`}, {`job_type`, `title`}

Reference: <a href='https://github.com/turi-code/how-to/blob/master/remove_duplicate_edges.py'>Remove duplicate edges from SGraph</a>

In [None]:
# def make_graph(cb_sframes, weights=False, reverse_edges=False, remove_parallel_edges=False):
    
#     print('\nBuIlDiNg GrApH...')
#     # Load in crunchbase with relationships
    
#     # If adding weights...
#     if weights:
#         print('- ADDING WEIGHTS IN THE FORWARD DIRECTION')
#         cb = load_edges(cb_sframes, load_vertices(cb_sframes, SGraph()), p1_affiliations=[], include_edges=[2,3], reverse=False, add_weights=True)
#     elif not weights:
#         cb = load_edges(cb_sframes, load_vertices(cb_sframes, SGraph()), p1_affiliations=[], include_edges=[2,3], reverse=False, add_weights=False)
    
#     # If adding reversed edges...
#     if reverse_edges:
#         print('- ADDING EDGES IN THE REVERSE DIRECTION')
#         # If adding weights...
#         if weights:
#             print('  - ADDING WEIGHTS IN THE REVERSE DIRECTION')
#             cb = load_edges(cb_sframes, cb, p1_affiliations=[], include_edges=[2,3], reverse=True, add_weights=True)
#         elif not weights:
#             cb = load_edges(cb_sframes, cb, p1_affiliations=[], include_edges=[2,3], reverse=True, add_weights=False)

# #     # Before comparison
# #     before = cb.summary()
# #     before_pri = cb.get_edges(fields={'status':'primary'}).shape[0]
# #     before_sec = cb.get_edges(fields={'status':'secondary'}).shape[0]
# #     before_ter = cb.get_edges(fields={'status':'tertiary'}).shape[0]

#     # Get list of edge fields
#     graph_edge_fields = cb.get_edge_fields()
    
#     # If removing parallel edges...
#     if remove_parallel_edges:
#         print('- REMOVING PARALLEL EDGES')
#         # Create temporary edge attribute that you'll use in aggregate function
#         cb.edges['relationship'] = cb.edges['status']
#         if weights:
#             cb = SGraph(cb.vertices, cb.edges.groupby(['__src_id','__dst_id','__edge_type','weight'], {'status': aggregate.SELECT_ONE('relationship')}))
#         elif not weights:
#             cb = SGraph(cb.vertices, cb.edges.groupby(['__src_id','__dst_id','__edge_type'], {'status': aggregate.SELECT_ONE('relationship')}))
#     elif not remove_parallel_edges:
#         # Create temporary edge attribute that you'll use in aggregate function
#         cb.edges['combined'] = cb.edges['__id']+','+cb.edges['status']+','+cb.edges['__src_id']+','+cb.edges['__dst_id']
#         cb = SGraph(cb.vertices, cb.edges.groupby(graph_edge_fields, {'combined': aggregate.SELECT_ONE('combined')}))
#         del cb.edges['combined']

#     # After comparison
#     after = cb.summary()
#     after_pri = cb.get_edges(fields={'status':'primary'}).shape[0]
#     after_sec = cb.get_edges(fields={'status':'secondary'}).shape[0]
#     after_ter = cb.get_edges(fields={'status':'tertiary'}).shape[0]

# #     # Output
# #     print('\nRemove duplicates from Crunchbase graph')
# #     print('\nNode change: {:,} --> {:,}'.format(before['num_vertices'], after['num_vertices']))
# #     print('Edge change: {:,} --> {:,}'.format(before['num_edges'], after['num_edges']))
# #     print('\nPRIMARY Edge change: {:,} --> {:,}'.format(before_pri,after_pri))
# #     print('SECONDARY Edge change: {:,} --> {:,}'.format(before_sec,after_sec))
# #     print('TERTIARY Edge change: {:,} --> {:,}'.format(before_ter,after_ter))
    
#     # Save and load graphs
#     # UPDATE PATH 
#     if not reverse_edges and  not remove_parallel_edges: #(~A,~B)
#         name = 'Cruncbase_1Way_MultiEdge'
#     elif reverse_edges and not remove_parallel_edges: #(A,~B)
#         name = 'Crunchbase_2Ways_MultiEdge'
#     elif not reverse_edges and remove_parallel_edges: #(~A,B)
#         name = 'Cruncbase_1Way_SingleEdge'
#     elif reverse_edges and remove_parallel_edges: #(A,B)
#         name = 'Crunchbase_2Ways_SingleEdge'
#     if weights:
#         name += '_Weighted'
#     print('\nSAVING {}: ({},{})'.format(name, after['num_vertices'],after['num_edges']))
#     print('*'*50)
#     path = 'CrunchbaseGraphs/{}'.format(name)
#     cb.save(path)
#     cb = load_sgraph(path)
#     return cb

# # Construct all 8
# for weights_bool in [False, True]:
#     for reverse_bool in [False, True]:
#         for parallel_bool in [False, True]:
#             cb = make_graph(cb_sframes, weights=weights_bool, reverse_edges=reverse_bool, remove_parallel_edges=parallel_bool)

### Load Graph(s) In (Checkpoint!)
- `Cruncbase_1Way_MultiEdge`: Directed SGraph, one way, parallel edges ***MAIN GRAPH
    - $G(V,E) = (1290346, 2085199)$

- `Cruncbase_1Way_SingleEdge`: Directed SGraph, one way, **no parallel edges**
    - $G(V,E) = (1290346, 981877)$

- `Crunchbase_2Ways_MultiEdge`: Directed SGraph, **two ways**, parallel edges **WHEN NEEDED FOR FEATURES
    - $G(V,E) = (1290346, 4170144)$  
    
- `Crunchbase_2Ways_SingleEdge`: Directed SGraph, two ways, **no parallel edges**
    - $G(V,E) = (1290346, 1963489)$
    
And there are another 4 w/ weights added! See list in code cell.

In [29]:
# Load
#cb = load_sgraph('CrunchbaseGraphs/Cruncbase_1Way_MultiEdge')
#cb = load_sgraph('CrunchbaseGraphs/Crunchbase_2Ways_MultiEdge')
#cb = load_sgraph('CrunchbaseGraphs/Cruncbase_1Way_SingleEdge')
#cb = load_sgraph('CrunchbaseGraphs/Crunchbase_2Ways_SingleEdge')

# With Weights
#cb = load_sgraph('CrunchbaseGraphs/Cruncbase_1Way_MultiEdge_Weighted')
#cb = load_sgraph('CrunchbaseGraphs/Crunchbase_2Ways_MultiEdge_Weighted')
#cb = load_sgraph('CrunchbaseGraphs/Cruncbase_1Way_SingleEdge_Weighted')
#cb = load_sgraph('CrunchbaseGraphs/Crunchbase_2Ways_SingleEdge_Weighted')

# 5. Reduce size of dataset by limiting degrees of freedom from Pledge 1% companies. Comment out once you've saved the vertices list.

This creates the `ALL_CB_Pick_Sample_Companies_From_Here.csv`,`DEGREE_4_Pick_Sample_Companies_From_Here.csv`, `DEGREE_2_Pick_Sample_Companies_From_Here.csv` files.

In [None]:
# # Get subgraph vertices to sample from
# cb_vertices = cb.get_vertices()

# # Append investors + companies together into new SFrame
# sample_vertices = cb_vertices[cb_vertices['__node_type']=='investor']
# sample_vertices = sample_vertices.append(cb_vertices[cb_vertices['__node_type']=='company'])

# # Save to CSV so you don't have to re-do this !
# pd.DataFrame(sample_vertices).to_csv('ALL_CB_Pick_Sample_Companies_From_Here.csv', index=False)

### Reduce the CB dataset

- Retrieve the graph neighborhood around a set of vertices, ignoring edge directions.
- <a href='https://apple.github.io/turicreate/docs/api/generated/turicreate.SGraph.get_neighborhood.html'>turicreate.SGraph.get_neighborhood</a>

In [30]:
# # Define radius for calculating degrees of separation away from Pledge 1% companies
# rad = 5

# # Create subgraph
# cb_smol = cb.get_neighborhood(ids=p1_companies_uuid, radius=rad, full_subgraph=True)

# # Save dictionaries which store info about graph
# before = cb.summary() # Full graph
# after = cb_smol.summary() # Subgraph

# # Output
# print('Radius of the neighborhood: {} degrees of separation from Pledge 1% companies uuids'.format(rad))
# print('Reduction in nodes: {:.2f}%'.format((1-(after['num_vertices']/before['num_vertices']))*100))
# print('Reduction in edges: {:.2f}%'.format((1-(after['num_edges']/before['num_edges']))*100))
# print('\nNode change: {:,} --> {:,}'.format(before['num_vertices'], after['num_vertices']))
# print('Edge change: {:,} --> {:,}'.format(before['num_edges'], after['num_edges']))

# # Get subgraph vertices to sample from
# cb_smol_vertices = cb_smol.get_vertices()

# # Append investors + companies together into new SFrame
# sample_vertices = cb_smol_vertices[cb_smol_vertices['__node_type']=='investor']
# sample_vertices = sample_vertices.append(cb_smol_vertices[cb_smol_vertices['__node_type']=='company'])

# # Save to CSV so you don't have to re-do this !
# pd.DataFrame(sample_vertices).to_csv('DEGREE_5_Pick_Sample_Companies_From_Here.csv', index=False)

Radius of the neighborhood: 5 degrees of separation from Pledge 1% companies uuids
Reduction in nodes: 64.26%
Reduction in edges: 21.36%

Node change: 1,290,346 --> 461,229
Edge change: 4,170,144 --> 3,279,543


# 6. Produce 100 samples of the Crunchbase graphs. 10 for each scenario below. Save to CSV.

**Graph Network Size: 4 Degrees (`Model_DF_D4`)**
1. Features: Baseline reduced only
2. Features: Baseline only
3. Features: Graph only
4. Features: Graph + Baseline reduced
5. Features: Graph + Baseline
**Graph Network Size: 5 Degrees (`Model_DF_D5`)**
- Same as above (6-10)


In [23]:
# lst_of_frames = []
# for val in ['cb','p1']:
#     lst = []
#     for idx in range(12):
#         path = 'files/output/graph_temp/{}/{}_df.csv'.format(val, idx)
#         lst.append(SFrame(data=path))
#     lst_of_frames.append(lst)
# cb_sframes,p1_sframes = lst_of_frames

# # List of Pledge 1% uuids
# global p1_companies_uuid
# p1_companies_uuid = []
# p1_companies_uuid.extend(list(p1_sframes[0]['uuid'].unique()))
# p1_companies_uuid.extend(list(p1_sframes[1]['uuid'].unique()))
# p1_companies_uuid = list(set(p1_companies_uuid))

In [40]:
# # Load CB Graphs
# cb0 = load_sgraph('CrunchbaseGraphs/Cruncbase_1Way_MultiEdge')
# cb1 = load_sgraph('CrunchbaseGraphs/Crunchbase_2Ways_MultiEdge')
# cb2 = load_sgraph('CrunchbaseGraphs/Cruncbase_1Way_SingleEdge')
# cb3 = load_sgraph('CrunchbaseGraphs/Crunchbase_2Ways_SingleEdge')

# # Load CB Graphs With Weights
# cb0w = load_sgraph('CrunchbaseGraphs/Cruncbase_1Way_MultiEdge_Weighted')
# cb1w = load_sgraph('CrunchbaseGraphs/Crunchbase_2Ways_MultiEdge_Weighted')
# cb2w = load_sgraph('CrunchbaseGraphs/Cruncbase_1Way_SingleEdge_Weighted')
# cb3w = load_sgraph('CrunchbaseGraphs/Crunchbase_2Ways_SingleEdge_Weighted')

# # P1 Companie uuids
# positive_labels = pd.read_csv('Pledge1_09_08_2020.csv')['src'].to_list()

# # Vertices from different Crunchbase graphs
# ALL_vertices = pd.read_csv('ALL_CB_Pick_Sample_Companies_From_Here.csv')
# DEGREE_5_vertices = pd.read_csv('DEGREE_5_Pick_Sample_Companies_From_Here.csv')
# DEGREE_4_vertices = pd.read_csv('DEGREE_4_Pick_Sample_Companies_From_Here.csv')
# DEGREE_2_vertices = pd.read_csv('DEGREE_2_Pick_Sample_Companies_From_Here.csv')

## (DONE) Output 20 graphs for each graph network.

### The code below is for Baseline Reduced (`BR`) & Baseline (`B`) scenarios, which require no graph feature calculations.
- Included `Model_DF_D4`,`Model_DF_D5` (plus `Model_DF_D2`, `Model_DF_ALL`)

In [13]:
# # P1 Companie uuids
# positive_labels = pd.read_csv('Pledge1_09_08_2020.csv')['src'].to_list()

# # Grab relevant neighborhood
# ALL_vertices = pd.read_csv('ALL_CB_Pick_Sample_Companies_From_Here.csv')
# DEGREE_5_vertices = pd.read_csv('DEGREE_5_Pick_Sample_Companies_From_Here.csv')
# DEGREE_4_vertices = pd.read_csv('DEGREE_4_Pick_Sample_Companies_From_Here.csv')
# DEGREE_2_vertices = pd.read_csv('DEGREE_2_Pick_Sample_Companies_From_Here.csv')

# # Setting up loop
# neighborhoods_name = ['Model_DF_D2', 'Model_DF_D4', 'Model_DF_D5', 'Model_DF_ALL']
# neighborhoods = [DEGREE_2_vertices, DEGREE_4_vertices, DEGREE_5_vertices, ALL_vertices]
# neighborhoods_dict = dict(zip(neighborhoods_name,neighborhoods))

# for neighborhood in neighborhoods_name:
#     for scenario in ['B', 'BR']:
#         for idx in range(10):
            
#             DF = neighborhoods_dict[neighborhood]
#             # Sample equal size of non-P1 companies from vertices dataframe
#             negatives_labels = DF.sample(int(len(positive_labels)), replace=False)['__id'].to_list()
        
#             # Combine, avoid duplicates
#             model_labels = list(np.unique(positive_labels + negatives_labels))
            
#             # Reduce to sample CSV
#             smol_DF = DF[['__id']][DF['__id'].isin(model_labels)].reset_index(drop=True).rename({'__id':'uuid'},axis=1)
            
#             # Output to CSV
#             path = 'files/output/{}/{}/{}.csv'.format(neighborhood,scenario,idx)
#             smol_DF.to_csv(path, index=False)
#             print('SAVING to {}\n'.format(path))

In [32]:
# # Fields needed for this function
# lst_of_graphs = [cb0,cb1,cb2,cb3,cb0w,cb1w,cb2w,cb3w]
# sgraph_idx_assign = {0:'cb0',1:'cb1',2:'cb2',3:'cb3',0:'cb0',1:'cb1',2:'cb2',3:'cb3'}
# vertex_type_list = ['cb_smol_ALL', 'cb_smol_D4', 'cb_smol_D2']
# model_uuids_dict = {v:[] for v in vertex_type_list}

# def make_smol_sgraphs(positive_labels, vertex_df, string, SGraph_list, radius=3):
    
#     # Sample equal size of non-P1 companies from vertices dataframe
#     negatives_labels = vertex_df.sample(int(len(positive_labels)), replace=False)['__id'].to_list()
        
#     # Combine, avoid duplicates
#     model_labels = list(np.unique(positive_labels + negatives_labels))

#     for idx,graph in enumerate(lst_of_graphs):
            
#         # Create subgraph
#         print('Creating graph {}'.format(sgraph_idx_assign[idx].upper()))
#         smol = graph.get_neighborhood(ids=model_labels, radius=radius, full_subgraph=True)   
            
#         # Save subgraph
#         path = 'ModelGraphs/test/{}_{}'.format(string,sgraph_idx_assign[idx])
#         smol.save(path)
#         print('SAVING to {}\n'.format(path))
        
#     # Output model labels for this set of graphs
#     return model_labels

# model_labels = make_smol_sgraphs(positive_labels, ALL_vertices, 'cb_smol_ALL', lst_of_graphs, radius=3)
# model_uuids_dict['cb_smol_ALL'] = model_labels

# model_labels = make_smol_sgraphs(positive_labels, DEGREE_4_vertices, 'cb_smol_D4',lst_of_graphs, radius=3)
# model_uuids_dict['cb_smol_D4'] = model_labels

# model_labels = make_smol_sgraphs(positive_labels, DEGREE_2_vertices, 'cb_smol_D2', lst_of_graphs, radius=3)
# model_uuids_dict['cb_smol_D2'] = model_labels

### `feature_creation` methods for computing Graph features

#### Pagerank
- The pagerank.create() method computes the pagerank for each vertex and returns a PagerankModel. The pagerank value indicates the centrality of each node in the graph.
- Compute the PageRank for each vertex in the graph. Return a model object with total PageRank as well as the PageRank value for each vertex in the graph.
- <a href='https://apple.github.io/turicreate/docs/api/generated/turicreate.pagerank.create.html#turicreate.pagerank.create'>turicreate.pagerank.create</a>

#### Shortest path
- Compute the single source shortest path distance from the source vertex to all vertices in the graph. Note that because SGraph is directed, shortest paths are also directed. To find undirected shortest paths add edges to the SGraph in both directions. Return a model object with distance each of vertex in the graph.
- <a href='https://apple.github.io/turicreate/docs/api/generated/turicreate.shortest_path.create.html#turicreate.shortest_path.create'>turicreate.shortest_path.create</a>

#### K-core decomposition
- Compute the K-core decomposition of the graph. Return a model object with total number of cores as well as the core id for each vertex in the graph.
- <a href='https://apple.github.io/turicreate/docs/api/generated/turicreate.kcore.create.html'>turicreate.kcore.create</a>

#### Degree counting
- Compute the in degree, out degree and total degree of each vertex.
- <a href='https://apple.github.io/turicreate/docs/api/generated/turicreate.degree_counting.create.html#turicreate.degree_counting.create'>turicreate.degree_counting.create</a>

#### Triangle Counting
- Compute the number of triangles each vertex belongs to, ignoring edge directions. A triangle is a complete subgraph with only three vertices. Return a model object with total number of triangles as well as the triangle counts for each vertex in the graph.
- <a href='https://apple.github.io/turicreate/docs/api/generated/turicreate.triangle_counting.create.html#turicreate.triangle_counting.create'>turicreate.triangle_counting.create</a>



In [57]:
def feature_creation(model_labels, list_of_graphs, p1_companies_uuid, radius=3):
    turicreate.config.set_runtime_config('TURI_DEFAULT_NUM_PYLAMBDA_WORKERS', 96)
    list_of_frames = []
    for idx,graph in enumerate(list_of_graphs):
            
        # CREATE SUBGRAPH
        print('Creating graph {}'.format(sgraph_idx[idx].upper()))
        smol_graph = graph.get_neighborhood(ids=model_labels, radius=radius, full_subgraph=True)   
        
        # FUNCTION FOR PAGERANK
        print('HERE_PR')
        DF_PG = add_pagerank(smol_graph, model_labels, idx)
        print(DF_PG.columns.to_list())
        if idx==1:
            DF_PG_1 = DF_PG
        if DF_PG.shape[0] != 0:
            list_of_frames.append(DF_PG)
        
        # FUNCTION FOR WEIGHTED PAGERANK
        print('HERE_PR_W')
        DF_PG_W = add_weighted_pagerank(smol_graph, model_labels, idx)
        print(DF_PG_W.columns.to_list())
        if idx==1:
            DF_PG_W_1 = DF_PG_W
        if DF_PG_W.shape[0] != 0:
            list_of_frames.append(DF_PG_W)
        
        # FUNCTION FOR SHORTEST PATH TOP 5
        print('HERE_SP')
        if idx==3:
            DF = add_shortest_path(smol_graph, model_labels, idx, DF_PG_1, p1_companies_uuid)
        else:
            DF = add_shortest_path(smol_graph, model_labels, idx, DF_PG, p1_companies_uuid)
        print(DF.columns.to_list())
        if DF.shape[0] != 0:
            list_of_frames.append(DF)
        
        # FUNCTION FOR SHORTEST PATH TOP 5 WEIGHTED
        print('HERE_SP_W')
        if idx==3:
            DF = add_weighted_shortest_path(smol_graph, model_labels, idx, DF_PG_W_1, p1_companies_uuid)
        else:
            DF = add_weighted_shortest_path(smol_graph, model_labels, idx, DF_PG_W, p1_companies_uuid)
        print(DF.columns.to_list())
        if DF.shape[0] != 0:
            list_of_frames.append(DF)
        
        # FUNCTION FOR K-CORE DECOPOSITION
        print('HERE_KC')
        DF = add_kcore(smol_graph, model_labels, idx)
        print(DF.columns.to_list())
        if DF.shape[0] != 0:
            list_of_frames.append(DF)
        
        # FUNCTION FOR DEGREES
        print('HERE_D')
        DF = add_degree(smol_graph, model_labels, idx)
        print(DF.columns.to_list())
        if DF.shape[0] != 0:
            list_of_frames.append(DF)
        
        # FUNCTION FOR TRIANGLE
        print('HERE_T')
        DF = add_triangle(smol_graph, model_labels, idx)
        print(DF.columns.to_list())
        if DF.shape[0] != 0:
            list_of_frames.append(DF)
    
    # Merge all feature dataframes together
    DF_ALL = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), list_of_frames)
    print('DATAFRAME SHAPE: {}'.format(DF_ALL.shape))
    
    # Output final DF
    return DF_ALL

def add_pagerank(graph, model_labels, index):
    turicreate.config.set_runtime_config('TURI_DEFAULT_NUM_PYLAMBDA_WORKERS', 96)
    # If this particular graph is in the list of approved graphs, then continue, otherwise return empty dataframe
    if sgraph_idx[index] in feat_graph_map['pagerank']:
        # Create pagerank SFrame
        pr = pagerank.create(graph, verbose=False)
        pr_sframe = pr['pagerank']
        # Modifying output SFrame
        pr_df = pd.DataFrame(pr_sframe)
        pr_df = pr_df.drop('delta', axis=1)
        pr_df = pr_df[pr_df['__id'].isin(model_labels)].reset_index(drop=True)
        pr_df = pr_df.rename({'pagerank':'pr_{}'.format(index)}, axis=1)
        # Return modified dataframe
        return pr_df
    else:
        # Return empty dataframe
        return pd.DataFrame(columns=['__id'])

def add_weighted_pagerank(graph, model_labels, index):
    turicreate.config.set_runtime_config('TURI_DEFAULT_NUM_PYLAMBDA_WORKERS', 96)
    # If this particular graph is in the list of approved graphs, then continue, otherwise return empty dataframe
    if sgraph_idx[index] in feat_graph_map['pagerank_weight']:
        pr_w = pagerank_weighted(graph)
        pr_w_sframe = pr_w['__id', 'pagerank']
        # Modifying output SFrame
        pr_w_df = pd.DataFrame(pr_w_sframe)
        pr_w_df = pr_w_df[pr_w_df['__id'].isin(model_labels)].reset_index(drop=True)
        pr_w_df = pr_w_df.rename({'pagerank':'w_pr_{}'.format(index)}, axis=1)
        # Return modified dataframe
        return pr_w_df
    else:
        # Return empty dataframe
        return pd.DataFrame(columns=['__id'])   
      
def add_shortest_path(graph, model_labels, index, pagerank_dataframe, p1_companies_uuid):
    #mapping_for_pr = {1:1, 2:1}
    turicreate.config.set_runtime_config('TURI_DEFAULT_NUM_PYLAMBDA_WORKERS', 96)
    # If this particular graph is in the list of approved graphs, then continue, otherwise return empty dataframe
    if sgraph_idx[index] in feat_graph_map['shortest']:
        # Grab pagerank dataframe
        pr = pagerank_dataframe[['__id', 'pr_1']].sort_values(by='pr_1',ascending=False)
        pr = pr['__id'].to_list()
        # Find top 5 p1 companies 
        count = 0
        top_p1 = []
        while len(top_p1) < 5:
            if pr[count] in p1_companies_uuid:
                top_p1.append(pr[count])
            count += 1
            print(count)
        # Loop over top 5 companies to find shortest path to each
        list_of_frames = []
        for jdx,uuid in enumerate(top_p1):
            # Create shortest path SFrame
            sp = shortest_path.create(graph, source_vid=uuid, verbose=False)
            sp_sframe = sp['distance']
            # Modifying output SFrame
            sp_df = pd.DataFrame(sp_sframe)
            sp_df = sp_df[sp_df['__id'].isin(model_labels)].reset_index(drop=True)
            sp_df = sp_df.rename({'distance': 'spath_top_{}_{}'.format(index,jdx)}, axis=1)
            list_of_frames.append(sp_df)
        # Combine 5 shortest path columns
        sp_df = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), list_of_frames)
        # Add minimum path (to top 5) column
        sp_df['spath_top_min_{}'.format(index)] = sp_df.min(axis=1)
        # Return modified dataframe
        return sp_df
    else:
        # Return empty dataframe
         return pd.DataFrame(columns=['__id'])

def add_weighted_shortest_path(graph, model_labels, index, pagerank_dataframe_weighted, p1_companies_uuid):
    #mapping_for_pr = {1:1, 2:1}
    turicreate.config.set_runtime_config('TURI_DEFAULT_NUM_PYLAMBDA_WORKERS', 96)
    # If this particular graph is in the list of approved graphs, then continue, otherwise return empty dataframe
    if sgraph_idx[index] in feat_graph_map['shortest_weight']:
        # Grab weighted pagerank dataframe
        pr = pagerank_dataframe_weighted[['__id', 'w_pr_1']].sort_values(by='w_pr_1',ascending=False)
        pr = pr['__id'].to_list()
        # Find top 5 p1 companies 
        count = 0
        top_p1 = []
        while len(top_p1) < 5:
            if pr[count] in p1_companies_uuid:
                top_p1.append(pr[count])
            count += 1
            print(count)
        # Loop over top 5 companies to find shortest path to each
        list_of_frames = []
        for jdx,uuid in enumerate(top_p1):
            # Create shortest path SFrame
            sp = shortest_path.create(graph, source_vid=uuid, weight_field='weight', verbose=False)
            sp_sframe = sp['distance']
            # Modifying output SFrame
            sp_df = pd.DataFrame(sp_sframe)
            sp_df = sp_df[sp_df['__id'].isin(model_labels)].reset_index(drop=True)
            sp_df = sp_df.rename({'distance': 'w_spath_top_{}_{}'.format(index,jdx)}, axis=1)
            list_of_frames.append(sp_df)
        # Combine 5 shortest path columns
        sp_df = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), list_of_frames)
        # Add minimum path (to top 5) column
        sp_df['w_spath_top_min_{}'.format(index)] = sp_df.min(axis=1)
        # Return modified dataframe
        return sp_df
    else:
        # Return empty dataframe
         return pd.DataFrame(columns=['__id'])

def add_kcore(graph, model_labels, index):
    turicreate.config.set_runtime_config('TURI_DEFAULT_NUM_PYLAMBDA_WORKERS', 96)
    # If this particular graph is in the list of approved graphs, then continue, otherwise return empty dataframe
    if sgraph_idx[index] in feat_graph_map['kcore']:
        # Create kcore SFrame
        kc = kcore.create(graph, kmin=0, kmax=5, verbose=False)
        kc_sframe = kc['core_id'] 
        # Modifying output SFrame
        kc_df = pd.DataFrame(kc_sframe)
        kc_df = kc_df[kc_df['__id'].isin(model_labels)].reset_index(drop=True)
        kc_df = kc_df.rename({'core_id':'kc_{}'.format(index)}, axis=1)
        # Return modified dataframe
        return kc_df
    else:
        # Return empty dataframe
         return pd.DataFrame(columns=['__id'])

def add_degree(graph, model_labels, index):
    turicreate.config.set_runtime_config('TURI_DEFAULT_NUM_PYLAMBDA_WORKERS', 96)
    # If this particular graph is in the list of approved graphs, then continue, otherwise return empty dataframe
    if sgraph_idx[index] in feat_graph_map['degree']:
        # Create degree SGraph
        deg = degree_counting.create(graph)
        deg_sgraph = deg['graph'] 
        # Modifying output SFrame
        deg_df = pd.DataFrame(deg_sgraph.vertices[['__id', 'in_degree', 'out_degree']])
        deg_df = deg_df[deg_df['__id'].isin(model_labels)].reset_index(drop=True)
        deg_df = deg_df.rename({'in_degree':'in_deg_{}'.format(index),'out_degree':'out_deg_{}'.format(index)}, axis=1)
        # Return modified dataframe
        return deg_df
    else:
        # Return empty dataframe
         return pd.DataFrame(columns=['__id'])
        
def add_triangle(graph, model_labels, index):
    turicreate.config.set_runtime_config('TURI_DEFAULT_NUM_PYLAMBDA_WORKERS', 96)
    # If this particular graph is in the list of approved graphs, then continue, otherwise return empty dataframe
    if sgraph_idx[index] in feat_graph_map['triangle']:
        # Create triangle counting SFrame
        tc = triangle_counting.create(graph, verbose=False)
        tc_sframes = tc['triangle_count']
        # Modifying output SFrame
        tri_df = pd.DataFrame(tc_sframes)
        tri_df = tri_df[tri_df['__id'].isin(model_labels)].reset_index(drop=True)
        tri_df = tri_df.rename({'triangle_count':'tri_{}'.format(index)},axis=1)
        # Return modified dataframe
        return tri_df
    else:
        # Return empty dataframe
         return pd.DataFrame(columns=['__id'])

### (IN PROGRESS) Output 20 graphs for each graph network. Produce samples for Graph only (`G`), Graph & Baseline (`GB`), Graph & Baseline Reduced (`GBR`) scenarios in 2 graph networks.

In [60]:
#turicreate.config.set_runtime_config('TURI_DEFAULT_NUM_GRAPH_LAMBDA_WORKERS', 96)
turicreate.config.set_num_gpus(1)
# # Load CB Graphs
# cb0 = load_sgraph('CrunchbaseGraphs/Cruncbase_1Way_MultiEdge')
# cb1 = load_sgraph('CrunchbaseGraphs/Crunchbase_2Ways_MultiEdge')
# cb2 = load_sgraph('CrunchbaseGraphs/Cruncbase_1Way_SingleEdge')
# cb3 = load_sgraph('CrunchbaseGraphs/Crunchbase_2Ways_SingleEdge')

# Load CB Graphs With Weights
cb0w = load_sgraph('CrunchbaseGraphs/Cruncbase_1Way_MultiEdge_Weighted')
cb1w = load_sgraph('CrunchbaseGraphs/Crunchbase_2Ways_MultiEdge_Weighted')
cb2w = load_sgraph('CrunchbaseGraphs/Cruncbase_1Way_SingleEdge_Weighted') # Wasn't needed
cb3w = load_sgraph('CrunchbaseGraphs/Crunchbase_2Ways_SingleEdge_Weighted')

# P1 Companie uuids
positive_labels = pd.read_csv('Pledge1_09_08_2020.csv')['src'].to_list()

# Grab relevant neighborhood
#ALL_vertices = pd.read_csv('ALL_CB_Pick_Sample_Companies_From_Here.csv')
DEGREE_5_vertices = pd.read_csv('DEGREE_5_Pick_Sample_Companies_From_Here.csv')
DEGREE_4_vertices = pd.read_csv('DEGREE_4_Pick_Sample_Companies_From_Here.csv')
#DEGREE_2_vertices = pd.read_csv('DEGREE_2_Pick_Sample_Companies_From_Here.csv')

# Setting up loop
neighborhoods_name = ['Model_DF_D4', 'Model_DF_D5']
neighborhoods = [DEGREE_4_vertices, DEGREE_5_vertices]
#neighborhoods_name = ['Model_DF_D2', 'Model_DF_D4', 'Model_DF_D5', 'Model_DF_ALL']
#neighborhoods = [DEGREE_2_vertices, DEGREE_4_vertices, DEGREE_5_vertices, ALL_vertices]
neighborhoods_dict = dict(zip(neighborhoods_name,neighborhoods))

# Fields needed
#sgraph_idx = {0:'cb0',1:'cb1',2:'cb2',3:'cb3',4:'cb0w',5:'cb1w',6:'cb2w',7:'cb3w'}
sgraph_idx = {0:'cb0w',1:'cb1w',2:'cb2w', 3:'cb3w'}

sgraph_idx_inv = {v:k for (k,v) in sgraph_idx.items()} # For saving the right column name
#list_of_graphs = [cb0,cb1,cb2,cb3,cb0w,cb1w,cb2w,cb3w]
list_of_graphs = [cb0w,cb1w,cb2w,cb3w]
#list_of_graphs = [cb0w,cb1w,cb3w]

# Coordinating -- for loading in graphs
feat_graph_map = {'pagerank':['cb0w', 'cb1w', 'cb2w'],
                  'pagerank_weight':['cb0w', 'cb1w', 'cb2w'],
                  'kcore':['cb2w', 'cb3w'], # Number of edges does not matter, single edge
                  'degree':['cb0w', 'cb1w', 'cb2w', 'cb3w'], # Doesn't require a lot of computational power
                  'triangle':['cb0w', 'cb2w'], # Ignores edge directions, 1-way
                  'shortest':['cb3w'],  # Requires bi-directional edges
                  'shortest_weight':['cb3w']} # Requires bi-directional edges

for neighborhood in neighborhoods_name: # 2 times
    for scenario in ['G','GB','GBR']: # 3 times
        for idx in range(1,10): # 10 times
            print('{} | {} | {}'.format(neighborhood,scenario,idx))
            print('*'*50)
            # Grab neighborhood DF to start with
            DF = neighborhoods_dict[neighborhood]
            # Sample equal size of non-P1 companies from vertices dataframe
            negatives_labels = DF.sample(int(len(positive_labels)), replace=False)['__id'].to_list()
            # Combine, avoid duplicates
            model_labels = list(np.unique(positive_labels + negatives_labels))
            # SEND TO GRAPH FEATURE METHOD WHICH: CREATES GRAPH FOR FEATURE & APPENDS FEATURE TO MODEL DATAFRAME
            smol_DF = feature_creation(model_labels, list_of_graphs, p1_companies_uuid)
            # Output to CSV
            path = 'files/output/{}/{}/{}.csv'.format(neighborhood,scenario,idx)
            smol_DF.to_csv(path, index=False)
            print('SAVING to {}\n'.format(path))

Model_DF_D4 | G | 1
**************************************************
Creating graph CB0W
HERE_PR
['__id', 'pr_0']
HERE_PR_W
Iteration 0: total pagerank changed in L1 = 308819.196625
Iteration 1: total pagerank changed in L1 = 74723.674463
Iteration 2: total pagerank changed in L1 = 74077.515237
Weighted pagerank finished in: 59.649648 secs
['__id', 'w_pr_0']
HERE_SP
['__id']
HERE_SP_W
['__id']
HERE_KC
['__id']
HERE_D
['__id', 'in_deg_0', 'out_deg_0']
HERE_T
['__id', 'tri_0']
Creating graph CB1W
HERE_PR
['__id', 'pr_1']
HERE_PR_W
Iteration 0: total pagerank changed in L1 = 206601.596298
Iteration 1: total pagerank changed in L1 = 8781.752519
Iteration 2: total pagerank changed in L1 = 0.000000
Weighted pagerank finished in: 95.212619 secs
['__id', 'w_pr_1']
HERE_SP
['__id']
HERE_SP_W
['__id']
HERE_KC
['__id']
HERE_D
['__id', 'in_deg_1', 'out_deg_1']
HERE_T
['__id']
Creating graph CB2W
HERE_PR
['__id', 'pr_2']
HERE_PR_W
Iteration 0: total pagerank changed in L1 = 310731.924903
Iteratio

### Old code from previous graph feature testing...

In [96]:
# # Coordinating -- for loading in graphs
# vertex_type_list = ['cb_smol_ALL', 'cb_smol_D4','cb_smol_D2']
# feat_graph_map = {'pagerank':['cb0','cb1','cb2','cb3'], 
#                   'kcore':['cb0','cb1','cb2','cb3'],
#                   'degree':['cb0','cb1'], 
#                   'triangle':['cb0','cb1'],
#                   'shortest':['cb1', 'cb3'], 
#                   'shortest_weight':['cb1w', 'cb3w']}
# vertex_df_map = {v:pd.DataFrame(columns=['__id']) for v in vertex_type_list}

In [39]:
# from turicreate import pagerank
# from functools import reduce

# # Mapping for this function
# sgraph_idx_assign = {0:'cb0',1:'cb1',2:'cb2',3:'cb3'}

# if not len(sgraph_idx_assign.items())==len(feat_graph_map['pagerank']):
#     print('THE ASSIGNMENT DOES NOT MATCH NUMBER OF GRAPHS')

# for vertex_type in vertex_type_list:
#     lst_of_frames = []
#     for idx,smol in enumerate(feat_graph_map['pagerank']):
#         print('CaLcUlAtInG pAgeRaNk for graph {}, in graph neighborhood {}'.format(sgraph_idx_assign[idx].upper(),vertex_type.upper()))
#         path = 'ModelGraphs/test/{}_{}'.format(vertex_type,smol)
#         graph = load_sgraph(path)
#         pr = pagerank.create(graph, verbose=False)
#         pr_sframe = pr['pagerank']

#         # Modifying output SFrame
#         pr_df = pd.DataFrame(pr_sframe)
#         pr_df = pr_df.drop('delta', axis=1)
#         pr_df = pr_df[pr_df['__id'].isin(model_uuids_dict[vertex_type])].reset_index(drop=True)
#         pr_df = pr_df.rename({'pagerank':'pr_{}'.format(idx)}, axis=1)
        
#         # Save to temp lst_of_frames
#         lst_of_frames.append(pr_df)
    
#     PR_DF = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_frames)
#     vertex_df_map[vertex_type] = pd.merge(vertex_df_map[vertex_type], PR_DF, on='__id', how='outer')
    
#################################################################################
# from turicreate import kcore
# # Mapping for this function
# sgraph_idx_assign = {0:'cb0',1:'cb1',2:'cb2',3:'cb3'}

# if not len(sgraph_idx_assign.items())==len(feat_graph_map['kcore']):
#     print('THE ASSIGNMENT DOES NOT MATCH NUMBER OF GRAPHS')

# for vertex_type in vertex_type_list:
#     lst_of_frames = []
#     for idx,smol in enumerate(feat_graph_map['kcore']):
#         print('CaLcUlAtInG kCoRe for graph {}, in graph neighborhood {}'.format(sgraph_idx_assign[idx].upper(),vertex_type.upper()))
#         path = 'ModelGraphs/test/{}_{}'.format(vertex_type, smol)
#         graph = load_sgraph(path)
#         kc = kcore.create(graph, kmin=0, kmax=10, verbose=False)
#         kc_sframe = kc['core_id'] 
        
#         # Modifying output SFrame
#         kc_df = pd.DataFrame(kc_sframe)
#         kc_df = kc_df[kc_df['__id'].isin(model_uuids_dict[vertex_type])].reset_index(drop=True)
#         kc_df = kc_df.rename({'core_id':'kc_{}'.format(idx)}, axis=1)
        
#         # Save to temp lst_of_frames
#         lst_of_frames.append(kc_df)
    
#     KC_DF = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_frames)
#     vertex_df_map[vertex_type] = pd.merge(vertex_df_map[vertex_type], KC_DF, on='__id', how='outer')

#################################################################################
# from turicreate import degree_counting
# # Mapping for this function
# sgraph_idx_assign = {0:'cb0',1:'cb1'}

# if not len(sgraph_idx_assign.items())==len(feat_graph_map['degree']):
#     print('THE ASSIGNMENT DOES NOT MATCH NUMBER OF GRAPHS')

# for vertex_type in vertex_type_list:
#     lst_of_frames = []
#     for idx,smol in enumerate(feat_graph_map['degree']):
#         print('CaLcUlAtInG dEgReEs for graph {}, in graph neighborhood {}'.format(sgraph_idx_assign[idx].upper(),vertex_type.upper()))
#         path = 'ModelGraphs/test/{}_{}'.format(vertex_type, smol)
#         graph = load_sgraph(path)
#         deg = degree_counting.create(graph)
#         deg_sgraph = deg['graph'] 
#         deg_df = pd.DataFrame(deg_sgraph.vertices[['__id', 'in_degree', 'out_degree']])
#         deg_df = deg_df[deg_df['__id'].isin(model_uuids_dict[vertex_type])].reset_index(drop=True)
#         deg_df = deg_df.rename({'in_degree':'in_deg_{}'.format(idx),
#                              'out_degree':'out_deg_{}'.format(idx)}, axis=1)
#         # Save to temp lst_of_frames
#         lst_of_frames.append(deg_df)
#     DEG_DF = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_frames)
#     vertex_df_map[vertex_type] = pd.merge(vertex_df_map[vertex_type], DEG_DF, on='__id', how='outer')
    
#################################################################################
# from turicreate import triangle_counting
# # Mapping for this function
# sgraph_idx_assign = {0:'cb0', 1:'cb1'}

# if not len(sgraph_idx_assign.items())==len(feat_graph_map['triangle']):
#     print('THE ASSIGNMENT DOES NOT MATCH NUMBER OF GRAPHS')
    
# for vertex_type in vertex_type_list:
#     lst_of_frames = []
#     for idx,smol in enumerate(feat_graph_map['triangle']):
#         print('CaLcUlAtInG TrIaNgLeS for graph {}, in graph neighborhood {}'.format(sgraph_idx_assign[idx].upper(),vertex_type.upper()))
#         path = 'ModelGraphs/test/{}_{}'.format(vertex_type, smol)
#         graph = load_sgraph(path)
#         tc = triangle_counting.create(graph, verbose=False)
#         tri_df = pd.DataFrame(tc['triangle_count'])
#         tri_df = tri_df[tri_df['__id'].isin(model_uuids_dict[vertex_type])].reset_index(drop=True)
#         tri_df = tri_df.rename({'triangle_count':'tri_{}'.format(idx)},axis=1)
#         # Save to temp lst_of_frames
#         lst_of_frames.append(tri_df)
#     TRI_DF = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_frames)
#     vertex_df_map[vertex_type] = pd.merge(vertex_df_map[vertex_type], TRI_DF, on='__id', how='outer')
    
#################################################################################
# # Mapping for this function
# sgraph_idx_assign = {0:'cb1',1:'cb3'}
# sgraph_idx_jdx_assign = {0:1, 1:3}

# if not len(sgraph_idx_assign.items())==len(feat_graph_map['shortest']):
#     print('THE ASSIGNMENT DOES NOT MATCH NUMBER OF GRAPHS')
    
# for vertex_type in vertex_type_list:
#     lst_of_frames = []

#     for idx,smol in enumerate(feat_graph_map['shortest']):
#         print('CaLcUlAtInG sHoRtEsT PaTh tOP P1 for graph {}, in graph neighborhood {}'.format(sgraph_idx_assign[idx].upper(),vertex_type.upper()))
#         path = 'ModelGraphs/test/{}_{}'.format(vertex_type, smol)
#         graph = load_sgraph(path)
#         pr = vertex_df_map[vertex_type][['__id', 'pr_{}'.format(sgraph_idx_jdx_assign[idx])]].sort_values(by='pr_{}'.format(sgraph_idx_jdx_assign[idx]),ascending=False)
#         pr = pr['__id'].to_list()[:200]
#         count = 0
#         top_p1 = []
#         while len(top_p1) < 5:
#             if pr[count] in p1_companies_uuid:
#                 top_p1.append(pr[count])
#             count += 1
#         lst_of_lst_of_frames = []
#         for jdx,uuid in enumerate(top_p1):
#             sp = shortest_path.create(graph, source_vid=uuid, verbose=False)
#             sp_df = pd.DataFrame(sp['distance'])
#             sp_df = sp_df[sp_df['__id'].isin(model_uuids_dict[vertex_type])].reset_index(drop=True)
#             sp_df = sp_df.rename({'distance': 'spath_top_{}_{}'.format(sgraph_idx_jdx_assign[idx],jdx)}, axis=1)
#             lst_of_lst_of_frames.append(sp_df)
#         sp_df = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_lst_of_frames)
#         sp_df['spath_top_min_{}'.format(sgraph_idx_jdx_assign[idx])] = sp_df.min(axis=1) 
#         lst_of_frames.append(sp_df)

#     DIST_DF = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_frames)
#     vertex_df_map[vertex_type] = pd.merge(vertex_df_map[vertex_type], DIST_DF, on='__id', how='outer')
    
#################################################################################
# from turicreate import shortest_path

# # Mapping for this function
# sgraph_idx_assign = {0:'cb1w',1:'cb3w'}
# sgraph_idx_jdx_assign = {0:1, 1:3}

# if not len(sgraph_idx_assign.items())==len(feat_graph_map['shortest']):
#     print('THE ASSIGNMENT DOES NOT MATCH NUMBER OF GRAPHS')
    
# for vertex_type in vertex_type_list:
#     lst_of_frames = []

#     for idx,smol in enumerate(feat_graph_map['shortest_weight']):
#         print('CaLcUlAtInG sHoRtEsT PaTh tOP P1 for graph {}, in graph neighborhood {}'.format(sgraph_idx_assign[idx].upper(),vertex_type.upper()))
#         path = 'ModelGraphs/test/{}_{}'.format(vertex_type, smol)
#         graph = load_sgraph(path)
#         pr = vertex_df_map[vertex_type][['__id', 'pr_{}'.format(sgraph_idx_jdx_assign[idx])]].sort_values(by='pr_{}'.format(sgraph_idx_jdx_assign[idx]),ascending=False)
#         pr = pr['__id'].to_list()[:200]
#         count = 0
#         top_p1 = []
#         while len(top_p1) < 5:
#             if pr[count] in p1_companies_uuid:
#                 top_p1.append(pr[count])
#             count += 1
#         lst_of_lst_of_frames = []
#         for jdx,uuid in enumerate(top_p1):
#             sp = shortest_path.create(graph, source_vid=uuid, weight_field='weight', verbose=False)
#             sp_df = pd.DataFrame(sp['distance'])
#             sp_df = sp_df[sp_df['__id'].isin(model_uuids_dict[vertex_type])].reset_index(drop=True)
#             sp_df = sp_df.rename({'distance': 'w_spath_top_{}_{}'.format(sgraph_idx_jdx_assign[idx],jdx)}, axis=1)
#             lst_of_lst_of_frames.append(sp_df)
#         sp_df = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_lst_of_frames)
#         sp_df['w_spath_top_min_{}'.format(sgraph_idx_jdx_assign[idx])] = sp_df.min(axis=1) 
#         lst_of_frames.append(sp_df)

#     DIST_DF = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_frames)
#     vertex_df_map[vertex_type] = pd.merge(vertex_df_map[vertex_type], DIST_DF, on='__id', how='outer')

#################################################################################
# # Weighted pagerank
# # Mapping for this function
# sgraph_idx_assign = {0:'cb1w',1:'cb2w', 2:'cb3w', 3:'cb4w'}
# if not len(sgraph_idx_assign.items())==len(feat_graph_map['pagerank_weight']):
#     print('THE ASSIGNMENT DOES NOT MATCH NUMBER OF GRAPHS')
# for vertex_type in vertex_type_list:
#     lst_of_frames = []
#     for idx,smol in enumerate(feat_graph_map['pagerank_weight']):
#         print('CaLcUlAtInG wEiGhTeD pAgeRaNk for graph {}, in graph neighborhood {}'.format(sgraph_idx_assign[idx].upper(),vertex_type.upper()))
#         path = 'ModelGraphs/test/{}_{}'.format(vertex_type,smol)
#         graph = load_sgraph(path)
#         pr_w = pagerank_weighted(graph)
#         pr_w_sframe = pr_w['__id', 'pagerank']
#         # Modifying output SFrame
#         pr_w_df = pd.DataFrame(pr_w_sframe)
#         pr_w_df = pr_w_df[pr_w_df['__id'].isin(model_uuids_dict[vertex_type])].reset_index(drop=True)
#         pr_w_df = pr_w_df.rename({'pagerank_weight':'w_pr_{}'.format(idx)}, axis=1)
#         # Save to temp lst_of_frames
#         lst_of_frames.append(pr_w_df)
#     PR_W_DF = reduce(lambda df1,df2: pd.merge(df1,df2,on='__id'), lst_of_frames)
#     vertex_df_map[vertex_type] = pd.merge(vertex_df_map[vertex_type], PR_W_DF, on='__id', how='outer')

CaLcUlAtInG pAgeRaNk for graph CB0, in graph neighborhood CB_SMOL_ALL
CaLcUlAtInG pAgeRaNk for graph CB1, in graph neighborhood CB_SMOL_ALL
CaLcUlAtInG pAgeRaNk for graph CB2, in graph neighborhood CB_SMOL_ALL
CaLcUlAtInG pAgeRaNk for graph CB3, in graph neighborhood CB_SMOL_ALL
CaLcUlAtInG pAgeRaNk for graph CB0, in graph neighborhood CB_SMOL_D4
CaLcUlAtInG pAgeRaNk for graph CB1, in graph neighborhood CB_SMOL_D4
CaLcUlAtInG pAgeRaNk for graph CB2, in graph neighborhood CB_SMOL_D4
CaLcUlAtInG pAgeRaNk for graph CB3, in graph neighborhood CB_SMOL_D4
CaLcUlAtInG pAgeRaNk for graph CB0, in graph neighborhood CB_SMOL_D2
CaLcUlAtInG pAgeRaNk for graph CB1, in graph neighborhood CB_SMOL_D2
CaLcUlAtInG pAgeRaNk for graph CB2, in graph neighborhood CB_SMOL_D2
CaLcUlAtInG pAgeRaNk for graph CB3, in graph neighborhood CB_SMOL_D2
CaLcUlAtInG kCoRe for graph CB0, in graph neighborhood CB_SMOL_ALL
CaLcUlAtInG kCoRe for graph CB1, in graph neighborhood CB_SMOL_ALL
CaLcUlAtInG kCoRe for graph CB2, i