# 8.2 Appendix (PySpark P2 Times)

Most of the code here is adjusted from Section 5 and so not many comments are made. This is essentially to run the same code without the issues of slowing down. 

**NOTE: THE WAY THIS APPENDIX IS STRUCTURED, YOU'D NEED TO RUN EVERYTHING FOR N2R2 AND SAVE YOUR RESULTS THEN RESTART THE KERNEL AND RUN FOR N3R2 ETC. DEPENDING ON DEVICE THIS MAY OR MAY NOT WORK AND MAY LEAD TO `WINERROR10054` IF RUNTIME IS TOO LONG. THE PURPOSE OF INCLUDING THIS APPENDIX IS MOSTLY FOR COMPLETENESS**

In [1]:
from graphframes import *
from pyspark import *
from pyspark.sql import *
from pyspark.sql.types import IntegerType

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
#initialise sparkContext and set checkpoint 
sc = spark.sparkContext
sc.setCheckpointDir('times1_cps')

In [3]:
#number of nodes that correspond to our ER graphs
nodes = [10,32, 100, 316, 1000, 3162, 10000]

## Loading in Graph Connection CSVs

In [4]:
i=2
# loop over nodes
for n in nodes:
    #set path to grab correct edges csv e.g. 'G_ 100 _p1'
    path = '../Data/G_ ' + str(n) + ' _p' + str(i)
    edges = spark.read.csv(path,header=True)
    #rename to match the 'src' and 'dst format that spark prefers, drop index col '_c0'
    edges = edges.withColumnRenamed('V1','src').withColumnRenamed('V2','dst').drop('_c0')
    #create a list of ids; these are the vertices for our erdos-renyi graphs
    ids = list(range(1,n+1))
    #assign vertices to ids
    vertices = spark.createDataFrame(ids, IntegerType()).withColumnRenamed('value','id')
    #create g's
    locals()['g{}_{}'.format(i,n)] = GraphFrame(v=vertices,e=edges)
    #retrieve edge count and assign it to dataframe
    
#delete our temp vars
del edges
del ids
del vertices

## DataFrame

### Connected

#### N2R2

In [5]:
#function for calculating num of communities
#this is in a separate cell so that the magic %%time function in the next cell works correctly
def countComps(graph):
    return graph.connectedComponents().select('component').distinct().count()

In [6]:
def timeResults(graph):
    result = %timeit -n2 -r2 -o countComps(graph)
    return result

In [7]:
np.random.seed(42) 

for n in nodes:
    graph = locals()['g{}_{}'.format(i,n)]
    locals()['r{}_{}'.format(i,n)] = timeResults(graph)

13.1 s ± 2.36 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
14.5 s ± 1.12 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
20.1 s ± 1.39 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
23.1 s ± 2.73 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
26.7 s ± 3 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
33.1 s ± 7.06 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
1min ± 16.6 s per loop (mean ± std. dev. of 2 runs, 2 loops each)


In [8]:
times_lst = []

for n in nodes:
    times_lst.append((i,n,str(locals()['r{}_{}'.format(i,n)])))

In [24]:
def getMean(ts):
    splt =  ts.split('±')[0].split(' ')[0:2]
    #if time in seconds, we dont need to do anything 
    if splt[1] == 's':
        time = splt[0]
        return time 
    else: #if time is 1min 10s for example
        mins = splt[0][0] #take the number before 'min'
        #if we have exactly X mins and 0 seconds, grabbing secs will give us ''
        try: #try this
            secs = float(splt[1][:-1]) #take everything except the 's'
        except: #if it fails because the above is '', then return 0
            secs = 0
        time = int(mins)*60 + int(secs) #convert to seconds
        return time

In [19]:
ts = times_lst[0][2]
ts

'13.1 s ± 2.36 s per loop (mean ± std. dev. of 2 runs, 2 loops each)'

In [20]:
ts.split('±')[1].split(' ')[1:3]

['2.36', 's']

In [21]:
times_lst

[(2,
  10,
  '13.1 s ± 2.36 s per loop (mean ± std. dev. of 2 runs, 2 loops each)'),
 (2,
  32,
  '14.5 s ± 1.12 s per loop (mean ± std. dev. of 2 runs, 2 loops each)'),
 (2,
  100,
  '20.1 s ± 1.39 s per loop (mean ± std. dev. of 2 runs, 2 loops each)'),
 (2,
  316,
  '23.1 s ± 2.73 s per loop (mean ± std. dev. of 2 runs, 2 loops each)'),
 (2, 1000, '26.7 s ± 3 s per loop (mean ± std. dev. of 2 runs, 2 loops each)'),
 (2,
  3162,
  '33.1 s ± 7.06 s per loop (mean ± std. dev. of 2 runs, 2 loops each)'),
 (2,
  10000,
  '1min ± 16.6 s per loop (mean ± std. dev. of 2 runs, 2 loops each)')]

In [22]:
def getSD(ts):
    splt = ts.split('±')[1].split(' ')[1:3]
    if splt[1] == 'ms':
        time = float(splt[0])/1000
        return time
    else:
        time = splt[0]
        return time

In [23]:
times_dict = {}
times_dict['p'] = [i for i,n,ts in times_lst]
times_dict['nodes'] = [n for i,n,ts in times_lst]
times_dict['mean'] = [getMean(ts) for i,n,ts in times_lst]
times_dict['sd'] = [getSD(ts) for i,n,ts in times_lst]

In [25]:
times_dict

{'p': [2, 2, 2, 2, 2, 2, 2],
 'nodes': [10, 32, 100, 316, 1000, 3162, 10000],
 'mean': ['13.1', '14.5', '20.1', '23.1', '26.7', '33.1', 60],
 'sd': ['2.36', '1.12', '1.39', '2.73', '3', '7.06', '16.6']}

In [26]:
times_df = pd.DataFrame(data=times_dict)
times_df.sort_values(by=['p','nodes'])

Unnamed: 0,p,nodes,mean,sd
0,2,10,13.1,2.36
1,2,32,14.5,1.12
2,2,100,20.1,1.39
3,2,316,23.1,2.73
4,2,1000,26.7,3.0
5,2,3162,33.1,7.06
6,2,10000,60.0,16.6


In [27]:
times_df.to_csv('../Data/p2_times_1.csv')

#### N3 R2

In [5]:
#function for calculating num of communities
#this is in a separate cell so that the magic %%time function in the next cell works correctly
def countComps(graph):
    return graph.connectedComponents().select('component').distinct().count()

In [6]:
def timeResults(graph):
    result = %timeit -n3 -r2 -o countComps(graph)
    return result

In [7]:
np.random.seed(42) 

for n in nodes:
    graph = locals()['g{}_{}'.format(i,n)]
    locals()['r{}_{}'.format(i,n)] = timeResults(graph)

15.5 s ± 3.15 s per loop (mean ± std. dev. of 2 runs, 3 loops each)
14.7 s ± 1.26 s per loop (mean ± std. dev. of 2 runs, 3 loops each)
20.1 s ± 683 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)
23.6 s ± 1.62 s per loop (mean ± std. dev. of 2 runs, 3 loops each)
26.4 s ± 2.87 s per loop (mean ± std. dev. of 2 runs, 3 loops each)
31.4 s ± 4.97 s per loop (mean ± std. dev. of 2 runs, 3 loops each)
1min 10s ± 764 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)


In [17]:
times_lst = []

for n in nodes:
    times_lst.append((i,n,str(locals()['r{}_{}'.format(i,n)])))

In [18]:
def getMean(ts):
    splt =  ts.split('±')[0].split(' ')[0:2]
    #if time in seconds, we dont need to do anything 
    if splt[1] == 's':
        time = splt[0]
        return time 
    else: #if time is 1min 10s for example
        mins = splt[0][0] #take the number before 'min'
        secs = splt[1][:-1] #take everything except the 's'
        time = int(mins)*60 + int(secs) #convert to seconds
        return time

In [19]:
ts = times_lst[0][2]
ts

'15.5 s ± 3.15 s per loop (mean ± std. dev. of 2 runs, 3 loops each)'

In [20]:
ts.split('±')[1].split(' ')[1:3]

['3.15', 's']

In [21]:
def getSD(ts):
    splt = ts.split('±')[1].split(' ')[1:3]
    if splt[1] == 'ms':
        time = int(splt[0])/1000
        return time
    else:
        time = splt[0]
        return time

In [22]:
times_dict = {}
times_dict['p'] = [i for i,n,ts in times_lst]
times_dict['nodes'] = [n for i,n,ts in times_lst]
times_dict['mean'] = [getMean(ts) for i,n,ts in times_lst]
times_dict['sd'] = [getSD(ts) for i,n,ts in times_lst]

In [23]:
times_dict

{'p': [2, 2, 2, 2, 2, 2, 2],
 'nodes': [10, 32, 100, 316, 1000, 3162, 10000],
 'mean': ['15.5', '14.7', '20.1', '23.6', '26.4', '31.4', 70],
 'sd': ['3.15', '1.26', 0.683, '1.62', '2.87', '4.97', 0.764]}

In [24]:
times_df = pd.DataFrame(data=times_dict)
times_df.sort_values(by=['p','nodes'])

Unnamed: 0,p,nodes,mean,sd
0,2,10,15.5,3.15
1,2,32,14.7,1.26
2,2,100,20.1,0.683
3,2,316,23.6,1.62
4,2,1000,26.4,2.87
5,2,3162,31.4,4.97
6,2,10000,70.0,0.764


In [25]:
times_df.to_csv('../Data/p2_times_2.csv')

### PageRank

In [6]:
#function for calculating num of communities
#this is in a separate cell so that the magic %%time function in the next cell works correctly
def pRank(graph):
    return graph.pageRank(resetProbability=0.15, maxIter=10)

In [7]:
def timePR(graph):
    result = %timeit -n10 -r10 -o pRank(graph)
    return result

In [8]:
np.random.seed(42) 

for n in nodes:
    graph = locals()['g{}_{}'.format(i,n)]
    locals()['r{}_{}'.format(i,n)] = timePR(graph)

1.26 s ± 314 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)
1.27 s ± 194 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)
1.22 s ± 292 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)
1.23 s ± 198 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)
1.31 s ± 237 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)
1.26 s ± 309 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)
1.34 s ± 244 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)


In [9]:
times_lst = []

for n in nodes:
    times_lst.append((i,n,str(locals()['r{}_{}'.format(i,n)])))

In [10]:
def getMean(ts):
    splt =  ts.split('±')[0].split(' ')[0:2]
    #if time in seconds, we dont need to do anything 
    if splt[1] == 's':
        time = splt[0]
        return time 
    else: #if time is 1min 10s for example
        mins = splt[0][0] #take the number before 'min'
        secs = splt[1][:-1] #take everything except the 's'
        time = int(mins)*60 + int(secs) #convert to seconds
        return time

In [11]:
ts = times_lst[0][2]
ts

'1.26 s ± 314 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)'

In [12]:
ts.split('±')[1].split(' ')[1:3]

['314', 'ms']

In [13]:
def getSD(ts):
    splt = ts.split('±')[1].split(' ')[1:3]
    if splt[1] == 'ms':
        time = int(splt[0])/1000
        return time
    else:
        time = splt[0]
        return time

In [14]:
times_dict = {}
times_dict['p'] = [i for i,n,ts in times_lst]
times_dict['nodes'] = [n for i,n,ts in times_lst]
times_dict['mean'] = [getMean(ts) for i,n,ts in times_lst]
times_dict['sd'] = [getSD(ts) for i,n,ts in times_lst]

In [15]:
times_dict

{'p': [2, 2, 2, 2, 2, 2, 2],
 'nodes': [10, 32, 100, 316, 1000, 3162, 10000],
 'mean': ['1.26', '1.27', '1.22', '1.23', '1.31', '1.26', '1.34'],
 'sd': [0.314, 0.194, 0.292, 0.198, 0.237, 0.309, 0.244]}

In [19]:
times_df = pd.DataFrame(data=times_dict)
times_df.sort_values(by=['p','nodes'])

Unnamed: 0,p,nodes,mean,sd
0,2,10,1.26,0.314
1,2,32,1.27,0.194
2,2,100,1.22,0.292
3,2,316,1.23,0.198
4,2,1000,1.31,0.237
5,2,3162,1.26,0.309
6,2,10000,1.34,0.244


In [20]:
times_df.to_csv('../Data/p2_times_pr.csv')