# 8.1 Appendix (PySpark P1 Times)

Most of the code here is adjusted from Section 5 and so not many comments are made. This is essentially to run the same code without the issues of slowing down. 

**NOTE: THE WAY THIS APPENDIX IS STRUCTURED, YOU'D NEED TO RUN EVERYTHING FOR N2R2 AND SAVE YOUR RESULTS THEN RESTART THE KERNEL AND RUN FOR N3R2 ETC. DEPENDING ON DEVICE THIS MAY OR MAY NOT WORK AND MAY LEAD TO `WINERROR10054` IF RUNTIME IS TOO LONG. THE PURPOSE OF INCLUDING THIS APPENDIX IS MOSTLY FOR COMPLETENESS**

In [1]:
from graphframes import *
from pyspark import *
from pyspark.sql import *
from pyspark.sql.types import IntegerType

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
#initialise sparkContext and set checkpoint 
sc = spark.sparkContext
sc.setCheckpointDir('times1_cps')

In [3]:
#number of nodes that correspond to our ER graphs
nodes = [10,32, 100, 316, 1000, 3162, 10000]

## Loading in Graph Connection CSVs

In [4]:
i=1
# loop over nodes
for n in nodes:
    #set path to grab correct edges csv e.g. 'G_ 100 _p1'
    path = '../Data/G_ ' + str(n) + ' _p' + str(i)
    edges = spark.read.csv(path,header=True)
    #rename to match the 'src' and 'dst format that spark prefers, drop index col '_c0'
    edges = edges.withColumnRenamed('V1','src').withColumnRenamed('V2','dst').drop('_c0')
    #create a list of ids; these are the vertices for our erdos-renyi graphs
    ids = list(range(1,n+1))
    #assign vertices to ids
    vertices = spark.createDataFrame(ids, IntegerType()).withColumnRenamed('value','id')
    #create g's
    locals()['g{}_{}'.format(i,n)] = GraphFrame(v=vertices,e=edges)
    #retrieve edge count and assign it to dataframe
    
#delete our temp vars
del edges
del ids
del vertices

## DataFrame

### N2R2

In [5]:
#function for calculating num of communities
#this is in a separate cell so that the magic %%time function in the next cell works correctly
def countComps(graph):
    return graph.connectedComponents().select('component').distinct().count()

In [6]:
def timeResults(graph):
    result = %timeit -n2 -r2 -o countComps(graph)
    return result

In [7]:
np.random.seed(42) 

for n in nodes:
    graph = locals()['g{}_{}'.format(i,n)]
    locals()['r{}_{}'.format(i,n)] = timeResults(graph)

13.2 s ± 2.53 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
18.3 s ± 1.13 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
21.1 s ± 1.18 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
24.2 s ± 2.44 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
25.9 s ± 2.86 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
38.4 s ± 6.06 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
58.4 s ± 15.4 s per loop (mean ± std. dev. of 2 runs, 2 loops each)


In [8]:
times_lst = []

for n in nodes:
    times_lst.append((i,n,str(locals()['r{}_{}'.format(i,n)])))

In [9]:
def getMean(ts):
    splt =  ts.split('±')[0].split(' ')[0:2]
    #if time in seconds, we dont need to do anything 
    if splt[1] == 's':
        time = splt[0]
        return time 
    else: #if time is 1min 10s for example
        mins = splt[0][0] #take the number before 'min'
        secs = splt[1][:-1] #take everything except the 's'
        time = int(mins)*60 + int(secs) #convert to seconds
        return time

In [10]:
ts = times_lst[0][2]
ts

'13.2 s ± 2.53 s per loop (mean ± std. dev. of 2 runs, 2 loops each)'

In [11]:
ts.split('±')[1].split(' ')[1:3]

['2.53', 's']

In [12]:
def getSD(ts):
    splt = ts.split('±')[1].split(' ')[1:3]
    if splt[1] == 'ms':
        time = int(splt[0])/1000
        return time
    else:
        time = splt[0]
        return time

In [13]:
times_dict = {}
times_dict['p'] = [i for i,n,ts in times_lst]
times_dict['nodes'] = [n for i,n,ts in times_lst]
times_dict['mean'] = [getMean(ts) for i,n,ts in times_lst]
times_dict['sd'] = [getSD(ts) for i,n,ts in times_lst]

In [14]:
times_dict

{'p': [1, 1, 1, 1, 1, 1, 1],
 'nodes': [10, 32, 100, 316, 1000, 3162, 10000],
 'mean': ['13.2', '18.3', '21.1', '24.2', '25.9', '38.4', '58.4'],
 'sd': ['2.53', '1.13', '1.18', '2.44', '2.86', '6.06', '15.4']}

In [17]:
times_df = pd.DataFrame(data=times_dict)
times_df.sort_values(by=['p','nodes'])

Unnamed: 0,p,nodes,mean,sd
0,1,10,13.2,2.53
1,1,32,18.3,1.13
2,1,100,21.1,1.18
3,1,316,24.2,2.44
4,1,1000,25.9,2.86
5,1,3162,38.4,6.06
6,1,10000,58.4,15.4


In [16]:
times_df.to_csv('../Data/p1_times_1.csv')

### N3 R2

In [5]:
#function for calculating num of communities
#this is in a separate cell so that the magic %%time function in the next cell works correctly
def countComps(graph):
    return graph.connectedComponents().select('component').distinct().count()

In [6]:
def timeResults(graph):
    result = %timeit -n3 -r2 -o countComps(graph)
    return result

In [7]:
np.random.seed(42) 

for n in nodes:
    graph = locals()['g{}_{}'.format(i,n)]
    locals()['r{}_{}'.format(i,n)] = timeResults(graph)

12.5 s ± 1.94 s per loop (mean ± std. dev. of 2 runs, 3 loops each)
18.1 s ± 780 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)
19.5 s ± 1.29 s per loop (mean ± std. dev. of 2 runs, 3 loops each)
21.7 s ± 1.2 s per loop (mean ± std. dev. of 2 runs, 3 loops each)
24.2 s ± 2.11 s per loop (mean ± std. dev. of 2 runs, 3 loops each)
36.3 s ± 4.09 s per loop (mean ± std. dev. of 2 runs, 3 loops each)
1min 34s ± 5.93 s per loop (mean ± std. dev. of 2 runs, 3 loops each)


In [8]:
times_lst = []

for n in nodes:
    times_lst.append((i,n,str(locals()['r{}_{}'.format(i,n)])))

In [9]:
def getMean(ts):
    splt =  ts.split('±')[0].split(' ')[0:2]
    #if time in seconds, we dont need to do anything 
    if splt[1] == 's':
        time = splt[0]
        return time 
    else: #if time is 1min 10s for example
        mins = splt[0][0] #take the number before 'min'
        secs = splt[1][:-1] #take everything except the 's'
        time = int(mins)*60 + int(secs) #convert to seconds
        return time

In [10]:
ts = times_lst[0][2]
ts

'12.5 s ± 1.94 s per loop (mean ± std. dev. of 2 runs, 3 loops each)'

In [11]:
ts.split('±')[1].split(' ')[1:3]

['1.94', 's']

In [12]:
def getSD(ts):
    splt = ts.split('±')[1].split(' ')[1:3]
    if splt[1] == 'ms':
        time = int(splt[0])/1000
        return time
    else:
        time = splt[0]
        return time

In [13]:
times_dict = {}
times_dict['p'] = [i for i,n,ts in times_lst]
times_dict['nodes'] = [n for i,n,ts in times_lst]
times_dict['mean'] = [getMean(ts) for i,n,ts in times_lst]
times_dict['sd'] = [getSD(ts) for i,n,ts in times_lst]

In [14]:
times_dict

{'p': [1, 1, 1, 1, 1, 1, 1],
 'nodes': [10, 32, 100, 316, 1000, 3162, 10000],
 'mean': ['12.5', '18.1', '19.5', '21.7', '24.2', '36.3', 94],
 'sd': ['1.94', 0.78, '1.29', '1.2', '2.11', '4.09', '5.93']}

In [15]:
times_df = pd.DataFrame(data=times_dict)
times_df.sort_values(by=['p','nodes'])

Unnamed: 0,p,nodes,mean,sd
0,1,10,12.5,1.94
1,1,32,18.1,0.78
2,1,100,19.5,1.29
3,1,316,21.7,1.2
4,1,1000,24.2,2.11
5,1,3162,36.3,4.09
6,1,10000,94.0,5.93


In [16]:
times_df.to_csv('../Data/p1_times_2.csv')

## PageRank

In [17]:
#function for calculating num of communities
#this is in a separate cell so that the magic %%time function in the next cell works correctly
def pRank(graph):
    return graph.pageRank(resetProbability=0.15, maxIter=10)

In [8]:
def timePR(graph):
    result = %timeit -n10 -r10 -o pRank(graph)
    return result

In [9]:
np.random.seed(42) 

for n in nodes:
    graph = locals()['g{}_{}'.format(i,n)]
    locals()['r{}_{}'.format(i,n)] = timePR(graph)

1.14 s ± 269 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)
1.17 s ± 225 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)
1.24 s ± 235 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)
1.32 s ± 192 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)
1.4 s ± 239 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)
1.34 s ± 355 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)
1.43 s ± 321 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)


In [11]:
times_lst = []

for n in nodes:
    times_lst.append((i,n,str(locals()['r{}_{}'.format(i,n)])))

In [12]:
def getMean(ts):
    splt =  ts.split('±')[0].split(' ')[0:2]
    #if time in seconds, we dont need to do anything 
    if splt[1] == 's':
        time = splt[0]
        return time 
    elif splt[1] == 'ms':
        time = int(splt[0])/1000
    else: #if time is 1min 10s for example
        mins = splt[0][0] #take the number before 'min'
        secs = splt[1][:-1] #take everything except the 's'
        time = int(mins)*60 + int(secs) #convert to seconds
        return time

In [13]:
ts = times_lst[0][2]
ts

'1.14 s ± 269 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)'

In [14]:
ts.split('±')[1].split(' ')[1:3]

['269', 'ms']

In [15]:
def getSD(ts):
    splt = ts.split('±')[1].split(' ')[1:3]
    if splt[1] == 'ms':
        time = float(splt[0])/1000
        return time
    else:
        time = splt[0]
        return time

In [16]:
times_dict = {}
times_dict['p'] = [i for i,n,ts in times_lst]
times_dict['nodes'] = [n for i,n,ts in times_lst]
times_dict['mean'] = [getMean(ts) for i,n,ts in times_lst]
times_dict['sd'] = [getSD(ts) for i,n,ts in times_lst]

In [17]:
times_dict

{'p': [1, 1, 1, 1, 1, 1, 1],
 'nodes': [10, 32, 100, 316, 1000, 3162, 10000],
 'mean': ['1.14', '1.17', '1.24', '1.32', '1.4', '1.34', '1.43'],
 'sd': [0.269, 0.225, 0.235, 0.192, 0.239, 0.355, 0.321]}

In [18]:
times_df = pd.DataFrame(data=times_dict)
times_df.sort_values(by=['p','nodes'])

Unnamed: 0,p,nodes,mean,sd
0,1,10,1.14,0.269
1,1,32,1.17,0.225
2,1,100,1.24,0.235
3,1,316,1.32,0.192
4,1,1000,1.4,0.239
5,1,3162,1.34,0.355
6,1,10000,1.43,0.321


In [19]:
times_df.to_csv('../Data/p1_times_pr.csv')