-
Notifications
You must be signed in to change notification settings - Fork 0
/
gpu-sssp.py
executable file
·246 lines (223 loc) · 9.53 KB
/
gpu-sssp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
#!/usr/bin/env python2.6
# coding=UTF8
#
# Adaptation of single source shortest path algorithm presented in:
# P. Harish and P. J. Narayanan, "Accelerating large graph algorithms on the GPU using CUDA",
# in IEEE International Conference on High Performance Computing (HiPC), December 2007.
#
# CUDA implementation of SSSP: The SSSP problem does not traverse the graph in
# levels. The cost of a visited vertex may change due to a low cost path being discovered
# later. The termination is based on the change in cost.
# In our implementation, we use a vertex array Va an edge array Ea , boolean mask
# Ma of size |V |, and a weight array Wa of size |E|. In each iteration each vertex checks
# if it is in the mask Ma . If yes, it fetches its current cost from the cost array Ca and its
# neighbor’s weights from the weight array Wa. The cost of each neighbor is updated if
# greater than the cost of current vertex plus the edge weight to that neighbor. The new
# cost is not reflected in the cost array but is updated in an alternate array Ua . At the end
# of the execution of the kernel, a second kernel compares cost Ca with updating cost
# Ua . It updates the cost Ca only if it is more than Ua and makes its own entry in the
# mask Ma . The updating cost array reflects the cost array after each kernel execution for
# consistency.
# The second stage of kernel execution is required as there is no synchronization
# between the CUDA multiprocessors. Updating the cost at the time of modification itself
# can result in read after write inconsistencies. The second stage kernel also toggles a flag
# if any mask is set. If this flag is not set the execution stops.
# Newer version of CUDA hardware (ver 1.1) supports atomic read/write operations
# in the global memory which can help resolve inconsistencies. 8800 GTX is CUDA
# version 1.0 GPU and does not support such operations. Timings for SSSP CUDA im-
# plementations are given in Figure 4.
#
# Algorithm 3 CUDA SSSP (Graph G(V, E, W), Source Vertex S)
# 1: Create vertex array Va from all vertices in G(V, E , W ),
# 2: Create edge array Ea from all edges in G(V, E , W )
# 3: Create weight array Wa from all weights in G(V, E , W )
# 4: Create mask array Ma , cost array Ca and Updating cost array Ua of size V
# 5: Initialize mask Ma to false, cost array Ca and Updating cost array Ua to ∞
# 6: Ma [S] ← true
# 7: Ca [S] ← 0
# 8: Ua [S] ← 0
# 9: while Ma not Empty do
# 10: for each vertex V in parallel do
# 11: Invoke CUDA SSSP KERNEL1(Va , Ea , Wa , Ma , Ca , Ua ) on the grid
# 12: Invoke CUDA SSSP KERNEL2(Va , Ea , Wa , Ma , Ca , Ua ) on the grid
# 13: end for
# 14: end while
#
# Algorithm 4 CUDA SSSP KERNEL1 (Va , Ea , Wa , Ma , Ca , Ua )
# 1: t id ← getThreadID
# 2: if Ma [t id ] then
# 3: Ma [t id ] ← false
# 4: for all neighbors nid of t id do
# 5: if Ua [nid ]> Ca [t id ]+Wa [nid ] then
# 6: Ua [nid ] ← Ca [t id ]+Wa [nid ]
# 7: end if
# 8: end for
# 9: end if
#
# Algorithm 5 CUDA SSSP KERNEL2 (Va , Ea , Wa , Ma , Ca , Ua )
# 1: t id ← getThreadID
# 2: if Ca [t id ] > Ua [t id ] then
# 3: Ca [t id ] ← Ua [t id ]
# 4: Ma [t id ] ← true
# 5: end if
# 6: Ua [t id ] ← Ca [t id ]
#
# ...
#
# Another alternative to find all pair shortest paths is to run SSSP algorithm from every
# vertex in graph G(V, E, W) (Algorithm 7). This will require only the final output
# size to be of O(V^2), all intermediate calculations do not require this space. The final
# output could be stored in the CPU memory. Each iteration of SSSP will output a vector
# of size O(V), which can be copied back to the CPU memory. This approach does not
# require the graph to be represented as an adjacency matrix, hence the representation
# given in section 3.1 can be used , which makes this algorithm suitable for large graphs.
# We implemented this approach and the results are given in Figure 6. This runs faster
# than the parallel Floyd Warshall algorithm because it is a single O(V) operation looping
# over O(V) threads. In contrast, the Floyd Warshall algorithm requires a single O(V) op-
# eration looping over O(V^2) threads which creates extra overhead for context switching
# the threads on the SIMD processors. Thus, due to the overhead for context switching of
# threads, the Floyd Warshall algorithm exhibits a slow down.
#
# Algorithm 7 APSP USING SSSP(G(V, E, W))
# 1: Create vertex array Va, edge array Ea, weight array Wa from G(V, E, W)
# 2: Create mask array Ma, cost array Ca and updating cost array Ua of size V
# 3: for S from 1 to V do
# 4: Ma [S] ← true
# 5: Ca [S] ← 0
# 6: while Ma not Empty do
# 7: for each vertex V in parallel do
# 8: Invoke CUDA SSSP KERNEL1(Va , Ea , Wa , Ma , Ca , Ua ) on the grid
# 9: Invoke CUDA SSSP KERNEL2(Va , Ea , Wa , Ma , Ca , Ua ) on the grid
# 10: end for
# 11: end while
# 12: end for
import time, math, pyproj, sys, cPickle
import numpy as np
from graphserver.graphdb import GraphDatabase
from graphserver.ext.gtfs.gtfsdb import GTFSDatabase
import pycuda.driver as cuda
import pycuda.autoinit as autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
# Best to put a multiple of 32 threads (1 warp) in a block. Multiples of 64 schedule better.
CUDA_BLOCK_SHAPE = (64, 1, 1)
#GSDB_FILENAME = '../gsdata/bart.linked.gsdb'
GSDB_FILENAME = '../gsdata/trimet_13sep2009.linked.gsdb'
def from_gsdb(gsdb) :
nV = gsdb.num_vertices()
nE = gsdb.num_edges()
V = np.empty( nV, dtype=np.int32 )
E = np.empty( nE, dtype=np.int32 )
W = np.empty( nE, dtype=np.float32 )
print 'Indexing edges...'
gsdb.execute("CREATE INDEX IF NOT EXISTS edges_vertex1 ON edges (vertex1)")
print 'Building vertex hash...'
v_idx = dict( zip(gsdb.all_vertex_labels(), range(nV)) )
v_tbl = [[] for e in range(nV)]
print 'Fetching incoming edges... '
# original was designed for undirected graphs.
# also : alignment, textures, broadcasting/packing, calendars, and paths should be worked on.
# Atomic min to global memory instead of 2 kernels. This should improve cache use. Or even load into registers.
# seems to go very slow when ordering by vertex2 not vertex1
edges = gsdb.all_edges()
iE = 0
for vo_label, vd_label, edge in edges :
print repr(edge)
v_tbl[v_idx[vo_label]].append(v_idx[vd_label])
iE += 1
if iE % 10000 == 0 :
sys.stdout.write('\rProcessing edge %i / %i (%02i%%)' % (iE, nE, iE * 100. / nE))
sys.stdout.flush()
print 'Making ndarray edgelist representation... '
iE = 0
for iV in range(nV) :
V[iV] = iE
for e in v_tbl[iV] :
E[iE] = e
W[iE] = 1 # must be filled in...
iE += 1
print V
print E
print W
print '\nSaving in numpy gzipped format...'
np.savez( 'edgelist.npz', V=V, E=E, W=W)
return (V, E, W)
src = """
// should use texture references for read-only parameters, allows caching
// could also arrange vertices for spatial locality
#define INF 0x7f800000
texture<int, 1, cudaReadModeElementType> tV;
texture<int, 1, cudaReadModeElementType> tE;
texture<int, 1, cudaReadModeElementType> tW;
__global__ void init ( int *M, float *C, float *U, int S )
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid == S) {
M[tid] = 1;
C[tid] = 0;
U[tid] = 0;
} else {
M[tid] = 0;
C[tid] = INF;
U[tid] = INF;
}
}
__global__ void SSSP1 ( int *V, int *E, float *W, int *M, float *C, float *U )
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int e_off = V[tid];
int e_len = V[tid + 1] - e_off;
if (M[tid]) {
M[tid] = 0;
for (int i = 0; i < e_len; i++) {
int nid = E[e_off + i];
if (U[nid] > C[tid] + W[e_off + i]) {
U[nid] = C[tid] + W[e_off + i];
}
}
}
}
__global__ void SSSP2 ( int *V, int *E, float *W, int *M, float *C, float *U )
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (C[tid] > U[tid]) {
C[tid] = U[tid];
M[tid] = 1;
}
U[tid] = C[tid];
}
"""
gsdb = GraphDatabase(GSDB_FILENAME)
V, E, W = from_gsdb(gsdb)
#npz = np.load( 'edgelist.npz' )
#V = npz['V']
#E = npz['E']
#W = npz['W']
V = gpuarray.to_gpu(V)
E = gpuarray.to_gpu(E)
W = gpuarray.to_gpu(W)
M = gpuarray.zeros(V.shape, dtype=np.int32) # necessary to fill with 0 for now, since edges of blocks are not handled.
C = gpuarray.empty(V.shape, dtype=np.float32)
U = gpuarray.empty(V.shape, dtype=np.float32)
mod = SourceModule(src, options=["--ptxas-options=-v"])
init_kernel = mod.get_function("init")
SSSP_kernel_1 = mod.get_function("SSSP1")
SSSP_kernel_2 = mod.get_function("SSSP2")
cuda_grid_shape = (len(V) / CUDA_BLOCK_SHAPE[0], 1)
np.set_printoptions(threshold=np.nan)
t0 = time.time()
for c in range(100) :
print c
n_iter = 0
init_kernel (M, C, U, np.int32(15), block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape)
autoinit.context.synchronize()
#while np.any(M.get()) :
for i in range(40) :
#print n_iter
SSSP_kernel_1 (V, E, W, M, C, U, block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape)
autoinit.context.synchronize()
SSSP_kernel_2 (V, E, W, M, C, U, block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape)
autoinit.context.synchronize()
#print C
#n_iter += 1
print (time.time() - t0) / 100.