Skip to content

Commit

Permalink
Merge pull request #25 from nibeditabh/analysis-hemesh
Browse files Browse the repository at this point in the history
Analysis code merged for static cuda and openACC
  • Loading branch information
Ashwina Kumar committed Apr 23, 2023
2 parents ba2adc2 + d63567a commit 8c97c29
Show file tree
Hide file tree
Showing 70 changed files with 4,325 additions and 332 deletions.
10 changes: 9 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,12 @@ src/bin/
../.history/*
StarPlat
StarPlat.exe

debug.txt
dump
*.pdf
To_be_done.txt
.github/
dump/
*.starplat
runtests.sh
cudadump/
51 changes: 8 additions & 43 deletions Changes_list.txt
Original file line number Diff line number Diff line change
@@ -1,46 +1,11 @@
lexer.l : return, Incremental, Decremental, Static, updates.
Lrparser.y: 1. function nonterminal rules updated.
2. IfStmt nonterminal rules updated.
3. rightList nonterminal rules updated.
4. forall nonterminal rules updated.
5. ‘updates’ new datatype added.
6. staticGen/dynamicGen commandline option added.
ASTHelper.cpp: 1. Functype count update.
2. func variant node creation.
3. return statement node creation.
ASTNodeTypes.hpp: 1. New field added to Identifier class.
2. function class is updated. ( field to decide on lock initialization in initial parts
of the function).
3. return statement’s class definition is added.
lrparser.y : LINE 574 uncomment the line to activate the -o (optimize) option

MainContext.cpp/MainContext.hpp: 1. currentFunc’s info addition is achieved through fields and methods.
For fixing the segmentation fault in PageRank_DSL_V2 :-
SymbolTableBuilder.cpp : LINE 375 added a check to see if the forall statement is a for loop or a forall loop
deviceVarsAnalyser.h : getTempVar() LINE 284 removed static cast and used strcpy since it previously generated random characters

SymbolTableBuilder.cpp: 1. InitialLockDecl for omp.
2. push itrBFS to parallelConstruct as well.

SymbolTableBuilder.h: 1. Addition of currentFunc field.
2. change of ‘parallelConstruct’ stack to a vector data structure.

dsl_cpp_generator.cpp: 1. Added various function_counts’ field.
2. info regarding the current function type.
3. generateWhileStmt logic added.
4. return statement generation logic added.
5. generateReductionCallStmt logic modified.
6. generatePropertyDefination graphid extraction logic slightly modified.


MakeFile: 1. Compilation commands changed.

Generation places where graphId vector’s content was used:

1. ‘generateAssignment’ call.
2. ‘generatePropertyDefination’ call.
3. ‘generatefixedpt_filter’ call.
4.






added fp_idnode to identifier
analyser/deviceVars/getUsedVars.cpp: LINE 230 fixed fp for sssp
src/symbolutil/SymbolTableBuilder.cpp: LINE 278 added fp node

formal parameters generation:
119 changes: 119 additions & 0 deletions graphcode/generated_cuda/PageRankDSLV2.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
// FOR BC: nvcc bc_dsl_v2.cu -arch=sm_60 -std=c++14 -rdc=true # HW must support CC 6.0+ Pascal or after
#include "PageRankDSLV2.h"

void Compute_PR(graph& g,float beta,float delta,int maxIter,
  float* pageRank)

{
  // Runs PageRank on g until the aggregate rank change of an iteration is
  // no longer greater than `beta`, or `maxIter` iterations have executed.
  // `delta` is the damping factor. Results are written to `pageRank`
  // (caller-allocated, one float per node).

  // ---- Build CSR on the host ----------------------------------------
  int V = g.num_nodes();
  int E = g.num_edges();

  printf("#nodes:%d\n",V);
  printf("#edges:%d\n",E);
  // NOTE(review): the generated `edgeLen = g.getEdgeLen()` was unused in
  // this function and has been dropped.

  int *h_meta     = (int *)malloc((V+1)*sizeof(int)); // forward CSR offsets
  int *h_src      = (int *)malloc((E)*sizeof(int));   // reverse CSR edge sources
  int *h_rev_meta = (int *)malloc((V+1)*sizeof(int)); // reverse CSR offsets

  for(int i=0; i<= V; i++) {
    h_meta[i]     = g.indexofNodes[i];
    h_rev_meta[i] = g.rev_indexofNodes[i];
  }

  for(int i=0; i< E; i++) {
    h_src[i] = g.srcList[i];
  }

  // ---- Copy CSR to the device ----------------------------------------
  int* d_meta;
  int* d_src;
  int* d_rev_meta;

  cudaMalloc(&d_meta,     sizeof(int)*(V+1));
  cudaMalloc(&d_src,      sizeof(int)*(E));
  cudaMalloc(&d_rev_meta, sizeof(int)*(V+1));

  cudaMemcpy(d_meta,     h_meta,     sizeof(int)*(V+1), cudaMemcpyHostToDevice);
  cudaMemcpy(d_src,      h_src,      sizeof(int)*(E),   cudaMemcpyHostToDevice);
  cudaMemcpy(d_rev_meta, h_rev_meta, sizeof(int)*(V+1), cudaMemcpyHostToDevice);

  // Host staging buffers are no longer needed once the device copies exist
  // (the original leaked all three).
  free(h_meta);
  free(h_src);
  free(h_rev_meta);

  // CSR END
  // ---- Launch configuration ------------------------------------------
  const unsigned threadsPerBlock = 512;
  const unsigned numBlocks = (V + threadsPerBlock - 1) / threadsPerBlock; // ceil-div
  // NOTE(review): the generated `numThreads = (V < threadsPerBlock)? 512: V`
  // was inverted (picked 512 for small V and V for large V) and never used;
  // both launches below use threadsPerBlock directly, so it was removed.

  // ---- Timer start -----------------------------------------------------
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  float milliseconds = 0;
  cudaEventRecord(start,0);

  // ---- Device and host vars for params ---------------------------------
  float* d_pageRank;      // current ranks
  cudaMalloc(&d_pageRank, sizeof(float)*(V));

  // ---- BEGIN DSL PARSING ------------------------------------------------
  float* d_pageRank_nxt;  // ranks produced by the current iteration
  cudaMalloc(&d_pageRank_nxt, sizeof(float)*(V));

  float numNodes = (float)g.num_nodes( ); // asst in .cu

  // Uniform initial rank 1/N for every node.
  initKernel<float> <<<numBlocks,threadsPerBlock>>>(V,d_pageRank,(float)1 / numNodes);

  int iterCount = 0; // asst in .cu

  float diff; // aggregate rank change of the latest iteration

  // Power iteration: each pass recomputes every node's rank from its
  // in-neighbours; the kernel accumulates the per-node change into ::diff.
  // (The generated dead `tempVar_0` if-filter scaffold did nothing and was
  // removed — it guarded an empty block.)
  do{
    diff = 0.0f;
    cudaMemcpyToSymbol(::diff,     &diff,     sizeof(float), 0, cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(::delta,    &delta,    sizeof(float), 0, cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(::numNodes, &numNodes, sizeof(float), 0, cudaMemcpyHostToDevice);
    Compute_PR_kernel<<<numBlocks, threadsPerBlock>>>(V,E,d_meta,d_src,d_rev_meta,d_pageRank,d_pageRank_nxt);
    cudaDeviceSynchronize();

    // The next iteration reads the ranks just produced.
    cudaMemcpy(d_pageRank, d_pageRank_nxt, sizeof(float)*V, cudaMemcpyDeviceToDevice);
    iterCount++;
    cudaMemcpyFromSymbol(&diff, ::diff, sizeof(float), 0, cudaMemcpyDeviceToHost);

  }while((diff > beta) && (iterCount < maxIter));

  // cudaFree up!! all propVars in this BLOCK!
  cudaFree(d_pageRank_nxt);

  // ---- Timer stop --------------------------------------------------------
  cudaEventRecord(stop,0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&milliseconds, start, stop);
  printf("GPU Time: %.6f ms\n", milliseconds);
  cudaEventDestroy(start); // the original leaked both events
  cudaEventDestroy(stop);

  cudaMemcpy(pageRank, d_pageRank, sizeof(float)*(V), cudaMemcpyDeviceToHost);

  // Release remaining device allocations (all leaked in the original).
  cudaFree(d_pageRank);
  cudaFree(d_meta);
  cudaFree(d_src);
  cudaFree(d_rev_meta);
} //end FUN
42 changes: 42 additions & 0 deletions graphcode/generated_cuda/PageRankDSLV2.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// FOR BC: nvcc bc_dsl_v2.cu -arch=sm_60 -std=c++14 -rdc=true # HW must support CC 6.0+ Pascal or after
#ifndef GENCPP_PAGERANKDSLV2_H
#define GENCPP_PAGERANKDSLV2_H
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <cuda.h>
#include "../graph.hpp"
#include "../libcuda.cuh"
#include <cooperative_groups.h>

void Compute_PR(graph& g,float beta,float delta,int maxIter,
float* pageRank);

__device__ float beta ;
__device__ float delta ;
__device__ int maxIter ;


__device__ float numNodes ; // DEVICE ASSTMENT in .h

__device__ float diff ; // DEVICE ASSTMENT in .h

__global__ void Compute_PR_kernel(int V, int E, int* d_meta, int* d_src, int *d_rev_meta,float* d_pageRank,float* d_pageRank_nxt){ // BEGIN KER FUN via ADDKERNEL
  // One thread per node v (1D launch, caller bounds-checks via the guard
  // below): recompute v's rank from its in-neighbours (reverse CSR via
  // d_rev_meta/d_src) and accumulate the rank change into the global
  // ::diff symbol. Reads ::delta and ::numNodes, set by the host before
  // each launch. The generated dead local `float num_nodes = V;` shadowed
  // nothing and was never read (the ::numNodes symbol is used instead);
  // it has been removed.
  unsigned v = blockIdx.x * blockDim.x + threadIdx.x;
  if(v >= V) return;

  float sum = 0.0f; // total contribution of all in-neighbours

  for (int edge = d_rev_meta[v]; edge < d_rev_meta[v+1]; edge++)
  {
    int nbr = d_src[edge];
    // Each in-neighbour contributes its rank divided by its out-degree
    // (d_meta[nbr+1]-d_meta[nbr]).
    // NOTE(review): a zero-out-degree in-neighbour would divide by zero —
    // presumably such a node cannot appear as an edge source; verify upstream.
    sum = sum + d_pageRank[nbr] / (d_meta[nbr+1] - d_meta[nbr]);
  } // end FOR NBR ITR. TMP FIX!

  float val = (1 - delta) / numNodes + delta * sum;

  // NOTE(review): this accumulates the SIGNED change (positive and negative
  // deltas can cancel), not |val - old|; confirm the DSL's convergence
  // criterion intends a signed sum before tightening it.
  atomicAdd(& diff, (float)val - d_pageRank[v]);
  d_pageRank_nxt[v] = val;
} // end KER FUNC

#endif
41 changes: 24 additions & 17 deletions graphcode/generated_cuda/PageRank_DSL_V2.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,19 @@ void Compute_PR(graph& g,float beta,float delta,int maxIter,
for(int i=0; i<= V; i++) {
int temp = g.indexofNodes[i];
h_meta[i] = temp;
temp = g.rev_indexofNodes[i];
h_rev_meta[i] = temp;
}

for(int i=0; i< E; i++) {
int temp = g.edgeList[i];
h_data[i] = temp;
temp = srcList[i];
temp = g.srcList[i];
h_src[i] = temp;
temp = edgeLen[i];
h_weight[i] = temp;
}

for(int i=0; i<= V; i++) {
int temp = g.rev_indexofNodes[i];
h_rev_meta[i] = temp;
}


int* d_meta;
int* d_data;
Expand All @@ -68,7 +65,7 @@ void Compute_PR(graph& g,float beta,float delta,int maxIter,
// CSR END
//LAUNCH CONFIG
const unsigned threadsPerBlock = 512;
unsigned numThreads = (V < threadsPerBlock)? V: 512;
unsigned numThreads = (V < threadsPerBlock)? 512: V;
unsigned numBlocks = (V+threadsPerBlock-1)/threadsPerBlock;


Expand All @@ -86,27 +83,30 @@ void Compute_PR(graph& g,float beta,float delta,int maxIter,


//BEGIN DSL PARSING
float num_nodes = (float)g.num_nodes( ); // asst in .cu

float* d_pageRank_nxt;
cudaMalloc(&d_pageRank_nxt, sizeof(float)*(V));

float num_nodes = (float)g.num_nodes( ); // asst in .cu

initKernel<float> <<<numBlocks,threadsPerBlock>>>(V,d_pageRank,(float)1 / num_nodes);

int iterCount = 0; // asst in .cu

float diff; // asst in .cu

do
{diff = 0.000000;
bool tempVar_0 = false; // asst in .cu

do{
if (tempVar_0){ // if filter begin

} // if filter end
tempVar_0 = true;
diff = 0.000000;
cudaMemcpyToSymbol(::diff, &diff, sizeof(float), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(::delta, &delta, sizeof(float), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(::num_nodes, &num_nodes, sizeof(float), 0, cudaMemcpyHostToDevice);
Compute_PR_kernel<<<numBlocks, numThreads>>>(V,E,d_meta,d_data,d_src,d_weight,d_rev_meta,d_modified_next,d_pageRank,d_pageRank_nxt);
cudaMemcpyToSymbol(::delta, &delta, sizeof(float), 0, cudaMemcpyHostToDevice);
Compute_PR_kernel<<<numBlocks, threadsPerBlock>>>(V,E,d_meta,d_data,d_src,d_weight,d_rev_meta,d_modified_next,d_pageRank,d_pageRank_nxt);
cudaDeviceSynchronize();
cudaMemcpyFromSymbol(&diff, ::diff, sizeof(float), 0, cudaMemcpyDeviceToHost);
cudaMemcpyFromSymbol(&delta, ::delta, sizeof(float), 0, cudaMemcpyDeviceToHost);
cudaMemcpyFromSymbol(&num_nodes, ::num_nodes, sizeof(float), 0, cudaMemcpyDeviceToHost);



Expand All @@ -116,7 +116,14 @@ void Compute_PR(graph& g,float beta,float delta,int maxIter,

cudaMemcpy(d_pageRank, d_pageRank_nxt, sizeof(float)*V, cudaMemcpyDeviceToDevice);
iterCount++;
}while((diff > beta) && (iterCount < maxIter));//TIMER STOP
cudaMemcpyFromSymbol(&diff, ::diff, sizeof(float), 0, cudaMemcpyDeviceToHost);

}while((diff > beta) && (iterCount < maxIter));

//cudaFree up!! all propVars in this BLOCK!
cudaFree(d_pageRank_nxt);

//TIMER STOP
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
Expand Down
8 changes: 6 additions & 2 deletions graphcode/generated_cuda/PageRank_DSL_V2.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,23 @@ __device__ int maxIter ;

__device__ float num_nodes ; // DEVICE ASSTMENT in .h

__device__ int iterCount ; // DEVICE ASSTMENT in .h
; // DEVICE ASSTMENT in .h

__device__ float diff ; // DEVICE ASSTMENT in .h

; // DEVICE ASSTMENT in .h

__global__ void Compute_PR_kernel(int V, int E, int* d_meta, int* d_data, int* d_src, int* d_weight, int *d_rev_meta,bool *d_modified_next,float* d_pageRank,float* d_pageRank_nxt){ // BEGIN KER FUN via ADDKERNEL
float num_nodes = V;
unsigned v = blockIdx.x * blockDim.x + threadIdx.x;
if(v >= V) return;
float sum = 0.000000; // DEVICE ASSTMENT in .h

for (int edge = d_rev_meta[v]; edge < d_rev_meta[v+1]; edge++)
{int nbr = d_src[edge] ;
{
int nbr = d_src[edge] ;
sum = sum + d_pageRank[nbr] / (d_meta[nbr+1]-d_meta[nbr]);

} // end FOR NBR ITR. TMP FIX!
float val = (1 - delta) / num_nodes + delta * sum; // DEVICE ASSTMENT in .h

Expand Down
Binary file added graphcode/generated_cuda/SSSP
Binary file not shown.
Loading

0 comments on commit 8c97c29

Please sign in to comment.