merging the analysis

ashwinktpu · Apr 22, 2023 · 3ce367d · 3ce367d
1 parent 0b8f686
commit 3ce367d
Show file tree

Hide file tree

Showing 25 changed files with 2,796 additions and 1,179 deletions.
diff --git a/graphcode/generated_cuda/PageRankDSLV2.cu b/graphcode/generated_cuda/PageRankDSLV2.cu
@@ -0,0 +1,132 @@
+// FOR PageRank: nvcc PageRankDSLV2.cu -arch=sm_60 -std=c++14 -rdc=true # HW must support CC 6.0+ (Pascal or later)
+#include "PageRankDSLV2.h"
+
+// Host driver for the generated PageRank routine.
+//
+// Copies the graph's CSR arrays to the device, seeds every rank with
+// 1/num_nodes (and the "next" buffer with 0), then launches Compute_PR_kernel
+// repeatedly while the device accumulator ::diff is greater than `beta` and
+// fewer than `maxIter` iterations have run. The final ranks are copied into
+// `pageRank`.
+//
+//   g        - input graph; read through num_nodes(), num_edges(), getEdgeLen(),
+//              indexofNodes, rev_indexofNodes, edgeList and srcList.
+//   beta     - loop threshold: iteration continues while diff > beta.
+//   delta    - copied to the device symbol ::delta that Compute_PR_kernel reads.
+//   maxIter  - upper bound on the number of kernel iterations.
+//   pageRank - host output buffer; receives V floats.
+//
+// Side effects: prints the node/edge counts and the elapsed GPU time to stdout.
+// All host and device buffers allocated here are released before returning.
+// NOTE(review): CUDA API return codes are not checked in this function.
+void Compute_PR(graph& g,float beta,float delta,int maxIter,
+  float* pageRank)
+
+{
+  // CSR BEGIN
+  int V = g.num_nodes();
+  int E = g.num_edges();
+
+  printf("#nodes:%d\n",V);
+  printf("#edges:%d\n",E);
+  int* edgeLen = g.getEdgeLen();
+
+  // Host staging copies of the CSR arrays.
+  int *h_meta     = (int *)malloc( (V+1)*sizeof(int));
+  int *h_data     = (int *)malloc( (E)*sizeof(int));
+  int *h_src      = (int *)malloc( (E)*sizeof(int));
+  int *h_weight   = (int *)malloc( (E)*sizeof(int));
+  int *h_rev_meta = (int *)malloc( (V+1)*sizeof(int));
+
+  for(int i=0; i<= V; i++) {
+    h_meta[i] = g.indexofNodes[i];
+    h_rev_meta[i] = g.rev_indexofNodes[i];
+  }
+
+  for(int i=0; i< E; i++) {
+    h_data[i] = g.edgeList[i];
+    h_src[i] = g.srcList[i];
+    h_weight[i] = edgeLen[i];
+  }
+
+
+  int* d_meta;
+  int* d_data;
+  int* d_src;
+  int* d_weight;
+  int* d_rev_meta;
+  bool* d_modified_next;
+
+  cudaMalloc(&d_meta, sizeof(int)*(1+V));
+  cudaMalloc(&d_data, sizeof(int)*(E));
+  cudaMalloc(&d_src, sizeof(int)*(E));
+  cudaMalloc(&d_weight, sizeof(int)*(E));
+  cudaMalloc(&d_rev_meta, sizeof(int)*(V+1));
+  cudaMalloc(&d_modified_next, sizeof(bool)*(V));
+
+  cudaMemcpy(  d_meta,   h_meta, sizeof(int)*(V+1), cudaMemcpyHostToDevice);
+  cudaMemcpy(  d_data,   h_data, sizeof(int)*(E), cudaMemcpyHostToDevice);
+  cudaMemcpy(   d_src,    h_src, sizeof(int)*(E), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_weight, h_weight, sizeof(int)*(E), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_rev_meta, h_rev_meta, sizeof(int)*((V+1)), cudaMemcpyHostToDevice);
+
+  // The blocking uploads above have returned, so the host staging buffers
+  // are no longer needed (previously they were leaked).
+  free(h_meta);
+  free(h_data);
+  free(h_src);
+  free(h_weight);
+  free(h_rev_meta);
+
+  // CSR END
+  //LAUNCH CONFIG
+  const unsigned threadsPerBlock = 512;
+  unsigned numBlocks    = (V+threadsPerBlock-1)/threadsPerBlock;
+
+
+  // TIMER START
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  float milliseconds = 0;
+  cudaEventRecord(start,0);
+
+
+  //DECLAR DEVICE AND HOST vars in params
+  float* d_pageRank;
+  cudaMalloc(&d_pageRank, sizeof(float)*(V));
+
+
+  //BEGIN DSL PARSING 
+  float* d_pageRank_nxt;
+  cudaMalloc(&d_pageRank_nxt, sizeof(float)*(V));
+
+  float num_nodes = (float)g.num_nodes( ); // asst in .cu
+
+  // d_pageRank[i] = 1/num_nodes, d_pageRank_nxt[i] = 0 for every i < V.
+  merged_kernel_1<<<numBlocks,threadsPerBlock>>>(V, d_pageRank, (float)1 / num_nodes, d_pageRank_nxt, (float)0);
+
+  // These two symbols do not change between iterations, so they are
+  // uploaded once instead of on every pass through the loop.
+  cudaMemcpyToSymbol(::delta, &delta, sizeof(float), 0, cudaMemcpyHostToDevice);
+  cudaMemcpyToSymbol(::num_nodes, &num_nodes, sizeof(float), 0, cudaMemcpyHostToDevice);
+
+  int iterCount = 0; // asst in .cu
+
+  float diff; // asst in .cu (host mirror of the device symbol ::diff)
+
+  do{
+    // Reset the device-side accumulator before each pass.
+    diff = 0.000000;
+    cudaMemcpyToSymbol(::diff, &diff, sizeof(float), 0, cudaMemcpyHostToDevice);
+    Compute_PR_kernel<<<numBlocks, threadsPerBlock>>>(V,E,d_meta,d_data,d_src,d_weight,d_rev_meta,d_modified_next,d_pageRank,d_pageRank_nxt);
+    cudaDeviceSynchronize();
+
+    // Promote the freshly computed ranks to be the input of the next pass.
+    cudaMemcpy(d_pageRank, d_pageRank_nxt, sizeof(float)*V, cudaMemcpyDeviceToDevice);
+    iterCount++;
+    cudaMemcpyFromSymbol(&diff, ::diff, sizeof(float), 0, cudaMemcpyDeviceToHost);
+
+  }while((diff > beta) && (iterCount < maxIter));
+
+  //cudaFree up!! all propVars in this BLOCK!
+  cudaFree(d_pageRank_nxt);
+
+  //TIMER STOP
+  cudaEventRecord(stop,0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  printf("GPU Time: %.6f ms\n", milliseconds);
+
+  cudaMemcpy(pageRank, d_pageRank, sizeof(float)*(V), cudaMemcpyDeviceToHost);
+
+  // Release everything that was previously leaked on every call.
+  cudaFree(d_pageRank);
+  cudaFree(d_modified_next);
+  cudaFree(d_rev_meta);
+  cudaFree(d_weight);
+  cudaFree(d_src);
+  cudaFree(d_data);
+  cudaFree(d_meta);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+} //end FUN
diff --git a/graphcode/generated_cuda/PageRankDSLV2.h b/graphcode/generated_cuda/PageRankDSLV2.h
@@ -0,0 +1,52 @@
+// FOR BC: nvcc bc_dsl_v2.cu -arch=sm_60 -std=c++14 -rdc=true # HW must support CC 6.0+ Pascal or after
+#ifndef GENCPP_PAGERANKDSLV2_H
+#define GENCPP_PAGERANKDSLV2_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <cuda.h>
+#include "../graph.hpp"
+#include "../libcuda.cuh"
+#include <cooperative_groups.h>
+
+// Host entry point for the PageRank computation (defined in PageRankDSLV2.cu).
+// `pageRank` is a host buffer that receives one float per node of `g`;
+// `beta` is the loop threshold compared against the accumulated diff,
+// `delta` is uploaded to the device symbol ::delta, and `maxIter` caps the
+// number of iterations.
+void Compute_PR(graph& g,float beta,float delta,int maxIter,
+  float* pageRank);
+
+// Device-side globals mirroring the host parameters of Compute_PR.
+// ::delta is written by the host driver with cudaMemcpyToSymbol and read by
+// Compute_PR_kernel. ::beta and ::maxIter are neither written nor read by the
+// code visible in this diff.
+// NOTE(review): these are definitions in a header; including it from more
+// than one translation unit would define them more than once - confirm it is
+// only ever included by PageRankDSLV2.cu.
+__device__ float beta ;
+__device__ float delta ;
+__device__ int maxIter ;
+
+
+// Written by the host driver via cudaMemcpyToSymbol; Compute_PR_kernel
+// declares its own local `num_nodes`, so it does not read this symbol.
+__device__ float num_nodes ; // DEVICE ASSTMENT in .h
+
+// Fills two float arrays with constants in a single launch:
+// array_1[i] = val_1 and array_2[i] = val_2 for every i < V.
+// Each thread handles the one element at its global index; threads whose
+// index is >= V exit without writing.
+__global__ void merged_kernel_1(unsigned V, float* array_1, float val_1, float* array_2, float val_2){
+    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= V) {
+        return;
+    }
+    array_1[idx] = val_1;
+    array_2[idx] = val_2;
+}
+; // DEVICE ASSTMENT in .h
+
+// Device-side accumulator for the per-iteration rank change. The host driver
+// zeroes it with cudaMemcpyToSymbol before each launch, Compute_PR_kernel adds
+// to it with atomicAdd, and the host reads it back with cudaMemcpyFromSymbol.
+__device__ float diff ; // DEVICE ASSTMENT in .h
+
+; // DEVICE ASSTMENT in .h
+
+// One PageRank update step; each thread processes the single vertex at its
+// global thread index and exits immediately if that index is >= V.
+//
+// For its vertex the thread walks the edge range
+// [d_rev_meta[v], d_rev_meta[v+1]) and, for every source vertex d_src[edge],
+// adds d_pageRank[src] / (d_meta[src+1] - d_meta[src]) to a running sum. The new
+// rank (1 - ::delta) / V + ::delta * sum is stored in d_pageRank_nxt[v], and
+// (new rank - d_pageRank[v]) is atomically added to the device global ::diff.
+//
+// E, d_data, d_weight and d_modified_next are accepted but not used here.
+// NOTE(review): the value added to ::diff is the signed difference, not its
+// absolute value - confirm this is the intended convergence measure.
+__global__ void Compute_PR_kernel(int V, int E, int* d_meta, int* d_data, int* d_src, int* d_weight, int *d_rev_meta,bool *d_modified_next,float* d_pageRank,float* d_pageRank_nxt){ // BEGIN KER FUN via ADDKERNEL
+  const unsigned vertex = blockIdx.x * blockDim.x + threadIdx.x;
+  if(vertex >= V) return;
+
+  // Vertex count as a float, taken from the kernel argument.
+  const float nodeCount = V;
+
+  const int firstIn = d_rev_meta[vertex];
+  const int lastIn  = d_rev_meta[vertex+1];
+
+  float inSum = 0.0f;
+  for (int e = firstIn; e < lastIn; ++e)
+  {
+    const int source    = d_src[e];
+    const int outDegree = d_meta[source+1] - d_meta[source];
+    inSum = inSum + d_pageRank[source] / outDegree;
+  }
+
+  const float newRank = (1 - delta) / nodeCount + delta * inSum;
+
+  atomicAdd(& diff, (float)newRank - d_pageRank[vertex]);
+  d_pageRank_nxt[vertex] = newRank;
+} // end KER FUNC
+
+#endif
diff --git a/graphcode/generated_cuda/bc_dslV2.cu b/graphcode/generated_cuda/bc_dslV2.cu
@@ -0,0 +1,155 @@
+// FOR BC: nvcc bc_dslV2.cu -arch=sm_60 -std=c++14 -rdc=true # HW must support CC 6.0+ (Pascal or later)
+#include "bc_dslV2.h"
+
+// Host driver for the generated betweenness-centrality routine.
+//
+// Copies the graph's CSR arrays to the device, zeroes d_BC, and then, for each
+// vertex in `sourceSet`:
+//   1. resets d_delta / d_sigma through merged_kernel_1 and d_level through
+//      initKernel / initIndex,
+//   2. launches fwd_pass repeatedly, incrementing the hop counter after each
+//      launch, until a launch leaves the d_finished flag set to true,
+//   3. launches back_pass for decreasing hop values while the hop counter is
+//      greater than 1.
+// The accumulated d_BC array is finally copied into `BC`.
+//
+//   g         - input graph; read through num_nodes(), num_edges(),
+//               getEdgeLen(), indexofNodes, rev_indexofNodes, edgeList, srcList.
+//   BC        - host output buffer; receives V floats.
+//   sourceSet - source vertices to process, iterated in set order.
+//
+// Side effects: prints the node/edge counts and the elapsed GPU time to stdout.
+// All host and device buffers allocated here are released before returning.
+// NOTE(review): CUDA API return codes are not checked in this function.
+void Compute_BC(graph& g,float* BC,std::set<int>& sourceSet)
+
+{
+  // CSR BEGIN
+  int V = g.num_nodes();
+  int E = g.num_edges();
+
+  printf("#nodes:%d\n",V);
+  printf("#edges:%d\n",E);
+  int* edgeLen = g.getEdgeLen();
+
+  // Host staging copies of the CSR arrays.
+  int *h_meta     = (int *)malloc( (V+1)*sizeof(int));
+  int *h_data     = (int *)malloc( (E)*sizeof(int));
+  int *h_src      = (int *)malloc( (E)*sizeof(int));
+  int *h_weight   = (int *)malloc( (E)*sizeof(int));
+  int *h_rev_meta = (int *)malloc( (V+1)*sizeof(int));
+
+  for(int i=0; i<= V; i++) {
+    h_meta[i] = g.indexofNodes[i];
+    h_rev_meta[i] = g.rev_indexofNodes[i];
+  }
+
+  for(int i=0; i< E; i++) {
+    h_data[i] = g.edgeList[i];
+    h_src[i] = g.srcList[i];
+    h_weight[i] = edgeLen[i];
+  }
+
+
+  int* d_meta;
+  int* d_data;
+  int* d_src;
+  int* d_weight;
+  int* d_rev_meta;
+  bool* d_modified_next;
+
+  cudaMalloc(&d_meta, sizeof(int)*(1+V));
+  cudaMalloc(&d_data, sizeof(int)*(E));
+  cudaMalloc(&d_src, sizeof(int)*(E));
+  cudaMalloc(&d_weight, sizeof(int)*(E));
+  cudaMalloc(&d_rev_meta, sizeof(int)*(V+1));
+  cudaMalloc(&d_modified_next, sizeof(bool)*(V));
+
+  cudaMemcpy(  d_meta,   h_meta, sizeof(int)*(V+1), cudaMemcpyHostToDevice);
+  cudaMemcpy(  d_data,   h_data, sizeof(int)*(E), cudaMemcpyHostToDevice);
+  cudaMemcpy(   d_src,    h_src, sizeof(int)*(E), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_weight, h_weight, sizeof(int)*(E), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_rev_meta, h_rev_meta, sizeof(int)*((V+1)), cudaMemcpyHostToDevice);
+
+  // The blocking uploads above have returned, so the host staging buffers
+  // are no longer needed (previously they were leaked).
+  free(h_meta);
+  free(h_data);
+  free(h_src);
+  free(h_weight);
+  free(h_rev_meta);
+
+  // CSR END
+  //LAUNCH CONFIG
+  const unsigned threadsPerBlock = 512;
+  unsigned numBlocks    = (V+threadsPerBlock-1)/threadsPerBlock;
+
+
+  // TIMER START
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  float milliseconds = 0;
+  cudaEventRecord(start,0);
+
+
+  //DECLAR DEVICE AND HOST vars in params
+  float* d_BC;
+  cudaMalloc(&d_BC, sizeof(float)*(V));
+
+
+  //BEGIN DSL PARSING 
+  float* d_sigma;
+  cudaMalloc(&d_sigma, sizeof(float)*(V));
+
+  float* d_delta;
+  cudaMalloc(&d_delta, sizeof(float)*(V));
+
+  //EXTRA vars for ITBFS AND REVBFS
+  // Allocated once and reused for every source. The original allocated these
+  // three buffers inside the per-source loop and never freed them, leaking
+  // device memory on every iteration.
+  bool* d_finished;        cudaMalloc(&d_finished,sizeof(bool) *(1));
+  int* d_hops_from_source; cudaMalloc(&d_hops_from_source, sizeof(int));
+  int* d_level;            cudaMalloc(&d_level,sizeof(int) *(V));
+
+  initKernel<float> <<<numBlocks,threadsPerBlock>>>(V,d_BC,(float)0);
+
+  //FOR SIGNATURE of SET - Assumes set for on .cu only
+  std::set<int>::iterator itr;
+  for(itr=sourceSet.begin();itr!=sourceSet.end();itr++) 
+  {
+    int src = *itr;
+    merged_kernel_1<<<numBlocks,threadsPerBlock>>>(V, d_delta, (float)0, d_sigma, (float)0, d_sigma, src, (float)1);
+
+    bool finished;
+    int hops_from_source=0;
+
+    //EXTRA vars INITIALIZATION (per source)
+    cudaMemset(d_hops_from_source,0,sizeof(int));
+    initKernel<int> <<<numBlocks,threadsPerBlock>>>(V,d_level,-1);
+    initIndex<int><<<1,1>>>(V,d_level,src, 0);
+
+    // FORWARD PASS: repeat until a launch leaves d_finished == true.
+    do {
+      finished = true;
+      cudaMemcpy(d_finished, &finished, sizeof(bool)*(1), cudaMemcpyHostToDevice);
+
+      //Kernel LAUNCH
+      fwd_pass<<<numBlocks,threadsPerBlock>>>(V, d_meta, d_data,d_weight, d_delta, d_sigma, d_level, d_hops_from_source, d_finished,d_BC); ///DONE from varList
+
+      incrementDeviceVar<<<1,1>>>(d_hops_from_source);
+      cudaDeviceSynchronize(); //MUST - rupesh
+      ++hops_from_source; // updating the level to process in the next iteration
+
+      cudaMemcpy(&finished, d_finished, sizeof(bool)*(1), cudaMemcpyDeviceToHost);
+    }while(!finished);
+
+    // Step the host hop counter back by one and mirror it to the device.
+    hops_from_source--;
+    cudaMemcpy(d_hops_from_source, &hops_from_source, sizeof(int)*(1), cudaMemcpyHostToDevice);
+
+    //BACKWARD PASS
+    while(hops_from_source > 1) {
+
+      //KERNEL Launch
+      back_pass<<<numBlocks,threadsPerBlock>>>(V, d_meta, d_data, d_weight, d_delta, d_sigma, d_level, d_hops_from_source, d_finished
+        ,d_BC); ///DONE from varList
+
+      hops_from_source--;
+      cudaMemcpy(d_hops_from_source, &hops_from_source, sizeof(int)*(1), cudaMemcpyHostToDevice);
+    }
+    //accumulate_bc<<<numBlocks,threadsPerBlock>>>(V,d_delta, d_BC, d_level, src);
+
+  }
+
+  //cudaFree up!! all propVars in this BLOCK!
+  cudaFree(d_level);
+  cudaFree(d_hops_from_source);
+  cudaFree(d_finished);
+  cudaFree(d_delta);
+  cudaFree(d_sigma);
+
+  //TIMER STOP
+  cudaEventRecord(stop,0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  printf("GPU Time: %.6f ms\n", milliseconds);
+
+  cudaMemcpy(      BC,     d_BC, sizeof(float)*(V), cudaMemcpyDeviceToHost);
+
+  // Release everything that was previously leaked on every call.
+  cudaFree(d_BC);
+  cudaFree(d_modified_next);
+  cudaFree(d_rev_meta);
+  cudaFree(d_weight);
+  cudaFree(d_src);
+  cudaFree(d_data);
+  cudaFree(d_meta);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+} //end FUN