<a href="https://colab.research.google.com/github/nazbeh/I_C_M_E_2020/blob/master/Workshop4/OpenMP_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# OpenMP Tutorial 

## Hello World

In [None]:
%%file  hello_world_openmp.c
  
// OpenMP header 
#include <omp.h>
#include <stdio.h>

/*
 * Entry point: opens an OpenMP parallel region in which every thread of the
 * team prints its own id together with the size of the team.
 */
int main(){
    #pragma omp parallel
    {
        /* Declared inside the region, so each thread has its own copies. */
        const int team_size = omp_get_num_threads();
        const int my_id = omp_get_thread_num();
        printf("Hello I am thread %d out of %d\n", my_id, team_size);
    }
    return 0;
}



Compile with the ```-fopenmp``` flag, which tells GCC to honour the ```#pragma omp``` directives and link the OpenMP runtime



In [None]:
!gcc -o hello_openmp -fopenmp hello_world_openmp.c

Execute with the environment variable ```OMP_NUM_THREADS=<n>``` to choose how many threads the program uses

In [None]:
!OMP_NUM_THREADS=4 ./hello_openmp

## Example: Computing Pi

### Serial Version

In [None]:
%%file pi_serial.c
#include <stdio.h>

/*
 * Approximates pi with the midpoint rule applied to
 *     integral from 0 to 1 of 4 / (1 + x^2) dx
 * using n equal sub-intervals, then prints the result.
 */
int main(){
    const int n = 10000;                 /* number of sub-intervals */
    const double h = 1.0 / (double) n;   /* width of one sub-interval */
    double sum = 0.0;

    for(int i = 0; i < n; ++i){
        /* Midpoint of sub-interval i. Everything is computed in double, so
           there is no int product n*n that could overflow (it would for
           n > 46340 with a 32-bit int). */
        const double x = (i + 0.5) * h;
        sum += 4.0 / (1.0 + x * x);
    }

    /* Scale by the interval width once instead of in every iteration. */
    printf("Pi is %f\n", sum * h);
    return 0;
}

In [None]:
!gcc -o pi_serial pi_serial.c

In [None]:
!./pi_serial

### OpenMP version

In [None]:
%%file pi_openmp.c
// OpenMP header 
#include <omp.h>
#include <stdio.h>

/*
 * Approximates pi with the midpoint rule applied to
 *     integral from 0 to 1 of 4 / (1 + x^2) dx
 * using n equal sub-intervals. The loop iterations are distributed over the
 * OpenMP threads and their partial sums are combined by the reduction clause.
 */
int main(){
    const int n = 10000;                 /* number of sub-intervals */
    const double h = 1.0 / (double) n;   /* width of one sub-interval */
    double sum = 0.0;

    /* reduction(+:sum): each thread accumulates into its own copy of sum and
       the copies are added together when the loop ends. */
    #pragma omp parallel for reduction(+:sum)
    for(int i = 0; i < n; ++i){
        /* Midpoint of sub-interval i. Everything is computed in double, so
           there is no int product n*n that could overflow (it would for
           n > 46340 with a 32-bit int). x is declared inside the loop body,
           so every iteration has its own. */
        const double x = (i + 0.5) * h;
        sum += 4.0 / (1.0 + x * x);
    }

    /* Scale by the interval width once instead of in every iteration. */
    printf("Pi is %f\n", sum * h);
    return 0;
}

In [None]:
!gcc -o pi_openmp -fopenmp pi_openmp.c

In [None]:
!OMP_NUM_THREADS=4 ./pi_openmp

## Example: Difference in Variable types
Notice how the output depends on the data-sharing clause applied to a variable used inside the parallel region. The code below declares ```value``` as ```shared```; replace ```shared``` with ```private```, ```firstprivate``` or ```lastprivate``` and see how the output changes 



In [None]:
%%file variable_types.cpp
#include <omp.h>
#include <stdio.h>
// Demonstrates the effect of a data-sharing clause on a variable used in a
// parallel loop: `value` is listed as shared, so all threads update the very
// same object.
int main(){
    int n = 3;
    int value = 6;
    #pragma omp parallel for num_threads(n) shared(value)
    for (int iter = 0; iter < n; ++iter){
        const int tid = omp_get_thread_num();
        // Unsynchronized read-modify-write of the shared variable: what each
        // thread reads and prints depends on how the threads interleave.
        value += tid;
        printf("value is %d in proc %d\n", value, tid);
    }
    // Printed once, after the parallel loop.
    printf("value is %d\n", value);
    return 0;
}



In [None]:
!g++ -o variable_types -fopenmp variable_types.cpp

In [None]:
!OMP_NUM_THREADS=3 ./variable_types

## Exercise: Page Rank

### Serial Version

In [None]:
%%file pagerank.cpp
#include <stdio.h>
#include <stdlib.h>
#include <vector>

// Records one edge given as the pair (u, v): u is appended to the list kept
// at adj[v], and the counter w[u] is increased by one.
void addEdge(std::vector<int> adj[],double w[],int u, int v) 
{ 
    std::vector<int>& list_of_v = adj[v];
    list_of_v.push_back(u);
    w[u] = w[u] + 1;
}

// Builds a small 6-node graph with addEdge, then performs a fixed number of
// rank-propagation sweeps and prints the rank vector after every sweep.
int main() 
{ 
    // Compile-time constant, so the arrays below are ordinary fixed-size
    // arrays instead of variable-length arrays (a compiler extension in C++).
    constexpr int V = 6; 
    std::vector<int> adj[V];   // adj[p]: nodes whose rank flows into p
    double w[V] = {0};         // w[u]: number of edges recorded with source u
    
    // Creates a graph
    addEdge(adj,w,0,1);
    addEdge(adj,w,1,2);
    addEdge(adj,w,2,3);
    addEdge(adj,w,5,2);
    addEdge(adj,w,5,3);
    addEdge(adj,w,1,3);
    addEdge(adj,w,4,5);
    addEdge(adj,w,3,2);
    addEdge(adj,w,3,4);
    addEdge(adj,w,3,5);
    addEdge(adj,w,0,2);
    
    const size_t niter = 10;   // number of sweeps
 
    double r[V];               // current ranks
    double rn[V] = {0};        // ranks computed during the current sweep

    // Every node starts with rank 1.
    for(int point = 0; point < V; ++point)
    { 
      r[point] = 1;
    }

    for(size_t it = 0; it < niter; ++it)
    {
       // New rank of a node = sum of r[neigh]/w[neigh] over the entries of
       // its list. Every neigh stored in a list was passed to addEdge as u,
       // which incremented w[neigh], so the divisor is never zero.
       for(int point = 0; point < V; ++point)
       { 
           double acc = 0;
           for(const int neigh : adj[point]){
               acc += r[neigh]/w[neigh];
           }
           rn[point] = acc;
       }

       // Publish the new ranks only after the whole sweep has read the old ones.
       for(int point = 0; point < V; ++point)
       {
           r[point] = rn[point];
           printf("r[%d]=%f, ",point,r[point]);
       }
      printf("\n");
    }
    return 0; 
} 

In [None]:
!g++ -o pagerank pagerank.cpp

In [None]:
!./pagerank

### OpenMP version

Complete the ```//To do``` sections



In [None]:
%%file pagerank_openmp.cpp
//To do here: Include omp header

#include <stdio.h>
#include <stdlib.h>
#include <vector>

// Records one edge given as the pair (u, v): u is appended to the list kept
// at adj[v], and the counter w[u] is increased by one.
void addEdge(std::vector<int> adj[],double w[],int u, int v) 
{ 
    std::vector<int>& list_of_v = adj[v];
    list_of_v.push_back(u);
    w[u] = w[u] + 1;
}

// Builds a small 6-node graph with addEdge, then performs a fixed number of
// rank-propagation sweeps and prints the rank vector after every sweep.
// This is the exercise template: the loops are still serial.
int main() 
{ 
    // Compile-time constant, so the arrays below are ordinary fixed-size
    // arrays instead of variable-length arrays (a compiler extension in C++).
    constexpr int V = 6; 
    std::vector<int> adj[V];   // adj[p]: nodes whose rank flows into p
    double w[V] = {0};         // w[u]: number of edges recorded with source u
    
    // Creates a graph
    addEdge(adj,w,0,1);
    addEdge(adj,w,1,2);
    addEdge(adj,w,2,3);
    addEdge(adj,w,5,2);
    addEdge(adj,w,5,3);
    addEdge(adj,w,1,3);
    addEdge(adj,w,4,5);
    addEdge(adj,w,3,2);
    addEdge(adj,w,3,4);
    addEdge(adj,w,3,5);
    addEdge(adj,w,0,2);
    
    const size_t niter = 10;   // number of sweeps
 
    double r[V];               // current ranks
    double rn[V] = {0};        // ranks computed during the current sweep

    // Every node starts with rank 1.
    for(int point = 0; point < V; ++point)
    { 
      r[point] = 1;
    }

    //To do here: Create parallelizable regions wherever possible
 
    for(size_t it = 0; it < niter; ++it)
    {
       // New rank of a node = sum of r[neigh]/w[neigh] over the entries of
       // its list. Every neigh stored in a list was passed to addEdge as u,
       // which incremented w[neigh], so the divisor is never zero.
       for(int point = 0; point < V; ++point)
       { 
           double acc = 0;
           for(const int neigh : adj[point]){
               acc += r[neigh]/w[neigh];
           }
           rn[point] = acc;
       }
     
       // Publish the new ranks only after the whole sweep has read the old ones.
       for(int point = 0; point < V; ++point)
       {
           r[point] = rn[point];
           printf("r[%d]=%f, ",point,r[point]);
       }
      printf("\n");
    }
    return 0; 
}

Once it is ready, compile and execute

In [None]:
!g++ -o pagerank_openmp -fopenmp pagerank_openmp.cpp

In [None]:
!OMP_NUM_THREADS=3 ./pagerank_openmp

## Exercise: K Means

### Serial Version

Code inspired by the blog post http://www.goldsborough.me/c++/python/cuda/2017/09/10/20-32-46-exploring_k-means_in_python,_c++_and_cuda/ , which compares C++, Python and CUDA approaches

In [None]:
%%file kmeans.cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <limits>
#include <random>
#include <vector>

// A point in the plane; both coordinates default to zero.
struct Point {
  double x{0}, y{0};
};

// A list of points; used both for the data set and for the centroids.
using DataFrame = std::vector<Point>;

// Returns the squared Euclidean distance between `first` and `second`.
// The coordinate differences are squared by plain multiplication, so the
// function does not rely on std::pow being declared.
double squared_l2_distance(Point first, Point second) {
  const double dx = first.x - second.x;
  const double dy = first.y - second.y;
  return dx * dx + dy * dy;
}

int main (){
  size_t niter = 10;
  size_t k = 2;
  DataFrame data = {Point{1,2},Point{1,2},Point{3,4},Point{10,4},Point{3,4}};
  
  // Pick centroids as random points from the dataset.
  static std::random_device seed;
  static std::mt19937 random_number_generator(seed());
  std::uniform_int_distribution<size_t> indices(0, data.size() - 1);

  DataFrame means(k);
  for (auto& cluster : means) {
    cluster = data[indices(random_number_generator)];
  }


  // Find assignments 
  std::vector<size_t> assignments(data.size());
  for (size_t it = 0; it < niter; ++it) {
    for (size_t point = 0; point < data.size(); ++point) {
      
      double min_distance = std::numeric_limits<double>::max();
      size_t best_cluster = 0;
      
      for (size_t cluster = 0; cluster < k; ++cluster) {
        const double distance =
            squared_l2_distance(data[point], means[cluster]);
        if (distance < min_distance) {
          min_distance = distance;
          best_cluster = cluster;
        }
      }

      assignments[point] = best_cluster;
    }

    // Sum up and count points for each cluster.
    DataFrame new_means(k);
    std::vector<size_t> counts(k, 0);
    for (size_t point = 0; point < data.size(); ++point) {
      const auto cluster = assignments[point];
      new_means[cluster].x += data[point].x;
      new_means[cluster].y += data[point].y;
      counts[cluster] += 1;
    }

    // Divide sums by counts to get new centroids.
    for (size_t cluster = 0; cluster < k; ++cluster) {
      // Turn 0/0 into 0/1 to avoid zero division.
      const auto count = std::max<size_t>(1, counts[cluster]);
      means[cluster].x = new_means[cluster].x / count;
      means[cluster].y = new_means[cluster].y / count;
    }
  }

  for (auto& centroid : means) {
    printf("(%f,%f)",centroid.x,centroid.y);
  }
}


In [None]:
!g++ -o kmeans kmeans.cpp

In [None]:
!./kmeans

### OpenMP version

To do: Modify the serial version with the least possible effort to parallelize it with OpenMP, then compile it with ```-fopenmp``` and run it with multiple threads

In [None]:
%%file kmeans_openmp.cpp



In [None]:
!g++ 

In [None]:
!