<a href="https://colab.research.google.com/github/andiub97/CovidPubRank/blob/master/CovidPageRank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Page Ranking Algorithms on Google Cloud Dataproc

- Use the [Cloud Resource Manager](https://cloud.google.com/resource-manager) to create a project if you do not already have one.
- Enable Dataproc and Cloud Storage services for the project 
- [Enable billing](https://support.google.com/cloud/answer/6293499#enable-billing) for the project.
- See [Google Cloud Storage (GCS) Documentation](https://cloud.google.com/storage/) for more info.


## Set environment variables (optional)
This step is optional: you can install the `colab-env` dependency, allow it to access your Google Drive, and create an environment file containing the environment variables used by all later tasks (bucket names, cluster properties, and so on).

In [None]:
!pip install colab-env --upgrade

In [None]:
import colab_env

colab_env.envvar_handler.add_env("GOOGLE_PROJECT_ID", "insert_project_name", overwrite=True)
colab_env.envvar_handler.add_env("CITATIONS_BUCKET", "insert_citations_bucket_name", overwrite=True)
colab_env.envvar_handler.add_env("OUTPUT_BUCKET", "insert_output_bucket_name", overwrite=True)
colab_env.envvar_handler.add_env("CLUSTER_REGION", "insert_region", overwrite=True)
colab_env.envvar_handler.add_env("CLUSTER_ZONE", "insert_zone", overwrite=True)
colab_env.envvar_handler.add_env("CLUSTER_MACHINES_TYPE", "insert_machine_type", overwrite=True)

!more gdrive/My\ Drive/vars.env

## Allow Google Cloud access to the notebook and set GC project for this session

In [None]:
import os
from google.colab import auth
auth.authenticate_user()

project_id = os.getenv("GOOGLE_PROJECT_ID")
!gcloud config set project {project_id}

## Clone CovidPubRank repo from Github, unzip citation archive and move files to "citations" folder 

In [None]:
import tarfile
!git clone https://github.com/andiub97/CovidPubRank.git

tar = tarfile.open("./CovidPubRank/data/citations.tar.gz")
tar.extractall()
tar.close()

!mkdir ./sample_data/citations
!mv ./citations_1.txt ./sample_data/citations
!mv ./citations_10.txt ./sample_data/citations
!mv ./citations_50.txt ./sample_data/citations
!mv ./citations_100.txt ./sample_data/citations

## Create buckets and load datasets into them
Remember to create a bucket for the jar file and upload the jar to it, using either the GCP graphical interface or a shell; we suggest using the Google Cloud CLI from your machine.

In [None]:
bucket_name = os.getenv("CITATIONS_BUCKET")

!gsutil mb -l us-central1 -b on gs://{bucket_name}

# Copy files to new bucket.
!gsutil cp ./sample_data/citations/citations_100.txt gs://{bucket_name}/
!gsutil cp ./sample_data/citations/citations_50.txt gs://{bucket_name}/
!gsutil cp ./sample_data/citations/citations_10.txt gs://{bucket_name}/
!gsutil cp ./sample_data/citations/citations_1.txt gs://{bucket_name}/

Creating gs://colab-citations-bucket/...
Copying file://./sample_data/citations/citations_100.txt [Content-Type=text/plain]...
/ [1 files][  1.4 MiB/  1.4 MiB]                                                
Operation completed over 1 objects/1.4 MiB.                                      
Copying file://./sample_data/citations/citations_50.txt [Content-Type=text/plain]...
-
Operation completed over 1 objects/2.3 MiB.                                      
Copying file://./sample_data/citations/citations_10.txt [Content-Type=text/plain]...
-
Operation completed over 1 objects/6.5 MiB.                                      
Copying file://./sample_data/citations/citations_1.txt [Content-Type=text/plain]...
|
Operation completed over 1 objects/125.3 MiB.                                    


## Create output bucket

In [None]:
output_bucket_name = os.getenv("OUTPUT_BUCKET")
!gsutil mb -l us-central1 -b on gs://{output_bucket_name}

Creating gs://ranking_output_bucket/...
ServiceException: 409 A Cloud Storage bucket named 'ranking_output_bucket' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


# Weak Scalability 
Run all algorithms on the same cluster while varying the dataset size.

## Create single node cluster

In [None]:
region = os.getenv("CLUSTER_REGION")
zone = os.getenv("CLUSTER_ZONE")
!gcloud dataproc clusters create single-node-cluster \
  --region {region} \
  --zone {zone} \
  --single-node 

## Send Dataproc jobs to single-node cluster for all datasets

In [None]:
!gcloud dataproc jobs submit spark \
    --cluster=single-node-cluster \
    --region={os.getenv("CLUSTER_REGION")} \
    --jar=gs://covid_program/covidpubrank_2.12-0.1.0-SNAPSHOT.jar \
    -- "single-node" "allAlgorithms" "gs://"+ {os.getenv("CITATIONS_BUCKET")} +"/citations_500.txt" "gs://" + {os.getenv("OUTPUT_BUCKET")} + "/single-node/distributed"

In [None]:
!gcloud dataproc jobs submit spark \
    --cluster=single-node-cluster \
    --region={os.getenv("CLUSTER_REGION")} \
    --jar=gs://covid_program/covidpubrank_2.12-0.1.0-SNAPSHOT.jar \
    -- "single-node" "allAlgorithms" "gs://"+ {os.getenv("CITATIONS_BUCKET")} + "/citations_100.txt" "gs://" + {os.getenv("OUTPUT_BUCKET")} +"/single-node/notDistributed"

In [None]:
!gcloud dataproc jobs submit spark \
    --cluster=single-node-cluster \
    --region={os.getenv("CLUSTER_REGION")} \
    --jar=gs://covid_program/covidpubrank_2.12-0.1.0-SNAPSHOT.jar \
    -- "single-node" "allAlgorithms" "gs://"+ {os.getenv("CITATIONS_BUCKET")} +"/citations_50.txt" "gs://" + {os.getenv("OUTPUT_BUCKET")} +"/single-node/notDistributed"

In [None]:
!gcloud dataproc jobs submit spark \
    --cluster=single-node-cluster \
    --region={os.getenv("CLUSTER_REGION")} \
    --jar=gs://covid_program/covidpubrank_2.12-0.1.0-SNAPSHOT.jar \
    -- "single-node" "allAlgorithms" "gs://"+ {os.getenv("CITATIONS_BUCKET")} + "/citations_10.txt" "gs://" + {os.getenv("OUTPUT_BUCKET")} + "/output1"

## Delete cluster

In [None]:
!gcloud dataproc clusters delete single-node-cluster \
    --region={os.getenv("CLUSTER_REGION")}

### Get jobs list and delete job specifying its id

In [None]:
!gcloud dataproc jobs list
!gcloud dataproc jobs delete "insert job_id you want delete"

# Strong scalability
Run all algorithms on the same dataset while varying the number of workers per cluster.

## Create 2-workers cluster

In [None]:
!gcloud dataproc clusters create 2workers-cluster \
  --region {os.getenv("CLUSTER_REGION")} \
  --zone {os.getenv("CLUSTER_ZONE")} \
  --master-machine-type {os.getenv("MACHINES_TYPE")}\
  --master-boot-disk-size {os.getenv("MACHINES_TYPE")} \
  --worker-machine-type {os.getenv("MACHINES_TYPE")} \
  --num-workers 2 \
  --worker-boot-disk-size {os.getenv("MACHINES_TYPE")}

## Send Dataproc jobs to 2-node cluster for "citations_1.txt" dataset

In [None]:
!gcloud dataproc jobs submit spark \
    --cluster=2workers-cluster \
    --region={os.getenv("CLUSTER_REGION")} \
    --jar=gs://covid_program/covidpubrank_2.12-0.1.0-SNAPSHOT.jar \
    -- "2-worker" "DistributedAlgorithms" "gs://"+ {os.getenv("CITATIONS_BUCKET")} +"/citations_1.txt" "gs://" + {os.getenv("OUTPUT_BUCKET")} + "/2workers/distributed"

### Delete cluster

In [None]:
!gcloud dataproc clusters delete 2workers-cluster \
    --region={os.getenv("CLUSTER_REGION")}

### Get jobs list and delete job specifying its id

In [None]:
!gcloud dataproc jobs list
!gcloud dataproc jobs delete job_id     

## Create 4-workers cluster

In [None]:
!gcloud dataproc clusters create 4workers-cluster \
  --region {os.getenv("CLUSTER_REGION")} \
  --zone {os.getenv("CLUSTER_ZONE")} \
  --master-machine-type {os.getenv("MACHINES_TYPE")} \
  --master-boot-disk-size {os.getenv("MACHINE_DISK")} \
  --worker-machine-type {os.getenv("MACHINES_TYPE")} \
  --num-workers 4 \
  --worker-boot-disk-size {os.getenv("MACHINES_TYPE")}

## Send Dataproc jobs to 4-node cluster for "citations_1.txt" dataset

In [None]:
!gcloud dataproc jobs submit spark \
    --cluster=4workers-cluster \
    --region={os.getenv("CLUSTER_REGION")} \
    --jar=gs://covid_program/covidpubrank_2.12-0.1.0-SNAPSHOT.jar \
    -- "4-worker" "DistributedAlgorithms" "gs://"+ {os.getenv("CITATIONS_BUCKET")} +"/citations_1.txt" "gs://" + {os.getenv("OUTPUT_BUCKET")} + "/4workers/distributed"

### Delete cluster

In [None]:
!gcloud dataproc clusters delete 4workers-cluster \
    --region={os.getenv("CLUSTER_REGION")}

### Get jobs list and delete job specifying its id

In [None]:
!gcloud dataproc jobs list
!gcloud dataproc jobs delete job_id

## Get job execution output, in other words get ranking algorithms' execution time and plot them

In [None]:
# Create output directory
mkdir ./sample_data/output

# Download the file from a given Google Cloud Storage bucket.
!gsutil cp gs://ranking_output_bucket/ ./sample_data/output

## Plot execution time of algorithms

In [None]:
import numpy as np
import matplotlib.pyplot as plt                                                

# Not used by this cell; kept so that code relying on these names keeps working.
names = []
times = []
dataset_name = []
chars = "()\n"

# File holding one measurement per line with the fields (algorithm, time, dataset).
TIMINGS_FILE = '/content/sample_data/Algorithm.txt'
# Datasets shown in the chart (one bar colour each) and the algorithms on the x axis.
DATASETS = ['citations_500.txt', 'citations_100.txt', 'citations_50.txt', 'citations_1.txt']
BAR_COLORS = ['r', 'g', 'b', 'yellow']
ALGORITHMS = [
    'ranking.DistributedPageRank',
    'ranking.ParallelPageRankLibrary',
    'ranking.PageRank',
    'ranking.PageRankLibrary',
]
# set width of bar
barWidth = 0.1


def parse_timing_row(row):
    """Split one line of the timings file into its fields.

    Parentheses and the newline are removed and commas are treated as separators,
    so "(algo,1.5,citations_1.txt)" becomes ["algo", "1.5", "citations_1.txt"].
    Splitting on any run of whitespace avoids empty fields after ", ".
    """
    for c in chars:
        row = row.replace(c, "")
    return row.replace(",", " ").split()


# Every (dataset, algorithm) pair starts at '0.0' so a missing measurement still has a slot.
results1 = {dataset: {algorithm: '0.0' for algorithm in ALGORITHMS} for dataset in DATASETS}

# The context manager closes the file even if a row cannot be processed.
with open(TIMINGS_FILE, 'r') as f:
    for row in f:
        fields = parse_timing_row(row)
        # Skip blank/truncated lines and measurements of datasets that are not plotted.
        if len(fields) < 3 or fields[2] not in results1:
            continue
        algorithm, seconds, dataset = fields[:3]
        results1[dataset][algorithm] = seconds

# Algorithms to plot: the expected ones plus any extra name found in the file, so that
# every dataset contributes the same number of bars.
algorithms = list(ALGORITHMS)
for timings in results1.values():
    for algorithm in timings:
        if algorithm not in algorithms:
            algorithms.append(algorithm)

# set height of bar
heights = {
    dataset: [float(results1[dataset].get(algorithm, '0.0')) for algorithm in algorithms]
    for dataset in DATASETS
}

fig, ax = plt.subplots(figsize=(16, 9))

# Set position of bar on X axis: one group per algorithm, one bar per dataset.
group_start = np.arange(len(algorithms))

# Make the plot
for position, (dataset, color) in enumerate(zip(DATASETS, BAR_COLORS)):
    ax.bar(group_start + position * barWidth, heights[dataset], color=color,
           width=barWidth, edgecolor='grey', label=dataset)

# Axis labels
ax.set_xlabel('Algorithms Names', fontweight='bold', fontsize=15)
ax.set_ylabel('Execution time [s] in log scale', fontweight='bold', fontsize=15)

# Adding Xticks, centred under each group of bars
ax.set_xticks(group_start + barWidth * (len(DATASETS) - 1) / 2)
ax.set_xticklabels(algorithms)

# Setting a logarithmic scale for y-axis
ax.set_yscale('log')
ax.set_ylim(bottom=pow(10, -3))
ax.legend()
plt.show()