# DSCI 617 - Homework 01
**Jeffery Boczkaja**

In [0]:
import sys
import pyspark
import math
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.mllib.random import RandomRDDs

In [0]:
# Get the SparkSession (getOrCreate reuses an existing one or builds a new one)
# and keep a handle to its SparkContext, which the RDD cells below use as `sc`.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

## Problem 1: Terminology

1. Scala
2. SparkSession
3. SparkContext
4. Resilient Distributed Dataset
5. Partitions
6. Transformation
7. Action
8. Transformation
9. Action
10. Transformation
11. Action
12. Action
13. Master Node
14. Worker Nodes
15. Cluster Manager
16. Executors

## Problem 2: Working with a Numerical RDD

In [0]:
# Build a 1,200,000-element RDD of uniform random values; the fixed seed keeps
# the figures below reproducible.
random_rdd = RandomRDDs.uniformRDD(sc, size=1200000, seed=1)

# stats() collects mean, standard deviation, minimum and maximum in a single
# job, instead of launching one Spark job per figure.
# sum() stays a separate action so the printed total is exactly the value
# RDD.sum() returns.
random_stats = random_rdd.stats()
print('Sum:     ', random_rdd.sum())
print('Mean:    ', random_stats.mean())
print('Std Dev: ', random_stats.stdev())
print('Minimum: ', random_stats.min())
print('Maximum: ', random_stats.max())

Sum:      599429.7862981893
Mean:     0.4995248219151624
Std Dev:  0.28868906841278813
Minimum:  1.0351479373671424e-07
Maximum:  0.9999991929309536


In [0]:
# Report how the RDD is split up: first the partition count, then the number
# of elements held by each partition.
num_partitions = random_rdd.getNumPartitions()
print('Number of Partitions in RDD:', num_partitions)

# glom() turns each partition into one list, so mapping len over the result
# gives one size per partition.
gmc = random_rdd.glom().map(len).collect()
print('Size of Partitions', gmc, sep='\n')

Number of Partitions in RDD: 2
Size of Partitions
[600000, 600000]


## Problem 3: Transformations

In [0]:
# Scale every value of random_rdd by a factor of 10.
scaled_rdd = random_rdd.map(lambda x: x * 10)

# stats() collects mean, standard deviation, minimum and maximum in a single
# job, instead of launching one Spark job per figure.
# sum() stays a separate action so the printed total is exactly the value
# RDD.sum() returns.
scaled_stats = scaled_rdd.stats()
print('Sum:     ', scaled_rdd.sum())
print('Mean:    ', scaled_stats.mean())
print('Std Dev: ', scaled_stats.stdev())
print('Minimum: ', scaled_stats.min())
print('Maximum: ', scaled_stats.max())

Sum:      5994297.86298199
Mean:     4.995248219151683
Std Dev:  2.8868906841278412
Minimum:  1.0351479373671424e-06
Maximum:  9.999991929309536


In [0]:
# Take the natural logarithm of every scaled value.
# NOTE(review): math.log raises ValueError for an input of exactly 0 — the
# recorded minimum of scaled_rdd is positive, so this run is unaffected.
log_rdd = scaled_rdd.map(lambda x: math.log(x))

# stats() collects mean, standard deviation, minimum and maximum in a single
# job, instead of launching one Spark job per figure.
# sum() stays a separate action so the printed total is exactly the value
# RDD.sum() returns.
log_stats = log_rdd.stats()
print('Sum:     ', log_rdd.sum())
print('Mean:    ', log_stats.mean())
print('Std Dev: ', log_stats.stdev())
print('Minimum: ', log_stats.min())
print('Maximum: ', log_stats.max())

Sum:      1561532.1140514212
Mean:     1.3012767617095178
Std Dev:  1.0004767070222784
Minimum:  -13.780966206806882
Maximum:  2.3025842859246737


## Problem 4: Calculating SSE

In [0]:
# Read the pairs file as an RDD with one raw text line per element, then
# report how many lines it contains.
pairs_path = '/FileStore/tables/pairs_data.txt'
pairs_raw = sc.textFile(pairs_path)
print(pairs_raw.count())

12743548


In [0]:
# Show the first five raw text lines to check the file's layout before parsing.
for element in pairs_raw.take(5):
    print(element)

12.3 12.1
9.1 8.7
9.3 9.9
8.5 8.5
11.2 10.8


In [0]:
def process_line(row):
    """Convert one whitespace-separated text line into a pair of floats.

    Only the first two fields of ``row`` are read; any further fields on the
    line are ignored.

    Args:
        row: A line of text such as ``'12.3 12.1'``.

    Returns:
        A 2-tuple ``(first, second)`` holding the two fields as floats.

    Raises:
        IndexError: If the line has fewer than two fields.
        ValueError: If a field that is read cannot be converted to float.
    """
    fields = row.split()
    return (float(fields[0]), float(fields[1]))

# Parse every raw line into a (float, float) tuple. The parsed RDD feeds
# several separate actions in the cells below (SSE, mean, SST), so it is
# cached to avoid re-reading and re-parsing the text file for each of them.
pairs = pairs_raw.map(process_line).cache()

# Preview the first five parsed tuples straight from `pairs`, so the output
# confirms the RDD that the later cells actually use.
for processed_tuple in pairs.take(5):
    print(processed_tuple)

(12.3, 12.1)
(9.1, 8.7)
(9.3, 9.9)
(8.5, 8.5)
(11.2, 10.8)


In [0]:
# Sum of squared errors: square the gap between the two values of each pair,
# then add the squares up over the whole RDD.
squared_diff_rdd = pairs.map(lambda xy: (xy[0] - xy[1]) ** 2)
SSE = squared_diff_rdd.sum()
print(SSE)

4597380.190042952


## Problem 5: Calculating r-Squared

In [0]:
# Average of the first value of every pair; the SST calculation centres on it.
first_elements_rdd = pairs.map(lambda xy: xy[0])
mean = first_elements_rdd.mean()
print(mean)

10.00013136059118


In [0]:
# Total sum of squares: squared distance between each pair's first value and
# `mean` (the average of those first values), added up over the whole RDD.
squared_diff_mean_rdd = pairs.map(lambda xy: (xy[0] - mean) ** 2)
SST = squared_diff_mean_rdd.sum()
print(SST)

24980514.859974924


In [0]:
# r-squared is one minus the ratio of the sum of squared errors to the total
# sum of squares.
r2 = 1 - SSE / SST
print(r2)

0.815961351644953


## Problem 6: NASA Server Logs

In [0]:
 # Read the NASA server log as an RDD with one text line per element, then
 # report how many lines it contains.
 nasa = sc.textFile('/FileStore/tables/NASA_server_logs_Aug_1995.txt')
 print(nasa.count())

1569888


In [0]:
# Show the first five log lines to check the record layout.
for element in nasa.take(5):
    print(element)

in24.inetnebr.com [01/Aug/1995:00:00:01] "GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt" 200 1839
uplherc.upl.com [01/Aug/1995:00:00:07] "GET /" 304 0
uplherc.upl.com [01/Aug/1995:00:00:08] "GET /images/ksclogo-medium.gif" 304 0
uplherc.upl.com [01/Aug/1995:00:00:08] "GET /images/MOSAIC-logosmall.gif" 304 0
uplherc.upl.com [01/Aug/1995:00:00:08] "GET /images/USA-logosmall.gif" 304 0


In [0]:
# Keep only the log lines that contain the text 'GET' and count them.
# filter() leaves get_requests_rdd holding the matching lines themselves
# (rather than one True/False flag per line); count() returns how many matched.
get_requests_rdd = nasa.filter(lambda line: 'GET' in line)
num_get_requests = get_requests_rdd.count()

In [0]:
# Keep only the log lines that contain the text 'POST' / 'HEAD' and count them.
# filter() leaves each RDD holding the matching lines themselves (rather than
# one True/False flag per line); count() returns how many matched.
post_requests_rdd = nasa.filter(lambda line: 'POST' in line)
num_post_requests = post_requests_rdd.count()
head_requests_rdd = nasa.filter(lambda line: 'HEAD' in line)
num_head_requests = head_requests_rdd.count()

In [0]:
# Print one line per request type, pairing each label with its count.
request_totals = (
    ("Number of GET requests  ", num_get_requests),
    ("Number of POST Requests:", num_post_requests),
    ("Number of HEAD Requests:", num_head_requests),
)
for label, total in request_totals:
    print(label, total)

Number of GET requests   1565812
Number of POST Requests: 111
Number of HEAD Requests: 3965
