# Part 1: HDFS Deployment and Data Upload

In [1]:
!wget https://pages.cs.wisc.edu/~harter/cs639/data/hdma-wi-2021.csv

--2023-03-20 20:40:10--  https://pages.cs.wisc.edu/~harter/cs639/data/hdma-wi-2021.csv
Resolving pages.cs.wisc.edu (pages.cs.wisc.edu)... 128.105.7.9
Connecting to pages.cs.wisc.edu (pages.cs.wisc.edu)|128.105.7.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 174944099 (167M) [text/csv]
Saving to: ‘hdma-wi-2021.csv’


2023-03-20 20:40:12 (87.2 MB/s) - ‘hdma-wi-2021.csv’ saved [174944099/174944099]



In [2]:
#!hdfs dfs -rm -R hdfs://main:9000/single.csv
#!hdfs dfs -rm -R hdfs://main:9000/double.csv
!hdfs dfs -D dfs.block.size=1048576 -D dfs.replication=1 -cp hdma-wi-2021.csv hdfs://main:9000/single.csv
!hdfs dfs -D dfs.block.size=1048576 -D dfs.replication=2 -cp hdma-wi-2021.csv hdfs://main:9000/double.csv

In [3]:
# List human-readable sizes for everything under the HDFS root. In the output
# the first column is the file size and the second is the space consumed
# across all replicas (double.csv, stored with replication=2, shows twice its size).
!hdfs dfs -du -h hdfs://main:9000/

166.8 M  333.7 M  hdfs://main:9000/double.csv
166.8 M  166.8 M  hdfs://main:9000/single.csv


# Part 2: Block Locations

In [4]:
import requests

In [5]:
# Tally how many blocks of single.csv each datanode would serve.
#
# The block count is derived from the file's own metadata (length and
# blockSize as reported by GETFILESTATUS) instead of being hard-coded, so the
# cell stays correct if the file or its block size changes.
status_url = "http://main:9870/webhdfs/v1/single.csv?op=GETFILESTATUS"
file_status = requests.get(status_url).json()["FileStatus"]
block_size = file_status["blockSize"]
# Ceiling division: a trailing partial block still counts as one block.
num_blocks = -(-file_status["length"] // block_size)

result = {}
for i in range(num_blocks):
    # Ask for the first byte of block i. With allow_redirects=False the
    # redirect issued by main:9870 is not followed, so its "Location" header
    # (which names the datanode that would serve the read) can be inspected.
    url = "http://main:9870/webhdfs/v1/single.csv?op=OPEN&offset=" + str(i * block_size)
    resp = requests.get(url, allow_redirects=False)
    # Strip the query string so every block on the same datanode maps to one key.
    datanode_url = resp.headers["Location"].split("?")[0]
    result[datanode_url] = result.get(datanode_url, 0) + 1

In [6]:
# Display the tally held in `result`: datanode URL -> number of blocks.
result

{'http://2d2db9d9de36:9864/webhdfs/v1/single.csv': 77,
 'http://9878e36b04c8:9864/webhdfs/v1/single.csv': 90}

# Part 3: Reading the Data

In [7]:
import time

In [8]:
import io

class hdfsFile(io.RawIOBase):
    """Read-only raw byte stream over an HDFS file, fetched through WebHDFS.

    The object tracks its own read position (``offset``) and the total file
    size (``length``). It is meant to be wrapped in ``io.BufferedReader`` so
    callers can iterate over lines; each ``readinto`` call issues one HTTP
    ``OPEN`` request for the next chunk.
    """

    def __init__(self, path):
        """Look up the file's length via ``GETFILESTATUS``.

        Args:
            path: File path relative to the WebHDFS root (e.g. ``"single.csv"``).
        """
        self.path = path
        self.offset = 0
        url = "http://main:9870/webhdfs/v1/"+ path + "?op=GETFILESTATUS"
        resp = requests.get(url, allow_redirects=True)
        data = resp.json()
        self.length = data['FileStatus']['length']

    def readable(self):
        """Report that the stream supports reading."""
        return True

    def readinto(self, b):
        """Fill ``b`` with the next chunk of the file.

        Args:
            b: Writable bytes-like buffer supplied by the caller.

        Returns:
            The number of bytes written into ``b``; ``0`` at end of file (or
            when ``b`` is empty). When the chunk cannot be retrieved, a single
            ``b"\\n"`` is written and ``1`` is returned; the unreadable range
            is skipped so that iteration can continue past it.
        """
        if self.offset >= self.length:
            return 0
        if len(b) == 0:
            # Nothing can be stored; avoid a pointless HTTP round trip.
            return 0

        # Never request past the end of the file.
        length = min(len(b), self.length - self.offset)

        url = f'http://main:9870/webhdfs/v1/{self.path}?op=OPEN&offset={self.offset}&length={length}'
        resp = requests.get(url, allow_redirects=True)
        # Advance unconditionally: on failure the requested range is skipped
        # rather than retried forever.
        self.offset += length

        payload = resp.content
        # Detect failure explicitly. Relying only on a size mismatch would
        # let an error body of exactly `length` bytes be copied in as data.
        if resp.status_code != 200 or len(payload) != length:
            b[0:1] = b"\n"
            return 1

        b[0:length] = payload
        return length

In [9]:
# Stream single.csv through a BufferedReader with a 262144-byte buffer,
# counting the lines that mention each dwelling category, and time the pass.
time_0 = time.time()
n_1, n_2 = 0, 0
for raw_line in io.BufferedReader(hdfsFile("single.csv"), buffer_size=262144):
    text = raw_line.decode("utf-8")
    # A membership test is a bool; int() turns it into a 0/1 increment.
    n_1 += int("Single Family" in text)
    n_2 += int("Multifamily" in text)
time_1 = time.time()
print(
    "Counts from single.csv",
    f'Single Family: {n_1}',
    f'Multi Family: {n_2}',
    f'Seconds: {time_1 - time_0}',
    sep="\n",
)

Counts from single.csv
Single Family: 444874
Multi Family: 2493
Seconds: 27.837336540222168


In [10]:
# Same scan of single.csv as a timing comparison, this time with a
# 524288-byte read buffer.
time_0 = time.time()
n_1, n_2 = 0, 0
for raw_line in io.BufferedReader(hdfsFile("single.csv"), buffer_size=524288):
    text = raw_line.decode("utf-8")
    # A membership test is a bool; int() turns it into a 0/1 increment.
    n_1 += int("Single Family" in text)
    n_2 += int("Multifamily" in text)
time_1 = time.time()
print(
    "Counts from single.csv",
    f'Single Family: {n_1}',
    f'Multi Family: {n_2}',
    f'Seconds: {(time_1 - time_0)}',
    sep="\n",
)

Counts from single.csv
Single Family: 444874
Multi Family: 2493
Seconds: 8.11428427696228


# Part 4: Disaster Strikes

In [14]:
# Print the cluster report for hdfs://main:9000/ (capacity, replicated-block
# health such as under-replicated/missing blocks, and the live datanodes).
!hdfs dfsadmin -fs hdfs://main:9000/ -report

Configured Capacity: 15415644160 (14.36 GB)
Present Capacity: 7094927599 (6.61 GB)
DFS Remaining: 6837219328 (6.37 GB)
DFS Used: 257708271 (245.77 MB)
DFS Used%: 3.63%
Replicated Blocks:
	Under replicated blocks: 167
	Blocks with corrupt replicas: 0
	Missing blocks: 90
	Missing blocks (with replication factor 1): 90
	Low redundancy blocks with highest priority to recover: 167
	Pending deletion blocks: 0
Erasure Coded Block Groups: 
	Low redundancy block groups: 0
	Block groups with corrupt internal blocks: 0
	Missing block groups: 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0

-------------------------------------------------
Live datanodes (1):

Name: 172.19.0.4:9866 (project-3-kkk-worker-2.cs544net)
Hostname: 2d2db9d9de36
Decommission Status : Normal
Configured Capacity: 15415644160 (14.36 GB)
DFS Used: 257708271 (245.77 MB)
Non DFS Used: 8303939345 (7.73 GB)
DFS Remaining: 6837219328 (6.37 GB)
DFS Used%: 1.67%
DFS Remaining%: 44.35%
Configu

In [15]:
# Count the dwelling-category lines in double.csv (the copy stored with two
# replicas per block), reading through a 524288-byte buffer.
n_1, n_2 = 0, 0
for raw_line in io.BufferedReader(hdfsFile("double.csv"), buffer_size=524288):
    text = raw_line.decode("utf-8")
    # A membership test is a bool; int() turns it into a 0/1 increment.
    n_1 += int("Single Family" in text)
    n_2 += int("Multifamily" in text)
print(
    "Counts from double.csv",
    f'Single Family: {n_1}',
    f'Multi Family: {n_2}',
    sep="\n",
)

Counts from double.csv
Single Family: 444874
Multi Family: 2493


In [16]:
# Count the dwelling-category lines in single.csv (the copy stored with one
# replica per block), reading through a 524288-byte buffer. Chunks that
# hdfsFile cannot fetch come back as a bare newline, so they add nothing
# to either counter.
n_1, n_2 = 0, 0
for raw_line in io.BufferedReader(hdfsFile("single.csv"), buffer_size=524288):
    text = raw_line.decode("utf-8")
    # A membership test is a bool; int() turns it into a 0/1 increment.
    n_1 += int("Single Family" in text)
    n_2 += int("Multifamily" in text)
print(
    "Counts from single.csv",
    f'Single Family: {n_1}',
    f'Multi Family: {n_2}',
    sep="\n",
)

Counts from single.csv
Single Family: 205723
Multi Family: 1004
