## Part 1

In [1]:
!wget https://pages.cs.wisc.edu/~harter/cs639/data/hdma-wi-2021.csv

--2023-03-24 00:48:04--  https://pages.cs.wisc.edu/~harter/cs639/data/hdma-wi-2021.csv
Resolving pages.cs.wisc.edu (pages.cs.wisc.edu)... 128.105.7.9
Connecting to pages.cs.wisc.edu (pages.cs.wisc.edu)|128.105.7.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 174944099 (167M) [text/csv]
Saving to: ‘hdma-wi-2021.csv.1’


2023-03-24 00:48:13 (18.0 MB/s) - ‘hdma-wi-2021.csv.1’ saved [174944099/174944099]



In [2]:
!hdfs dfs -D dfs.block.size=1048576 -D dfs.replication=1 -put hdma-wi-2021.csv hdfs://main:9000/single.csv
!hdfs dfs -D dfs.block.size=1048576 -D dfs.replication=2 -put hdma-wi-2021.csv hdfs://main:9000/double.csv

In [3]:
# Show per-file usage under the HDFS root in human-readable units (-h).
# NOTE(review): the second column appears to include replication (double.csv
# reports twice its file size in the output below) - confirm against the docs.
!hdfs dfs -du -h hdfs://main:9000/

166.8 M  333.7 M  hdfs://main:9000/double.csv
166.8 M  166.8 M  hdfs://main:9000/single.csv


## Part 2

In [4]:
import requests
import json

def get_datanode(offset, node_url, path='/single.csv'):
    """Return the URL the namenode redirects to for a read starting at ``offset``.

    A WebHDFS ``OPEN`` request is sent with redirects disabled, so the
    ``Location`` header of the reply can be inspected instead of followed.

    Args:
        offset: Byte offset within the file at which the read would start.
        node_url: Base URL of the node to ask, e.g. ``'http://main:9870'``.
        path: Absolute HDFS path of the file. Defaults to ``'/single.csv'``,
            the file this function was originally hard-wired to.

    Returns:
        The value of the ``Location`` response header.

    Raises:
        requests.HTTPError: If the node answers with a 4xx/5xx status.
        KeyError: If the reply carries no ``Location`` header.
    """
    response = requests.get(
        node_url + '/webhdfs/v1' + path,
        params={'op': 'OPEN', 'offset': offset},
        allow_redirects=False,
    )
    # Without this, an error reply (e.g. unknown path) would only show up as
    # an uninformative KeyError on the header lookup below.
    response.raise_for_status()
    return response.headers['Location']

def get_blocks(namenode_url, num_blocks=167, block_size=1048576):
    """Count how many blocks each datanode is reported to serve.

    For the first byte of every block, ``get_datanode`` is asked where the
    namenode would redirect a read, and the ``host:port`` part of that URL is
    tallied.

    Args:
        namenode_url: Base URL of the namenode, e.g. ``'http://main:9870'``.
        num_blocks: Number of blocks to probe. Defaults to 167, the value
            this function previously hard-coded.
        block_size: Block size in bytes. Defaults to 1048576 (1 MB).

    Returns:
        A dict mapping ``'host:port'`` to the number of probed blocks whose
        redirect pointed at that datanode.
    """
    datanodes = {}
    for offset in range(0, block_size * num_blocks, block_size):
        datanode_url = get_datanode(offset, namenode_url)
        # 'http://host:port/rest' splits into ['http:', '', 'host:port', ...]
        datanode = datanode_url.split('/')[2]
        datanodes[datanode] = datanodes.get(datanode, 0) + 1
    return datanodes

# Ask the namenode's WebHDFS endpoint where each block lives and print the
# resulting {datanode 'host:port': block count} mapping.
namenode_url = 'http://main:9870'
blocks = get_blocks(namenode_url)
print(blocks)

{'7d3db3a79dc4:9864': 80, '2990d032d772:9864': 87}


## Part 3

In [5]:
import io
import requests
import json

class hdfsFile(io.RawIOBase):
    """Read-only raw byte stream over an HDFS file, fetched through WebHDFS.

    The file length is looked up once at construction time; every
    ``readinto`` call then issues one WebHDFS ``OPEN`` request starting at the
    current offset. Wrap an instance in ``io.BufferedReader`` to iterate over
    lines.

    Attributes:
        path: Absolute HDFS path of the file.
        namenode_url: Base URL of the namenode's WebHDFS endpoint.
        offset: Position of the next byte to be read.
        length: Total file length in bytes, as reported by ``GETFILESTATUS``.
        block_size: Block size in bytes used to derive ``num_blocks``.
        num_blocks: Number of blocks needed to hold ``length`` bytes.
        block_offsets: ``range`` of the starting offset of each block.
    """

    def __init__(self, path, namenode_url='http://main:9870', block_size=1048576):
        """Look up the file's length and initialise the read position.

        Args:
            path: Absolute HDFS path, e.g. ``'/single.csv'``.
            namenode_url: Base URL of the namenode. Defaults to the address
                that was previously hard-coded.
            block_size: Block size in bytes. Defaults to 1048576 (1 MB).

        Raises:
            requests.HTTPError: If the status lookup returns a 4xx/5xx reply.
        """
        super().__init__()
        self.path = path
        self.namenode_url = namenode_url
        self.offset = 0
        self.block_size = block_size

        response = requests.get(self._url(), params={"op": "GETFILESTATUS"})
        response.raise_for_status()
        self.length = response.json()['FileStatus']['length']

        # Ceiling division: a partially filled last block still counts.
        self.num_blocks = (self.length + self.block_size - 1) // self.block_size
        self.block_offsets = range(0, self.block_size * self.num_blocks, self.block_size)

    def _url(self):
        """Return the WebHDFS URL of this file on the namenode."""
        return f'{self.namenode_url}/webhdfs/v1{self.path}'

    def readable(self):
        """Report that the stream supports reading."""
        return True

    def readinto(self, b):
        """Fill ``b`` with up to ``len(b)`` bytes starting at the current offset.

        Args:
            b: Writable bytes-like buffer supplied by the caller.

        Returns:
            The number of bytes written into ``b``; 0 at end of file.

        Raises:
            requests.HTTPError: If the read request returns a 4xx/5xx reply.
                Previously the error body would have been copied into ``b``
                as if it were file content.
        """
        wanted = len(b)
        if wanted == 0 or self.offset >= self.length:
            return 0

        response = requests.get(
            self._url(),
            params={"op": "OPEN", "offset": self.offset, "length": wanted},
        )
        response.raise_for_status()

        # Defensive: never write more than the caller's buffer can hold.
        data = response.content[:wanted]
        data_len = len(data)
        b[:data_len] = data

        self.offset += data_len
        return data_len


In [6]:
import time

MB = 1048576  # bytes in one megabyte


def time_family_counts(path, buffer_size):
    """Count matching lines in an HDFS file and time the scan.

    Args:
        path: Absolute HDFS path handed to ``hdfsFile``.
        buffer_size: Size in bytes of the ``io.BufferedReader`` buffer.

    Returns:
        A ``(single_count, multi_count, seconds)`` tuple: the number of lines
        containing "Single Family", the number of remaining lines containing
        "Multifamily", and the elapsed wall-clock time of the scan.
    """
    single_count = 0
    multi_count = 0
    # perf_counter is the clock intended for measuring short intervals.
    t0 = time.perf_counter()
    with io.BufferedReader(hdfsFile(path), buffer_size=buffer_size) as reader:
        for line in reader:
            # Both needles are ASCII, so matching on the raw bytes gives the
            # same answer as decoding every line first, without the cost.
            if b"Single Family" in line:
                single_count += 1
            elif b"Multifamily" in line:
                multi_count += 1
    return single_count, multi_count, time.perf_counter() - t0


# Same scan with three buffer sizes to compare how request size affects speed.
for run, (label, buffer_size) in enumerate([("0.5 MB", MB // 2), ("1 MB", MB), ("2 MB", 2 * MB)]):
    single_count, multi_count, seconds = time_family_counts("/single.csv", buffer_size)
    if run:
        print()
    print(f"Counts from single.csv ({label} Buffer)")
    print(f"Single Family: {single_count}")
    print(f"Multi Family: {multi_count}")
    print(f"{seconds} Seconds")

Counts from single.csv (0.5 MB Buffer)
Single Family: 444874
Multi Family: 2493
19.165616035461426 Seconds

Counts from single.csv (1 MB Buffer)
Single Family: 444874
Multi Family: 2493
5.896963834762573 Seconds

Counts from single.csv (2 MB Buffer)
Single Family: 444874
Multi Family: 2493
3.8786168098449707 Seconds


## Part 4

In [13]:
# Print the cluster report: overall capacity and usage, block health counters,
# and the list of live datanodes.
!hdfs dfsadmin -fs hdfs://main:9000/ -report

Configured Capacity: 51642105856 (48.10 GB)
Present Capacity: 36326984828 (33.83 GB)
DFS Remaining: 35797991424 (33.34 GB)
DFS Used: 528993404 (504.49 MB)
DFS Used%: 1.46%
Replicated Blocks:
	Under replicated blocks: 0
	Blocks with corrupt replicas: 0
	Missing blocks: 0
	Missing blocks (with replication factor 1): 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0
Erasure Coded Block Groups: 
	Low redundancy block groups: 0
	Block groups with corrupt internal blocks: 0
	Missing block groups: 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0

-------------------------------------------------
Live datanodes (1):

Name: 172.20.0.4:9866 (project-3-bc-worker-1.cs544net)
Hostname: 7d3db3a79dc4
Decommission Status : Normal
Configured Capacity: 25821052928 (24.05 GB)
DFS Used: 260713287 (248.64 MB)
Non DFS Used: 7644583097 (7.12 GB)
DFS Remaining: 17898979328 (16.67 GB)
DFS Used%: 1.01%
DFS Remaining%: 69.32%
Configur

In [14]:
import io
import requests
import json

class hdfsFile(io.RawIOBase):
    """Read-only raw byte stream over an HDFS file that tolerates lost blocks.

    The file length is looked up once at construction time; every
    ``readinto`` call then issues one WebHDFS ``OPEN`` request that stays
    inside a single block. When a request does not come back with status
    200, the block is treated as missing: a single newline is emitted in its
    place and reading resumes at the start of the next block.

    Attributes:
        path: Absolute HDFS path of the file.
        namenode_url: Base URL of the namenode's WebHDFS endpoint.
        offset: Position of the next byte to be read.
        length: Total file length in bytes, as reported by ``GETFILESTATUS``.
        block_size: Block size in bytes.
        num_blocks: Number of blocks needed to hold ``length`` bytes.
        block_offsets: ``range`` of the starting offset of each block.
    """

    def __init__(self, path, namenode_url='http://main:9870', block_size=1048576):
        """Look up the file's length and initialise the read position.

        Args:
            path: Absolute HDFS path, e.g. ``'/single.csv'``.
            namenode_url: Base URL of the namenode. Defaults to the address
                that was previously hard-coded.
            block_size: Block size in bytes. Defaults to 1048576 (1 MB).
        """
        super().__init__()
        self.path = path
        self.namenode_url = namenode_url
        self.offset = 0
        self.block_size = block_size

        response = requests.get(self._url(), params={"op": "GETFILESTATUS"})
        self.length = response.json()['FileStatus']['length']

        # Ceiling division: a partially filled last block still counts.
        self.num_blocks = (self.length + self.block_size - 1) // self.block_size
        self.block_offsets = range(0, self.block_size * self.num_blocks, self.block_size)

    def _url(self):
        """Return the WebHDFS URL of this file on the namenode."""
        return f'{self.namenode_url}/webhdfs/v1{self.path}'

    def readable(self):
        """Report that the stream supports reading."""
        return True

    def readinto(self, b):
        """Fill ``b`` with bytes from the current block, or skip a lost block.

        Args:
            b: Writable bytes-like buffer supplied by the caller.

        Returns:
            The number of bytes written into ``b``: the payload size on
            success, 1 (a lone newline) when the block could not be read, or
            0 at end of file.
        """
        if len(b) == 0 or self.offset >= self.length:
            return 0

        block_idx = self.offset // self.block_size
        block_offset = self.offset % self.block_size
        # Never let one request span two blocks: otherwise a missing block
        # would also discard the readable bytes of the block before it, and
        # the skip below would jump over data that is still available.
        wanted = min(len(b), self.block_size - block_offset)

        response = requests.get(
            self._url(),
            params={"op": "OPEN", "offset": self.offset, "length": wanted},
        )
        if response.status_code == 200:
            data = response.content[:wanted]
            data_len = len(data)
            b[:data_len] = data
            self.offset += data_len
        else:
            # Any non-200 reply is taken to mean the block is missing
            # (NOTE(review): other failures are treated the same way).
            # Emit a newline so the partial lines on either side of the gap
            # are not glued together, then resume at the next block.
            b[:1] = b'\n'
            data_len = 1
            self.offset = (block_idx + 1) * self.block_size

        return data_len

In [17]:
def count_family_types(path, buffer_size=1048576):
    """Count "Single Family" and "Multifamily" lines in an HDFS file.

    Args:
        path: Absolute HDFS path handed to ``hdfsFile``.
        buffer_size: Size in bytes of the ``io.BufferedReader`` buffer.
            Defaults to 1048576 (1 MB).

    Returns:
        A ``(single_count, multi_count)`` tuple: the number of lines
        containing "Single Family" and the number of remaining lines
        containing "Multifamily".
    """
    single_count = 0
    multi_count = 0
    with io.BufferedReader(hdfsFile(path), buffer_size=buffer_size) as reader:
        for line in reader:
            # Match on the raw bytes instead of decoding: when hdfsFile skips
            # an unreadable block, the following line can start in the middle
            # of a character and would fail to decode as UTF-8. Both needles
            # are ASCII, so the counts are the same either way.
            if b"Single Family" in line:
                single_count += 1
            elif b"Multifamily" in line:
                multi_count += 1
    return single_count, multi_count


# single.csv has one replica per block, double.csv has two; compare what is
# still countable from each file.
for run, path in enumerate(["/single.csv", "/double.csv"]):
    single_count, multi_count = count_family_types(path)
    if run:
        print()
    print(f"Counts from {path[1:]} (1 MB Buffer)")
    print(f"Single Family: {single_count}")
    print(f"Multi Family: {multi_count}")

Counts from single.csv (1 MB Buffer)
Single Family: 212304
Multi Family: 1513

Counts from double.csv (1 MB Buffer)
Single Family: 444874
Multi Family: 2493
