<a target="_blank" href="../cluster" style="font-size:20px">All Applications (YARN)</a>

# HDFS configuration

https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/hdfs-default.xml

In [1]:
! cat $HADOOP_HOME/etc/hadoop/hdfs-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
	<property>
		<name>dfs.blocksize</name>
		<value>16m</value>
	</property>
	<property>
		<name>dfs.replication</name>
		<value>1</value>
	</property>
	<property>
		<name>dfs.permissions.enabled</name>
		<value>false</value>
	</property>
	<property>
		<name>dfs.namenode.name.dir</name>
		<value>file:///usr/local/hadoop/hdfs/namenode</value>
	</property>
	<property>
		<name>dfs.datanode.data.dir</name>
		<value>file:///usr/local/hadoop/hdfs/datanode</value>
	</property>
</configuration>


In [2]:
! ls /usr/local/hadoop/hdfs

datanode  namenode


# Available commands

In [3]:
! hadoop fs

Usage: hadoop fs [generic options]
	[-appendToFile <localsrc> ... <dst>]
	[-cat [-ignoreCrc] <src> ...]
	[-checksum [-v] <src> ...]
	[-chgrp [-R] GROUP PATH...]
	[-chmod [-R] <MODE[,MODE]... | OCTALMODE> PATH...]
	[-chown [-R] [OWNER][:[GROUP]] PATH...]
	[-copyFromLocal [-f] [-p] [-l] [-d] [-t <thread count>] <localsrc> ... <dst>]
	[-copyToLocal [-f] [-p] [-ignoreCrc] [-crc] <src> ... <localdst>]
	[-count [-q] [-h] [-v] [-t [<storage type>]] [-u] [-x] [-e] <path> ...]
	[-cp [-f] [-p | -p[topax]] [-d] <src> ... <dst>]
	[-createSnapshot <snapshotDir> [<snapshotName>]]
	[-deleteSnapshot <snapshotDir> <snapshotName>]
	[-df [-h] [<path> ...]]
	[-du [-s] [-h] [-v] [-x] <path> ...]
	[-expunge [-immediate] [-fs <path>]]
	[-find <path> ... <expression> ...]
	[-get [-f] [-p] [-ignoreCrc] [-crc] <src> ... <localdst>]
	[-getfacl [-R] <path>]
	[-getfattr [-R] {-n name | -d} [-e en] <path>]
	[-getmerge [-nl] [-skip-empty-file] <src> <localdst>]
	[-head <file>]
	[-help [cmd ...]

# Copy files to/from HDFS

In [4]:
! echo "Test file" > test.txt

In [5]:
! ls -l .

total 384
-rw-r--r-- 1 jovyan root   49768 Oct 18 15:54 bow.png
-rw-r--r-- 1 jovyan root    6114 Oct 27 18:00 hdfs-basics.ipynb
-rw-r--r-- 1 jovyan root  139941 Oct 18 15:54 lr.png
-rw-r--r-- 1 jovyan users  25998 Oct 27 18:00 mapreduce-hw.ipynb
-rw-r--r-- 1 jovyan root   11576 Oct 27 18:00 mapreduce-wordcount.ipynb
-rw-r--r-- 1 jovyan root   33040 Oct 21 02:18 recsys.ipynb
-rw-r--r-- 1 jovyan root   73278 Oct 18 17:14 spark-advanced.ipynb
-rw-r--r-- 1 jovyan root   10381 Sep 30 00:19 spark-basics.ipynb
-rw-r--r-- 1 jovyan root    8905 Sep 30 00:19 spark-sql.ipynb
-rw-r--r-- 1 jovyan users     10 Oct 27 18:00 test.txt
drwxr-xr-x 1 jovyan root    4096 Oct 16 00:48 wiki
drwxr-xr-x 1 jovyan root    4096 Oct 19 00:36 yandex_music


In [6]:
! hadoop fs -copyFromLocal test.txt /

In [7]:
! hadoop fs -ls /

Found 2 items
-rw-r--r--   1 jovyan supergroup         10 2020-10-27 18:00 /test.txt
drwxrwx---   - root   supergroup          0 2020-10-27 17:59 /tmp


In [8]:
! hadoop fs -copyToLocal /test.txt test2.txt

In [9]:
! cat test2.txt

Test file


# Streaming from/to HDFS

In [10]:
! hadoop fs -cat /test.txt | wc -w

2


In [11]:
! echo "1 2 3 4" | hadoop fs -copyFromLocal - /test3.txt

In [12]:
! hadoop fs -cat /test3.txt

1 2 3 4


# Change the replication factor

In [13]:
! cat $HADOOP_HOME/etc/hadoop/hdfs-site.xml | grep -A 1 replication 

		<name>dfs.replication</name>
		<value>1</value>


In [14]:
! hadoop fs -stat %r /test.txt

1


In [15]:
! hadoop fs -setrep 2 /test.txt

Replication 2 set: /test.txt


In [16]:
! hadoop fs -stat %r /test.txt

2


# HDFS block size

In [17]:
! cat $HADOOP_HOME/etc/hadoop/hdfs-site.xml | grep -A 1 block 

		<name>dfs.blocksize</name>
		<value>16m</value>


In [18]:
%%file generate.py
import sys

megabytes = int(sys.argv[1])
string = "{}M\n".format(megabytes)
count = 1024 * 1024 * megabytes // len(string) - 1
print(string * count)

Writing generate.py


In [19]:
! python ./generate.py 1 | head -n 5

1M
1M
1M
1M
1M
Traceback (most recent call last):
  File "./generate.py", line 6, in <module>
    print(string * count)
BrokenPipeError: [Errno 32] Broken pipe


In [20]:
! hadoop fs -rm /16.txt /32.txt

rm: `/16.txt': No such file or directory
rm: `/32.txt': No such file or directory


In [21]:
# 16 MB file
! python ./generate.py 16 | hadoop fs -copyFromLocal - /16.txt

# 32 MB file
! python ./generate.py 32 | hadoop fs -copyFromLocal - /32.txt

In [22]:
! hdfs fsck / -files -blocks

Connecting to namenode via http://localhost:9870/fsck?ugi=jovyan&files=1&blocks=1&path=%2F
FSCK started by jovyan (auth:SIMPLE) from /127.0.0.1 for path / at Tue Oct 27 18:01:09 GMT 2020

/ <dir>
/16.txt 16777213 bytes, replicated: replication=1, 1 block(s):  OK
0. BP-417124827-172.18.0.2-1603454947911:blk_1073741827_1003 len=16777213 Live_repl=1

/32.txt 33554429 bytes, replicated: replication=1, 2 block(s):  OK
0. BP-417124827-172.18.0.2-1603454947911:blk_1073741828_1004 len=16777216 Live_repl=1
1. BP-417124827-172.18.0.2-1603454947911:blk_1073741829_1005 len=16777213 Live_repl=1

/test.txt 10 bytes, replicated: replication=2, 1 block(s):  Under replicated BP-417124827-172.18.0.2-1603454947911:blk_1073741825_1001. Target Replicas is 2 but found 1 live replica(s), 0 decommissioned replica(s), 0 decommissioning replica(s).
0. BP-417124827-172.18.0.2-1603454947911:blk_1073741825_1001 len=10 Live_repl=1

/test3.txt 8 bytes, replicated: replication=1, 1 block(s):  OK
0. BP-

# What HDFS looks like on a disk

In [23]:
! cat $HADOOP_HOME/etc/hadoop/hdfs-site.xml | grep -A 1 dir

		<name>dfs.namenode.name.dir</name>
		<value>file:///usr/local/hadoop/hdfs/namenode</value>
--
		<name>dfs.datanode.data.dir</name>
		<value>file:///usr/local/hadoop/hdfs/datanode</value>


In [24]:
! ls -R $HADOOP_HOME/hdfs/datanode

/usr/local/hadoop/hdfs/datanode:
current  in_use.lock

/usr/local/hadoop/hdfs/datanode/current:
BP-417124827-172.18.0.2-1603454947911  VERSION

/usr/local/hadoop/hdfs/datanode/current/BP-417124827-172.18.0.2-1603454947911:
current  scanner.cursor  tmp

/usr/local/hadoop/hdfs/datanode/current/BP-417124827-172.18.0.2-1603454947911/current:
finalized  rbw	VERSION

/usr/local/hadoop/hdfs/datanode/current/BP-417124827-172.18.0.2-1603454947911/current/finalized:
subdir0

/usr/local/hadoop/hdfs/datanode/current/BP-417124827-172.18.0.2-1603454947911/current/finalized/subdir0:
subdir0

/usr/local/hadoop/hdfs/datanode/current/BP-417124827-172.18.0.2-1603454947911/current/finalized/subdir0/subdir0:
blk_1073741825		  blk_1073741827	    blk_1073741829
blk_1073741825_1001.meta  blk_1073741827_1003.meta  blk_1073741829_1005.meta
blk_1073741826		  blk_1073741828
blk_1073741826_1002.meta  blk_1073741828_1004.meta

/usr/local/hadoop/hdfs/datanode/current/BP-417124827-172.18.0.2-1

In [25]:
! find $HADOOP_HOME/hdfs/datanode | grep blk | xargs head -c 32

==> /usr/local/hadoop/hdfs/datanode/current/BP-417124827-172.18.0.2-1603454947911/current/finalized/subdir0/subdir0/blk_1073741827 <==
16M
16M
16M
16M
16M
16M
16M
16M

==> /usr/local/hadoop/hdfs/datanode/current/BP-417124827-172.18.0.2-1603454947911/current/finalized/subdir0/subdir0/blk_1073741826_1002.meta <==
    ,��
==> /usr/local/hadoop/hdfs/datanode/current/BP-417124827-172.18.0.2-1603454947911/current/finalized/subdir0/subdir0/blk_1073741829 <==
32M
32M
32M
32M
32M
32M
32M
32M

==> /usr/local/hadoop/hdfs/datanode/current/BP-417124827-172.18.0.2-1603454947911/current/finalized/subdir0/subdir0/blk_1073741829_1005.meta <==
    �FmG�FmG�FmG�FmG�FmG�FmG�
==> /usr/local/hadoop/hdfs/datanode/current/BP-417124827-172.18.0.2-1603454947911/current/finalized/subdir0/subdir0/blk_1073741828 <==
32M
32M
32M
32M
32M
32M
32M
32M

==> /usr/local/hadoop/hdfs/datanode/current/BP-417124827-172.18.0.2-1603454947911/current/finalized/subdir0/subdir0/blk_10737418

In [26]:
! ls -R $HADOOP_HOME/hdfs/namenode

/usr/local/hadoop/hdfs/namenode:
current  in_use.lock

/usr/local/hadoop/hdfs/namenode/current:
edits_0000000000000000001-0000000000000000009  fsimage_0000000000000000000.md5
edits_inprogress_0000000000000000010	       seen_txid
fsimage_0000000000000000000		       VERSION
