# Installing and Configuring Environment

## Installing Java 8
Hadoop is a Java-based data processing framework, so a Java runtime has to be installed first.

OpenJDK is a development environment for building applications, applets, and components using the Java programming language.

In [1]:
# Install the headless OpenJDK 8 JDK; -qq and the redirect keep apt's output out of the notebook
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Show which Java is active before switching (the recorded run reported OpenJDK 11 here)
!java -version

# Point the java, javac and jps alternatives at the Java 8 installation
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
!update-alternatives --set javac /usr/lib/jvm/java-8-openjdk-amd64/bin/javac
!update-alternatives --set jps /usr/lib/jvm/java-8-openjdk-amd64/bin/jps
# Confirm that Java 8 is now the active version
!java -version

# Find the default Java path: resolve the /usr/bin/java symlink and strip the trailing "bin/java"
!readlink -f /usr/bin/java | sed "s:bin/java::"
# Install the OpenSSH server and start the ssh service
# (presumably required by the Hadoop start-*.sh scripts run later -- confirm)
!apt-get install openssh-server -qq > /dev/null
!service ssh start

# Show the Port-related lines of the sshd configuration
!grep Port /etc/ssh/sshd_config

# Create a new RSA key pair with an empty passphrase; the "y" fed on stdin
# presumably answers the overwrite prompt when a key already exists
!ssh-keygen -t rsa -P "" -f ~/.ssh/id_rsa <<< y

# Display the generated public key
!more /root/.ssh/id_rsa.pub

# Copy the public key to authorized_keys (">" overwrites any existing file)
!cat $HOME/.ssh/id_rsa.pub > $HOME/.ssh/authorized_keys
# Restrict authorized_keys to owner read/write
!chmod 0600 ~/.ssh/authorized_keys

# Connect to the local machine over SSH and run uptime, skipping the host-key confirmation
!ssh -o StrictHostKeyChecking=no localhost uptime


# Download the Hadoop 3.2.3 release tarball from the Apache archive (-q: quiet)
!wget -q https://archive.apache.org/dist/hadoop/common/hadoop-3.2.3/hadoop-3.2.3.tar.gz

# Extract the archive into the current working directory
!sudo tar -xzf hadoop-3.2.3.tar.gz
# Remove the tarball now that it has been extracted
!rm hadoop-3.2.3.tar.gz


# Copy the extracted Hadoop tree to /usr/local
!cp -r hadoop-3.2.3/ /usr/local/
# (-r copies directories recursively)

# Add JAVA_HOME to hadoop-env.sh: sed appends the export after every line
# that contains "export JAVA_HOME=", editing the file in place (-i)
!sed -i '/export JAVA_HOME=/a export JAVA_HOME=\/usr\/lib\/jvm\/java-8-openjdk-amd64' /usr/local/hadoop-3.2.3/etc/hadoop/hadoop-env.sh

import os

# Environment variables read by the later `!` and %%bash cells.
# HADOOP_HOME is where the Hadoop tree was copied; JAVA_HOME / JRE_HOME point
# at the Java 8 installation selected with update-alternatives.
os.environ["HADOOP_HOME"] = "/usr/local/hadoop-3.2.3"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["JRE_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64/jre"

# Append the Java and Hadoop tool directories to PATH.
# The entries are joined with os.pathsep: a plain string "+=" without a leading
# separator would fuse the first new directory onto the last existing entry and
# break both. Directories already present are skipped so that re-running this
# cell does not keep growing PATH.
_path_entries = os.environ["PATH"].split(os.pathsep) if os.environ.get("PATH") else []
for _tool_dir in (
    f'{os.environ["JAVA_HOME"]}/bin',
    f'{os.environ["JRE_HOME"]}/bin',
    f'{os.environ["HADOOP_HOME"]}/sbin',
):
    if _tool_dir not in _path_entries:
        _path_entries.append(_tool_dir)
os.environ["PATH"] = os.pathsep.join(_path_entries)

openjdk version "11.0.25" 2024-10-15
OpenJDK Runtime Environment (build 11.0.25+9-post-Ubuntu-1ubuntu122.04)
OpenJDK 64-Bit Server VM (build 11.0.25+9-post-Ubuntu-1ubuntu122.04, mixed mode, sharing)
update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java to provide /usr/bin/java (java) in manual mode
update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/javac to provide /usr/bin/javac (javac) in manual mode
update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/jps to provide /usr/bin/jps (jps) in manual mode
openjdk version "1.8.0_432"
OpenJDK Runtime Environment (build 1.8.0_432-8u432-ga~us1-0ubuntu2~22.04-ga)
OpenJDK 64-Bit Server VM (build 25.432-bga, mixed mode)
/usr/lib/jvm/java-8-openjdk-amd64/jre/
 * Starting OpenBSD Secure Shell server sshd
   ...done.
#Port 22
#GatewayPorts no
Generating public/private rsa key pair.
Created directory '/root/.ssh'.
Your identification has been saved in /root/.ssh/id_rsa
Your public key has been saved in /

## Adding Required Property to core-site.xml file

In [2]:
%%bash
# Overwrite core-site.xml with a configuration whose only property, fs.defaultFS,
# is set to hdfs://localhost:9000. HADOOP_HOME comes from os.environ, set in the setup cell.
cat <<EOF > $HADOOP_HOME/etc/hadoop/core-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
  <property>
          <name>fs.defaultFS</name>
          <value>hdfs://localhost:9000</value>
          <description>Where HDFS NameNode can be found on the network</description>
  </property>
</configuration>
EOF

In [3]:
%%bash
# Overwrite hdfs-site.xml with a configuration that sets dfs.replication to 1.
# HADOOP_HOME comes from os.environ, set in the setup cell.
cat <<EOF > $HADOOP_HOME/etc/hadoop/hdfs-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>

</configuration>
EOF

In [4]:
%%bash
# Overwrite mapred-site.xml: run MapReduce on YARN and set the application classpath.
# The heredoc delimiter (EOF) is unquoted, so bash expands $HADOOP_HOME inside the
# body and the written file contains the resolved absolute paths.
cat <<EOF > $HADOOP_HOME/etc/hadoop/mapred-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>mapreduce.application.classpath</name>
    <value>$HADOOP_HOME/share/hadoop/mapreduce/*:$HADOOP_HOME/share/hadoop/mapreduce/lib/*</value>
  </property>

</configuration>
EOF

In [5]:
%%bash
# Overwrite yarn-site.xml: ResourceManager on localhost, the mapreduce_shuffle
# auxiliary service, and the environment variables whitelisted for the NodeManager.
# HADOOP_HOME comes from os.environ, set in the setup cell.
cat <<EOF > $HADOOP_HOME/etc/hadoop/yarn-site.xml
<?xml version="1.0"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->
<configuration>
<property>
    <description>The hostname of the RM.</description>
    <name>yarn.resourcemanager.hostname</name>
    <value>localhost</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.env-whitelist</name>
    <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_HOME,PATH,LANG,TZ,HADOOP_MAPRED_HOME</value>
  </property>

<!-- Site specific YARN configuration properties -->

</configuration>
EOF

## Formatting the HDFS File System
Before HDFS can be used for the first time, the file system must be formatted. Formatting creates an empty file system by creating the storage directories and the initial versions of the NameNode's persistent data structures.

In [6]:
!$HADOOP_HOME/bin/hdfs namenode -format

#Creating other necessary enviroment variables before starting nodes
os.environ["HDFS_NAMENODE_USER"] = "root"
os.environ["HDFS_DATANODE_USER"] = "root"
os.environ["HDFS_SECONDARYNAMENODE_USER"] = "root"
os.environ["YARN_RESOURCEMANAGER_USER"] = "root"
os.environ["YARN_NODEMANAGER_USER"] = "root"

#Launching hdfs deamons
!$HADOOP_HOME/sbin/start-dfs.sh

#Launching yarn deamons
#nohup causes a process to ignore a SIGHUP signal
!nohup $HADOOP_HOME/sbin/start-yarn.sh

#Listing the running deamons
!jps

2024-12-04 11:33:29,261 INFO namenode.NameNode: STARTUP_MSG: 
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG:   host = cdbf65065d6e/172.28.0.12
STARTUP_MSG:   args = [-format]
STARTUP_MSG:   version = 3.2.3
STARTUP_MSG:   classpath = /usr/local/hadoop-3.2.3/etc/hadoop:/usr/local/hadoop-3.2.3/share/hadoop/common/lib/kerb-identity-1.0.1.jar:/usr/local/hadoop-3.2.3/share/hadoop/common/lib/commons-configuration2-2.1.1.jar:/usr/local/hadoop-3.2.3/share/hadoop/common/lib/netty-3.10.6.Final.jar:/usr/local/hadoop-3.2.3/share/hadoop/common/lib/kerb-core-1.0.1.jar:/usr/local/hadoop-3.2.3/share/hadoop/common/lib/javax.servlet-api-3.1.0.jar:/usr/local/hadoop-3.2.3/share/hadoop/common/lib/jetty-util-ajax-9.4.40.v20210413.jar:/usr/local/hadoop-3.2.3/share/hadoop/common/lib/woodstox-core-5.3.0.jar:/usr/local/hadoop-3.2.3/share/hadoop/common/lib/jackson-core-2.10.5.jar:/usr/local/hadoop-3.2.3/share/hadoop/common/lib/accessors-smart-2.4.7.jar:/u

### Monitoring Hadoop cluster with hadoop admin commands

In [7]:
# Report basic HDFS information and statistics: overall capacity and usage,
# block health counters, and one section per live DataNode
!$HADOOP_HOME/bin/hdfs dfsadmin -report

Configured Capacity: 115658190848 (107.72 GB)
Present Capacity: 78070607872 (72.71 GB)
DFS Remaining: 78070583296 (72.71 GB)
DFS Used: 24576 (24 KB)
DFS Used%: 0.00%
Replicated Blocks:
	Under replicated blocks: 0
	Blocks with corrupt replicas: 0
	Missing blocks: 0
	Missing blocks (with replication factor 1): 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0
Erasure Coded Block Groups: 
	Low redundancy block groups: 0
	Block groups with corrupt internal blocks: 0
	Missing block groups: 0
	Low redundancy blocks with highest priority to recover: 0
	Pending deletion blocks: 0

-------------------------------------------------
Live datanodes (1):

Name: 127.0.0.1:9866 (localhost)
Hostname: cdbf65065d6e
Decommission Status : Normal
Configured Capacity: 115658190848 (107.72 GB)
DFS Used: 24576 (24 KB)
Non DFS Used: 37570813952 (34.99 GB)
DFS Remaining: 78070575104 (72.71 GB)
DFS Used%: 0.00%
DFS Remaining%: 67.50%
Configured Cache Capacity: 0 (0 B)
Cache

# Map and Reduce with Hadoop Streaming Using Python
Hadoop Streaming is a utility that ships with Hadoop and lets developers write MapReduce programs in a variety of languages, such as Python, C++, and Ruby.

The utility will create a Map/Reduce job, submit the job to an appropriate cluster, and monitor the progress of the job until it completes.

In [8]:
# Explore the tool JARs bundled with Hadoop; the streaming job run later in
# this notebook uses hadoop-streaming-3.2.3.jar from this directory
!ls $HADOOP_HOME/share/hadoop/tools/lib/

aliyun-java-sdk-core-4.5.10.jar      hadoop-gridmix-3.2.3.jar
aliyun-java-sdk-kms-2.11.0.jar	     hadoop-kafka-3.2.3.jar
aliyun-java-sdk-ram-3.1.0.jar	     hadoop-openstack-3.2.3.jar
aliyun-sdk-oss-3.13.0.jar	     hadoop-resourceestimator-3.2.3.jar
aws-java-sdk-bundle-1.11.901.jar     hadoop-rumen-3.2.3.jar
azure-data-lake-store-sdk-2.2.9.jar  hadoop-sls-3.2.3.jar
azure-keyvault-core-1.0.0.jar	     hadoop-streaming-3.2.3.jar
azure-storage-7.0.0.jar		     ini4j-0.5.4.jar
hadoop-aliyun-3.2.3.jar		     jdom2-2.0.6.jar
hadoop-archive-logs-3.2.3.jar	     kafka-clients-2.8.1.jar
hadoop-archives-3.2.3.jar	     lz4-java-1.7.1.jar
hadoop-aws-3.2.3.jar		     ojalgo-43.0.jar
hadoop-azure-3.2.3.jar		     opentracing-api-0.33.0.jar
hadoop-azure-datalake-3.2.3.jar      opentracing-noop-0.33.0.jar
hadoop-datajoin-3.2.3.jar	     opentracing-util-0.33.0.jar
hadoop-distcp-3.2.3.jar		     org.jacoco.agent-0.8.5-runtime.jar
hadoop-extras-3.2.3.jar		     wildfly-openssl-1.0.7.Final.jar
hadoop-fs2img-3.2.3.

## Creating directory in HDFS

In [9]:
!$HADOOP_HOME/bin/hdfs dfs -mkdir /word_count_with_python

## Copying the file from local file system to Hadoop distributed file system (HDFS)

In [10]:
!$HADOOP_HOME/bin/hdfs dfs -put /content/uud-1945.txt /word_count_with_python

## Create Mapper File
The mapper is an executable that reads all input records from one or more files and emits key-value pairs, which become the input of the reducer.

In [11]:
%%writefile mapper.py

#!/usr/bin/env python
# The '#!' line above is a shebang: it names the interpreter to use when the
# file is executed directly.
# NOTE(review): this cell has a blank line between %%writefile and the shebang,
# so the shebang is not the first line of the written file; the streaming job
# invokes the script through `python` explicitly, so it is not relied on there.
"""Word-count mapper for Hadoop Streaming.

Reads text lines from standard input and writes one ``word<TAB>1`` record per
whitespace-separated word to standard output. Those records are the input of
the reduce step (reducer.py).
"""

# sys gives access to STDIN, which Hadoop Streaming uses to feed the mapper
import sys


def map_words(lines):
    """Yield a ``(word, 1)`` pair for every whitespace-separated word.

    ``lines`` is any iterable of strings. ``str.split()`` without arguments
    ignores leading and trailing whitespace and never returns empty strings,
    so blank lines simply produce no pairs.
    """
    for line in lines:
        for word in line.split():
            yield word, 1


def main():
    """Stream STDIN through ``map_words`` and print tab-separated records."""
    for word, count in map_words(sys.stdin):
        # The tab is the key/value separator that reducer.py splits on.
        print('%s\t%s' % (word, count))


if __name__ == '__main__':
    main()

Writing mapper.py


## Create Reducer File
The reducer is an executable that reads all the intermediate key-value pairs generated by the mapper and generates a final output as a result of a computation operation like addition, filtration, and aggregation.

Both the mapper and the reducer read the input from stdin (line by line) and emit the output to stdout.

In [12]:
%%writefile reducer.py

#!/usr/bin/env python
# NOTE(review): this cell has a blank line between %%writefile and the shebang,
# so the shebang is not the first line of the written file; the streaming job
# invokes the script through `python` explicitly, so it is not relied on there.
"""Word-count reducer for Hadoop Streaming.

Reads ``word<TAB>count`` records from standard input, sums the counts of each
run of consecutive identical words, and writes one ``word<TAB>total`` record
per run to standard output.

Grouping consecutive records is only correct because Hadoop sorts the map
output by key (here: the word) before passing it to the reducer.
"""

from itertools import groupby
from operator import itemgetter
import sys


def _parse_records(lines):
    """Yield ``(word, count)`` for every well-formed ``word<TAB>count`` line.

    Lines without a tab, or whose count is not an integer, are silently
    discarded instead of aborting the job or corrupting the running totals.
    """
    for line in lines:
        # Split on the first tab only, matching the separator used by mapper.py.
        fields = line.strip().split('\t', 1)
        if len(fields) != 2:
            continue
        word, raw_count = fields
        try:
            count = int(raw_count)
        except ValueError:
            # count was not a number: ignore this line
            continue
        yield word, count


def reduce_counts(lines):
    """Yield ``(word, total)`` for each run of consecutive equal words.

    Nothing is yielded for empty input, and the final group is always emitted
    even when the last input line is malformed.
    """
    for word, records in groupby(_parse_records(lines), key=itemgetter(0)):
        yield word, sum(count for _, count in records)


def main():
    """Reduce the records arriving on STDIN and print the totals."""
    for word, total in reduce_counts(sys.stdin):
        print('%s\t%s' % (word, total))


if __name__ == '__main__':
    main()

Writing reducer.py


## Changing the permissions of the files

In [13]:
!chmod 777 /content/mapper.py /content/reducer.py
#Setting 777 permissions to a file or directory means that it will be readable, writable and executable by all users

## Running MapReduce Programs

In [14]:
!$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.2.3.jar \
  -input /word_count_with_python/uud-1945.txt \
  -output /word_count_with_python/output \
  -mapper "python /content/mapper.py" \
  -reducer "python /content/reducer.py"

packageJobJar: [/tmp/hadoop-unjar6357925457218400261/] [] /tmp/streamjob2589170746446444940.jar tmpDir=null
2024-12-04 11:34:11,634 INFO client.RMProxy: Connecting to ResourceManager at localhost/127.0.0.1:8032
2024-12-04 11:34:12,004 INFO client.RMProxy: Connecting to ResourceManager at localhost/127.0.0.1:8032
2024-12-04 11:34:12,356 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/root/.staging/job_1733312036894_0001
2024-12-04 11:34:12,765 INFO mapred.FileInputFormat: Total input files to process : 1
2024-12-04 11:34:12,899 INFO mapreduce.JobSubmitter: number of splits:2
2024-12-04 11:34:13,570 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1733312036894_0001
2024-12-04 11:34:13,572 INFO mapreduce.JobSubmitter: Executing with tokens: []
2024-12-04 11:34:13,837 INFO conf.Configuration: resource-types.xml not found
2024-12-04 11:34:13,838 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2024-12-04 11:34:14,2

## Exploring the created output directory

In [15]:
!$HADOOP_HOME/bin/hdfs dfs -ls /word_count_with_python/output
# part-00000 contains the actual output; _SUCCESS is an empty marker file

Found 2 items
-rw-r--r--   1 root supergroup          0 2024-12-04 11:34 /word_count_with_python/output/_SUCCESS
-rw-r--r--   1 root supergroup       1403 2024-12-04 11:34 /word_count_with_python/output/part-00000


In [16]:
# Print the first 50 lines of the result file (one "word<TAB>count" record per line)
!$HADOOP_HOME/bin/hdfs dfs -cat /word_count_with_python/output/part-00000 | head -50

(	1
1945	1
Allah	1
Atas	1
Bahwa	1
DASAR	1
Dan	1
Dasar	1
Esa,	1
INDONESIA	1
Indonesia	9
Indonesia,	2
Indonesia.	1
Keadilan	1
Kebangsaan	1
Kemanusiaan	1
Kemerdekaan	2
Kemudian	1
Kerakyatan	1
Ketuhanan	1
Kuasa	1
Maha	2
NEGARA	1
Negara	4
P	1
PEMBUKAAN	1
Pemerintah	1
Permusyawaratan/Perwakilan,	1
Persatuan	1
REPUBLIK	1
Republik	1
TAHUN	1
UNDANG-UNDANG	1
Undang-Undang	1
Yang	2
a	1
abadi	1
adil	2
atas	1
b	1
bagi	1
bangsa	2
bangsa,	1
bebas,	1
beradab,	1
berbahagia	1
berdasar	1
berdasarkan	1
berdaulat,	1
berkat	1


## Copying the word-count output from HDFS to the local file system (as hdfs-wordcount.txt)

In [17]:
!$HADOOP_HOME/bin/hdfs dfs -copyToLocal /word_count_with_python/output/part-00000 /content/hdfs-wordcount.txt