# Installation de Hadoop

In [None]:
import os
os.environ["HADOOP_VERSION"]="3.3.6"
!wget -nc https://dlcdn.apache.org/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz
!tar -xzf hadoop-$HADOOP_VERSION.tar.gz
!git clone https://github.com/bamedro/training-bigdata.git

os.environ["JAVA_HOME"]="/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["HADOOP_HOME"]="/content/hadoop-" + os.environ["HADOOP_VERSION"]
os.environ["PATH"]=os.environ["PATH"] + ":" + os.environ["HADOOP_HOME"]+"/bin"

# On ajoute les chemins importants au fichier de configuration de Hadoop
!sed -i "s:# export JAVA_HOME=:export JAVA_HOME=$JAVA_HOME:g" $HADOOP_HOME/etc/hadoop/hadoop-env.sh
!sed -i "s:# export HADOOP_HOME=:export HADOOP_HOME=$HADOOP_HOME:g" $HADOOP_HOME/etc/hadoop/hadoop-env.sh

# On vérifie que l'ajout s'est bien passé
!cat $HADOOP_HOME/etc/hadoop/hadoop-env.sh | grep HOME

In [31]:
# Check that the installation works: the `hadoop` launcher is resolved through PATH
!hadoop version

Hadoop 3.3.6
Source code repository https://github.com/apache/hadoop.git -r 1be78238728da9266a4f88195058f08fd012bf9c
Compiled by ubuntu on 2023-06-18T08:22Z
Compiled on platform linux-x86_64
Compiled with protoc 3.7.1
From source with checksum 5652179ad55f76cb287d9c633bb53bbd
This command was run using /content/hadoop-3.3.6/share/hadoop/common/hadoop-common-3.3.6.jar


# Premier test en mode "Standalone"

Test d'Hadoop en version minimale, sans stockage distribué (HDFS) ni distribution (YARN).

In [None]:
# On lance un job Hadoop dont le rôle est de lister tous les termes commencant par 'dfs' parmis les fichiers de configuration XML d'Hadoop.
!mkdir input
!cp $HADOOP_HOME/etc/hadoop/*.xml input
!hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-$HADOOP_VERSION.jar grep input output 'dfs[a-z.]+'
!cat output/*

# Installation avec HDFS

In [None]:
# SSH est un pré-requis car les Hadoop utilise SSH pour lancer des commandes sur des noeuds distants.
!sudo apt-get install openssh-server
!sudo service ssh start

# On doit créer une clé SSH par défaut et sans mot de passe
!ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
!cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
!chmod 0600 ~/.ssh/authorized_keys

# On initialise la première connexion
!ssh -o StrictHostKeyChecking=no localhost hostname

In [None]:
# On récupère les fichiers de configuration
!cp /content/training-bigdata/1-initial-setup/core-site.xml /content/hadoop-$HADOOP_VERSION/etc/hadoop/
!cp /content/training-bigdata/1-initial-setup/hdfs-site.xml /content/hadoop-$HADOOP_VERSION/etc/hadoop/

# Regardez leur contenu
!cat /content/hadoop-$HADOOP_VERSION/etc/hadoop/core-site.xml
!cat /content/hadoop-$HADOOP_VERSION/etc/hadoop/hdfs-site.xml

In [43]:
# On peut enfin initialiser notre 'nameNode' et démarrer le service qui va gérer le système de fichier distribué (DFS)
import os
os.environ["HDFS_NAMENODE_USER"]="root"
os.environ["HDFS_DATANODE_USER"]="root"
os.environ["HDFS_SECONDARYNAMENODE_USER"]="root"

!hdfs namenode -format -noninteractive
!$HADOOP_HOME/sbin/start-dfs.sh

Starting namenodes on [localhost]
localhost: namenode is running as process 21172.  Stop it first and ensure /tmp/hadoop-root-namenode.pid file is empty before retry.
Starting datanodes
localhost: datanode is running as process 21282.  Stop it first and ensure /tmp/hadoop-root-datanode.pid file is empty before retry.
Starting secondary namenodes [a9ecafc7324b]
a9ecafc7324b: secondarynamenode is running as process 21504.  Stop it first and ensure /tmp/hadoop-root-secondarynamenode.pid file is empty before retry.


In [None]:
# Do a few hands-on exercises based on the examples provided in the 2-hdfs folder
...

# Installation avec YARN (sur un seul Node)

In [None]:
# On récupère les fichiers de configuration
!cp /content/training-bigdata/1-initial-setup/mapred-site.xml /content/hadoop-$HADOOP_VERSION/etc/hadoop/
!cp /content/training-bigdata/1-initial-setup/yarn-site.xml /content/hadoop-$HADOOP_VERSION/etc/hadoop/

# N'hésitez pas à observer leur contenu
!cat /content/hadoop-$HADOOP_VERSION/etc/hadoop/mapred-site.xml
!cat /content/hadoop-$HADOOP_VERSION/etc/hadoop/yarn-site.xml

In [38]:
# On peut démarrer le service Yarn
import os
os.environ["YARN_RESOURCEMANAGER_USER"] = "root"
os.environ["YARN_NODEMANAGER_USER"] = "root"

!$HADOOP_HOME/sbin/start-yarn.sh

Starting resourcemanager
resourcemanager is running as process 17996.  Stop it first and ensure /tmp/hadoop-root-resourcemanager.pid file is empty before retry.
Starting nodemanagers
localhost: nodemanager is running as process 18101.  Stop it first and ensure /tmp/hadoop-root-nodemanager.pid file is empty before retry.


In [36]:
# List the MapReduce code examples bundled with the Hadoop distribution
# (called without a program name, the examples jar prints the valid program names)
!hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-$HADOOP_VERSION.jar

An example program must be given as the first argument.
Valid program names are:
  aggregatewordcount: An Aggregate based map/reduce program that counts the words in the input files.
  aggregatewordhist: An Aggregate based map/reduce program that computes the histogram of the words in the input files.
  bbp: A map/reduce program that uses Bailey-Borwein-Plouffe to compute exact digits of Pi.
  dbcount: An example job that count the pageview counts from a database.
  distbbp: A map/reduce program that uses a BBP-type formula to compute exact bits of Pi.
  grep: A map/reduce program that counts the matches of a regex in the input.
  join: A job that effects a join over sorted, equally partitioned datasets
  multifilewc: A job that counts words from several files.
  pentomino: A map/reduce tile laying program to find solutions to pentomino problems.
  pi: A map/reduce program that estimates Pi using a quasi-Monte Carlo method.
  randomtextwriter: A map/reduce program that writes 10GB of r

# Déploiement d'un calcul pour estimer la valeur de Pi

In [None]:
# Observer les arguments de la commande Pi
!hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.6.jar pi

# Tester un découpage avec 5 Map, chacun effectuant 100 tirs aléatoires
!hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.6.jar pi 5 100

# Déployer un calcul à partir d'un code Python

Deux scripts, `mapper.py` (Map) et `reducer.py` (Reduce), sont présents dans le dossier `training-bigdata/3-hadoop-streaming`.

Étudiez leur fonctionnement

In [45]:
# Print the source of the Map step
!cat /content/training-bigdata/3-hadoop-streaming/mapper.py

#!/usr/bin/env python3

import sys

# reading entire line from STDIN (standard input)
for line in sys.stdin:
  # to remove leading and trailing whitespace
  line = line.strip()

  # split the line into words
  words = line.split()
    
  # we are looping over the words array and printing the word
  # with the count of 1 to the STDOUT
  for word in words:
    # write the results to STDOUT (standard output);
    # what we output here will be the input for the
    # Reduce step, i.e. the input for reducer.py
    print('%s\t%s' % (word, 1))

In [None]:
# Print the source of the Reduce step
!cat /content/training-bigdata/3-hadoop-streaming/reducer.py

#!/usr/bin/env python3

import sys

from operator import itemgetter
import sys
  
current_word = None
current_count = 0
word = None
  
# read the entire line from STDIN
for line in sys.stdin:
  # remove leading and trailing whitespace
  line = line.strip()
  # splitting the data on the basis of tab we have provided in mapper.py
  word, count = line.split('\t', 1)
  # convert count (currently a string) to int
  try:
    count = int(count)
  except ValueError:
    # count was not a number, so silently
    # ignore/discard this line
    continue
  
  # this IF-switch only works because Hadoop sorts map output
  # by key (here: word) before it is passed to the reducer
  if current_word == word:
    current_count += count
  else:
    if current_word: #to not print current_word=None
      # write result to STDOUT
      print('%s\t%s' % (current_word, current_count))
    current_count = count
    current_word = word
  
# do not forget to output the last word if needed!
if current_word == word

In [47]:
import os
# Move into the streaming folder so that the following cells can call mapper.py / reducer.py by relative path
os.chdir('/content/training-bigdata/3-hadoop-streaming/')
# Preview the first lines of the repository's LICENSE file, used as sample text in the next cells
!head ../LICENSE

Creative Commons Legal Code

CC0 1.0 Universal

    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM


In [None]:
# Map step alone: feed the sample text to mapper.py through standard input
!head ../LICENSE | python mapper.py

In [None]:
# Map + sort: `sort -k1,1` orders the mapper output on its first field (the word),
# standing in locally for the sort by key that Hadoop applies before the reducer
!head ../LICENSE | python mapper.py | sort -k1,1

In [None]:
# Full local pipeline: map, sort by key, then reduce
!head ../LICENSE | python mapper.py | sort -k1,1 | python reducer.py