1. Write a Python program to read a Hadoop configuration file and display the core components of Hadoop.




```
import configparser

def display_hadoop_components(config_file):
    """Print the core and HDFS settings found in a Hadoop configuration file.

    Hadoop stores its settings in XML ``*-site.xml`` files: a
    ``<configuration>`` root holding ``<property>`` name/value pairs.
    ``configparser`` cannot read that format, so such files are parsed as XML
    and treated as one section named after the file
    (``core-site.xml`` -> ``core-site``).  Any other file is read as INI text
    with ``[core-site]`` / ``[hdfs-site]`` sections.

    Args:
        config_file: Path to the configuration file to inspect.
    """
    import os
    import xml.etree.ElementTree as ET

    try:
        root = ET.parse(config_file).getroot()
    except (ET.ParseError, OSError):
        # Not XML (or unreadable): fall through to the INI reader below.
        root = None

    if root is not None and root.tag == 'configuration':
        properties = {}
        for prop in root.iter('property'):
            name = (prop.findtext('name') or '').strip()
            if name:
                properties[name] = (prop.findtext('value') or '').strip()
        section_name = os.path.splitext(os.path.basename(config_file))[0]
        # A plain dict supports the same `in` / `[section].get(key)` access
        # that the ConfigParser branch offers.
        config = {section_name: properties}
    else:
        config = configparser.ConfigParser()
        config.read(config_file)

    if 'core-site' in config:
        print('Core components:')
        print(config['core-site'].get('fs.defaultFS'))
    else:
        print('No core components found in the configuration file.')

    if 'hdfs-site' in config:
        hdfs_section = config['hdfs-site']
        print('\nHDFS components:')
        print(f"Namenode: {hdfs_section.get('dfs.namenode.rpc-address')}")
        print(f"Datanodes: {hdfs_section.get('dfs.datanode.data.dir')}")
    else:
        print('No HDFS components found in the configuration file.')

# Location of the Hadoop configuration file to inspect
config_file = '/path/to/hadoop/config/file.xml'

# Print the components described by that file
display_hadoop_components(config_file=config_file)

```



2. Implement a Python function that calculates the total file size in a Hadoop Distributed File System (HDFS) directory.



```
import pyarrow.hdfs as hdfs

def calculate_directory_size(hdfs_host, hdfs_port, hdfs_directory):
    """Return the combined size, in bytes, of every file under an HDFS directory.

    The directory tree is walked recursively and the size of each file is
    summed.

    Args:
        hdfs_host: Host name of the HDFS NameNode.
        hdfs_port: Port of the HDFS NameNode.
        hdfs_directory: HDFS path of the directory to measure.

    Returns:
        The total size in bytes (0 when the tree contains no files).
    """
    import posixpath

    fs = hdfs.connect(host=hdfs_host, port=hdfs_port)
    total_size = 0
    try:
        # walk() yields (path, directories, files) tuples like os.walk
        for path, _, files in fs.walk(hdfs_directory):
            for file_name in files:
                file_info = fs.info(posixpath.join(path, file_name))
                # info() returns a metadata dict, so the size is a key
                # lookup, not an attribute.
                total_size += file_info['size']
    finally:
        # Release the connection even if a lookup fails part-way through.
        fs.close()

    return total_size

# Connection details and the directory to measure
hdfs_host = 'localhost'
hdfs_port = 9000
hdfs_directory = '/path/to/hdfs/directory'

# Sum the sizes of all files below the directory, then report the result
total_file_size = calculate_directory_size(
    hdfs_host=hdfs_host,
    hdfs_port=hdfs_port,
    hdfs_directory=hdfs_directory,
)
print(f"Total file size in HDFS directory '{hdfs_directory}': {total_file_size} bytes")
```



3. Create a Python program that extracts and displays the top N most frequent words from a large text file using the MapReduce approach.



```
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

class TopNWords(MRJob):
    """Two-step MapReduce job that emits the N most frequent words of its input.

    Step 1 counts the occurrences of each word.  Step 2 funnels every
    ``(count, word)`` pair to a single reducer, which keeps the N largest.
    N is set with the ``--N`` command-line option (default 10).
    """

    # Compiled once so the mapper does not look the pattern up for every line.
    WORD_RE = re.compile(r'\w+')

    def configure_args(self):
        """Register the ``--N`` option on top of the standard mrjob options."""
        super().configure_args()
        self.add_passthru_arg('--N', type=int, default=10, help='Number of top words to display')

    def steps(self):
        """Return the two steps: word counting, then top-N selection."""
        return [
            MRStep(mapper=self.mapper,
                   combiner=self.combiner,
                   reducer=self.reducer),
            MRStep(reducer=self.final_reducer),
        ]

    def mapper(self, _, line):
        """Emit ``(word, 1)`` for every lower-cased word on the line."""
        for word in self.WORD_RE.findall(line.lower()):
            yield word, 1

    def combiner(self, word, counts):
        """Pre-aggregate the counts of a word before they reach the reducer."""
        yield word, sum(counts)

    def reducer(self, word, counts):
        """Emit ``(count, word)`` under one shared key so step 2 sees all words."""
        yield None, (sum(counts), word)

    def final_reducer(self, _, word_counts):
        """Yield ``(word, count)`` for the N most frequent words, highest first."""
        import heapq

        # nlargest keeps only N pairs in memory instead of sorting every word;
        # the result equals sorted(word_counts, reverse=True)[:N].
        for count, word in heapq.nlargest(self.options.N, word_counts):
            yield word, count


if __name__ == '__main__':
    TopNWords.run()
```
To run the program, save it to a file (e.g., top_n_words.py) and execute it with the following command, specifying the input file and the value of N for the number of top words to display:


```
python top_n_words.py input.txt --N 5
```





4. Write a Python script that checks the health status of the NameNode and DataNodes in a Hadoop cluster using Hadoop's REST API.




```
import requests

def check_namenode_health(nn_host, nn_port):
    """Return the number of live DataNodes reported by the NameNode.

    The value is read from the ``NumLiveDataNodes`` attribute of the
    ``FSNamesystemState`` JMX bean.  (In the ``NameNodeInfo`` bean,
    ``LiveNodes`` is a JSON-encoded string and has no such key.)

    Args:
        nn_host: Host name of the NameNode web endpoint.
        nn_port: Port of the NameNode web endpoint.

    Returns:
        The live DataNode count, or -1 when the NameNode cannot be reached
        or the response does not contain the expected attribute.
    """
    url = f"http://{nn_host}:{nn_port}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystemState"
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # An unreachable NameNode is a failed health check, not a crash.
        return -1
    if response.status_code != 200:
        return -1

    try:
        beans = response.json().get('beans') or []
    except ValueError:
        return -1
    if not beans or 'NumLiveDataNodes' not in beans[0]:
        return -1
    return beans[0]['NumLiveDataNodes']

def check_datanode_health(nn_host, nn_port):
    """Return how many DataNodes the NameNode lists as live.

    The list comes from the ``LiveNodes`` attribute of the ``NameNodeInfo``
    JMX bean.  That attribute is a JSON-encoded string mapping each node to
    its details, so it is decoded before counting.

    Args:
        nn_host: Host name of the NameNode web endpoint.
        nn_port: Port of the NameNode web endpoint.

    Returns:
        The number of live DataNodes, or -1 when the NameNode cannot be
        reached or the response cannot be interpreted.
    """
    import json

    url = f"http://{nn_host}:{nn_port}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo"
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # An unreachable NameNode is a failed health check, not a crash.
        return -1
    if response.status_code != 200:
        return -1

    try:
        beans = response.json().get('beans') or []
        if not beans or 'LiveNodes' not in beans[0]:
            return -1
        live_nodes = beans[0]['LiveNodes']
        if isinstance(live_nodes, str):
            live_nodes = json.loads(live_nodes)
    except ValueError:
        return -1
    return len(live_nodes)

# NameNode web endpoint to query
nn_host = 'localhost'
nn_port = 50070

# Report on the NameNode (-1 signals that the lookup failed)
nn_health = check_namenode_health(nn_host, nn_port)
if nn_health == -1:
    print("Failed to retrieve NameNode health status.")
else:
    print(f"NameNode is healthy. Number of live DataNodes: {nn_health}")

# Report on the DataNodes (-1 signals that the lookup failed)
dn_health = check_datanode_health(nn_host, nn_port)
if dn_health == -1:
    print("Failed to retrieve DataNode health status.")
else:
    print(f"Number of live DataNodes: {dn_health}")
```



5. Develop a Python program that lists all the files and directories in a specific HDFS path.




```
import pyarrow.hdfs as hdfs

def list_hdfs_path(hdfs_host, hdfs_port, hdfs_path):
    """Print the name of every file and directory directly under an HDFS path.

    Args:
        hdfs_host: Host name of the HDFS NameNode.
        hdfs_port: Port of the HDFS NameNode.
        hdfs_path: HDFS directory whose contents are listed.
    """
    fs = hdfs.connect(host=hdfs_host, port=hdfs_port)
    try:
        # detail=True makes ls() return one metadata dict per entry; the
        # default returns plain path strings, which cannot be indexed by
        # 'name'.
        file_status_list = fs.ls(hdfs_path, detail=True)

        print(f"Contents of {hdfs_path}:")
        for file_status in file_status_list:
            print(file_status['name'])
    finally:
        # Release the connection even if the listing fails.
        fs.close()

# Connection details and the directory to list
hdfs_host = 'localhost'
hdfs_port = 9000
hdfs_path = '/path/to/hdfs/directory'

# Print everything found directly under that directory
list_hdfs_path(hdfs_host=hdfs_host, hdfs_port=hdfs_port, hdfs_path=hdfs_path)
```



6. Implement a Python program that analyzes the storage utilization of DataNodes in a Hadoop cluster and identifies the nodes with the highest and lowest storage capacities.




```
import requests

def analyze_data_nodes_storage(nn_host, nn_port):
    """Compute per-DataNode storage utilization from the NameNode's JMX data.

    The ``LiveNodes`` attribute of the ``NameNodeInfo`` bean is a
    JSON-encoded string mapping each node to its details; it is decoded and
    the ``used`` / ``capacity`` ratio of every node is computed.

    Args:
        nn_host: Host name of the NameNode web endpoint.
        nn_port: Port of the NameNode web endpoint.

    Returns:
        A tuple ``(node_usages, highest, lowest)`` where ``node_usages`` maps
        node name to utilization in percent and ``highest`` / ``lowest`` are
        the names of the most and least utilized nodes.  Returns
        ``({}, None, None)`` when the request fails or no live nodes are
        reported.
    """
    import json

    url = f"http://{nn_host}:{nn_port}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo"
    response = requests.get(url)
    if response.status_code != 200:
        return {}, None, None

    beans = response.json().get('beans') or []
    live_nodes = beans[0].get('LiveNodes') if beans else None
    if isinstance(live_nodes, str):
        live_nodes = json.loads(live_nodes)
    if not live_nodes:
        return {}, None, None

    node_usages = {}
    for node_id, node_info in live_nodes.items():
        # The node name is the key of the LiveNodes map; an explicit 'name'
        # entry is honoured if one is present.
        node_name = node_info.get('name', node_id)
        storage_capacity = node_info['capacity']
        storage_used = node_info['used']
        # A node reporting zero capacity counts as 0% instead of dividing
        # by zero.
        if storage_capacity:
            node_usages[node_name] = (storage_used / storage_capacity) * 100
        else:
            node_usages[node_name] = 0.0

    highest_utilization_node = max(node_usages, key=node_usages.get)
    lowest_utilization_node = min(node_usages, key=node_usages.get)

    return node_usages, highest_utilization_node, lowest_utilization_node

# NameNode web endpoint to query
nn_host = 'localhost'
nn_port = 50070

# Collect utilization per DataNode plus the two extremes
node_usages, highest_utilization_node, lowest_utilization_node = analyze_data_nodes_storage(nn_host, nn_port)

if not node_usages:
    # An empty mapping is how the analysis signals failure
    print("Failed to retrieve data node storage utilization.")
else:
    print("Data Node Storage Utilization:")
    for node, utilization in node_usages.items():
        print(f"{node}: {utilization:.2f}%")

    print(f"\nNode with the highest storage utilization: {highest_utilization_node} ({node_usages[highest_utilization_node]:.2f}%)")
    print(f"Node with the lowest storage utilization: {lowest_utilization_node} ({node_usages[lowest_utilization_node]:.2f}%)")
```



7. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, monitor its progress, and retrieve the final output.




```
import requests
import time

def submit_hadoop_job(resourcemanager_host, resourcemanager_port, job_name, jar_path, main_class, input_path, output_path):
    """Submit an application to YARN through the ResourceManager REST API.

    Submission takes two requests: ``POST .../apps/new-application`` obtains
    an application id, then ``POST .../apps`` submits the application under
    that id.  The submit call answers 202 with no JSON body, so the id
    returned here is the one obtained in the first step.

    Args:
        resourcemanager_host: Host name of the ResourceManager.
        resourcemanager_port: Port of the ResourceManager web service.
        job_name: Name shown for the application.
        jar_path: Path of the job jar used in the launch command.
        main_class: Main class used in the launch command.
        input_path: Input path passed to the job.
        output_path: Output path passed to the job.

    Returns:
        The application id on success, otherwise ``None``.
    """
    url = f"http://{resourcemanager_host}:{resourcemanager_port}/ws/v1/cluster/apps"
    headers = {'Content-Type': 'application/json'}

    # Step 1: reserve an application id.
    new_app_response = requests.post(f"{url}/new-application", headers=headers)
    if new_app_response.status_code != 200:
        print("Failed to submit the job.")
        return None
    app_id = new_app_response.json().get('application-id')
    if not app_id:
        print("Failed to submit the job.")
        return None

    # Step 2: submit the application context.  The fields sit at the top
    # level of the JSON body, with no wrapping "application" object.
    payload = {
        "application-id": app_id,
        "application-name": job_name,
        "am-container-spec": {
            "commands": {
                "command": f"yarn jar {jar_path} {main_class} {input_path} {output_path}"
            }
        },
        "max-app-attempts": 1,
        "resource": {
            "memory": 1024,
            "vCores": 1
        }
    }

    response = requests.post(url, headers=headers, json=payload)
    if response.status_code != 202:
        print("Failed to submit the job.")
        return None

    print(f"Job submitted successfully. Application ID: {app_id}")
    return app_id

def monitor_job_progress(resourcemanager_host, resourcemanager_port, app_id):
    """Poll the ResourceManager every five seconds and print the job state.

    Polling stops once the state is FINISHED, FAILED or KILLED, or as soon
    as a status request does not return HTTP 200.

    Args:
        resourcemanager_host: Host name of the ResourceManager.
        resourcemanager_port: Port of the ResourceManager web service.
        app_id: Id of the application to watch.
    """
    status_url = f"http://{resourcemanager_host}:{resourcemanager_port}/ws/v1/cluster/apps/{app_id}"
    request_headers = {'Content-Type': 'application/json'}
    terminal_states = ('FINISHED', 'FAILED', 'KILLED')

    while True:
        reply = requests.get(status_url, headers=request_headers)
        if reply.status_code != 200:
            print("Failed to get job status.")
            return

        app_status = reply.json()['app']['state']
        print(f"Job status: {app_status}")
        if app_status in terminal_states:
            return

        time.sleep(5)

def retrieve_job_output(resourcemanager_host, resourcemanager_port, app_id, output_path):
    """Download the log page of a job's first attempt and save it locally.

    The attempt list is fetched from the ResourceManager.  The log URL is
    taken from the attempt's ``logsLink`` field when present; otherwise it is
    built from the attempt's container id.

    Args:
        resourcemanager_host: Host name of the ResourceManager.
        resourcemanager_port: Port of the ResourceManager web service.
        app_id: Id of the application whose logs are fetched.
        output_path: Local file the downloaded content is written to.
    """
    url = f"http://{resourcemanager_host}:{resourcemanager_port}/ws/v1/cluster/apps/{app_id}/appattempts"
    headers = {'Content-Type': 'application/json'}

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print("Failed to retrieve job attempts.")
        return

    attempts = (response.json().get('appAttempts') or {}).get('appAttempt') or []
    if not attempts:
        # Nothing to download when no attempt has been recorded.
        print("Failed to retrieve job attempts.")
        return

    first_attempt = attempts[0]
    logs_url = first_attempt.get('logsLink')
    if not logs_url:
        container_id = first_attempt['containerId']
        logs_url = f"http://{resourcemanager_host}:{resourcemanager_port}/proxy/{app_id}/node/{container_id}/logFiles/syslog/?start=-4096"

    response = requests.get(logs_url)
    if response.status_code != 200:
        print("Failed to retrieve job output.")
        return

    with open(output_path, 'wb') as f:
        f.write(response.content)
    print(f"Job output retrieved and saved to: {output_path}")

# ResourceManager web service to talk to
resourcemanager_host = 'localhost'
resourcemanager_port = 8088

# Description of the job to run
job_name = 'MyJob'
jar_path = '/path/to/job.jar'
main_class = 'com.example.MyJobClass'
input_path = '/path/to/input'
output_path = '/path/to/output'

# Hand the job to YARN; a falsy id means submission failed
app_id = submit_hadoop_job(
    resourcemanager_host,
    resourcemanager_port,
    job_name,
    jar_path,
    main_class,
    input_path,
    output_path,
)

if app_id:
    # Wait for the job to reach a final state, then fetch its output
    monitor_job_progress(resourcemanager_host, resourcemanager_port, app_id)
    retrieve_job_output(resourcemanager_host, resourcemanager_port, app_id, output_path)
```



8. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, set resource requirements, and track resource usage during job execution.




```
import requests
import time

def submit_hadoop_job(resourcemanager_host, resourcemanager_port, job_name, jar_path, main_class, input_path, output_path, memory_mb, vcores):
    """Submit an application to YARN with explicit resource requirements.

    Submission takes two requests: ``POST .../apps/new-application`` obtains
    an application id, then ``POST .../apps`` submits the application under
    that id.  The submit call answers 202 with no JSON body, so the id
    returned here is the one obtained in the first step.

    Args:
        resourcemanager_host: Host name of the ResourceManager.
        resourcemanager_port: Port of the ResourceManager web service.
        job_name: Name shown for the application.
        jar_path: Path of the job jar used in the launch command.
        main_class: Main class used in the launch command.
        input_path: Input path passed to the job.
        output_path: Output path passed to the job.
        memory_mb: Memory requested for the application master, in MB.
        vcores: Virtual cores requested for the application master.

    Returns:
        The application id on success, otherwise ``None``.
    """
    url = f"http://{resourcemanager_host}:{resourcemanager_port}/ws/v1/cluster/apps"
    headers = {'Content-Type': 'application/json'}

    # Step 1: reserve an application id.
    new_app_response = requests.post(f"{url}/new-application", headers=headers)
    if new_app_response.status_code != 200:
        print("Failed to submit the job.")
        return None
    app_id = new_app_response.json().get('application-id')
    if not app_id:
        print("Failed to submit the job.")
        return None

    # Step 2: submit the application context.  The fields sit at the top
    # level of the JSON body, and the resource request belongs beside
    # "am-container-spec", not inside it.
    payload = {
        "application-id": app_id,
        "application-name": job_name,
        "am-container-spec": {
            "commands": {
                "command": f"yarn jar {jar_path} {main_class} {input_path} {output_path}"
            }
        },
        "max-app-attempts": 1,
        "resource": {
            "memory": memory_mb,
            "vCores": vcores
        }
    }

    response = requests.post(url, headers=headers, json=payload)
    if response.status_code != 202:
        print("Failed to submit the job.")
        return None

    print(f"Job submitted successfully. Application ID: {app_id}")
    return app_id

def track_resource_usage(resourcemanager_host, resourcemanager_port, app_id):
    """Poll the ResourceManager and print the job's state and allocation.

    State and allocation are read from the application resource
    (``/ws/v1/cluster/apps/{app_id}``): ``state``, ``allocatedMB`` and
    ``allocatedVCores``.  The app-attempts resource does not carry these
    values.  Polling repeats every five seconds until the state is FINISHED,
    FAILED or KILLED, or a request does not return HTTP 200.

    Args:
        resourcemanager_host: Host name of the ResourceManager.
        resourcemanager_port: Port of the ResourceManager web service.
        app_id: Id of the application to watch.
    """
    url = f"http://{resourcemanager_host}:{resourcemanager_port}/ws/v1/cluster/apps/{app_id}"
    headers = {'Content-Type': 'application/json'}

    while True:
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print("Failed to get job status.")
            break

        app = response.json()['app']
        app_status = app['state']
        # .get() keeps the loop alive if an allocation field is missing.
        allocated_memory_mb = app.get('allocatedMB')
        allocated_vcores = app.get('allocatedVCores')
        print(f"Job status: {app_status}")
        print(f"Allocated memory: {allocated_memory_mb} MB")
        print(f"Allocated vCores: {allocated_vcores}")
        print("-" * 30)
        if app_status in ("FINISHED", "FAILED", "KILLED"):
            break

        time.sleep(5)

# ResourceManager web service to talk to
resourcemanager_host = 'localhost'
resourcemanager_port = 8088

# Description of the job to run
job_name = 'MyJob'
jar_path = '/path/to/job.jar'
main_class = 'com.example.MyJobClass'
input_path = '/path/to/input'
output_path = '/path/to/output'

# Resources requested for the job
memory_mb = 1024
vcores = 1

# Hand the job to YARN; a falsy id means submission failed
app_id = submit_hadoop_job(
    resourcemanager_host,
    resourcemanager_port,
    job_name,
    jar_path,
    main_class,
    input_path,
    output_path,
    memory_mb,
    vcores,
)

if app_id:
    # Report state and allocation until the job reaches a final state
    track_resource_usage(resourcemanager_host, resourcemanager_port, app_id)

```



9. Write a Python program that compares the performance of a MapReduce job with different input split sizes, showcasing the impact on overall job execution time.



```
from mrjob.job import MRJob
import time

class MyMapReduceJob(MRJob):
    """Line-counting job whose input split size is set with ``--split-size``."""

    def configure_args(self):
        """Register the ``--split-size`` option (in MB, default 64)."""
        super().configure_args()
        self.add_passthru_arg('--split-size', type=int, default=64, help='Input split size in MB')

    def jobconf(self):
        """Return the Hadoop jobconf with the requested split size applied.

        Without this override ``--split-size`` is parsed but never reaches
        Hadoop, so every run would use the same splits.
        """
        conf = dict(super().jobconf())
        split_bytes = str(self.options.split_size * 1024 * 1024)
        conf['mapreduce.input.fileinputformat.split.minsize'] = split_bytes
        conf['mapreduce.input.fileinputformat.split.maxsize'] = split_bytes
        return conf

    def mapper(self, _, line):
        """Emit ``(stripped line, 1)``; replace with the real mapper logic."""
        yield line.strip(), 1

    def reducer(self, key, values):
        """Sum the values of each key; replace with the real reducer logic."""
        yield key, sum(values)

if __name__ == '__main__':
    # Input file shared by every run
    input_data = '/path/to/input/data.txt'

    # Split sizes (in MB) to compare
    split_sizes = [64, 128, 256, 512]

    for split_size in split_sizes:
        start_time = time.time()

        # Build and run one job configured with this split size
        job = MyMapReduceJob(args=[input_data, '--split-size', str(split_size)])
        with job.make_runner() as runner:
            runner.run()

        execution_time = time.time() - start_time

        print(f"Split Size: {split_size} MB")
        print(f"Execution Time: {execution_time} seconds")
        print("-" * 30)
```

