Import the required libraries

In [97]:
#!pip install pandas

In [98]:
import pandas as pd
import os
import datetime
import subprocess
import hashlib

In [99]:
# Root directory on the local file system: it is scanned for source files
# and is also where the 'archive' folder is created.
local_path = "/data/Spark_project"

#### Create the directories in HDFS (if they do not already exist)

In [100]:
def create_hdfs_directory(*paths):
    """Create each given directory in HDFS via ``hdfs dfs -mkdir -p``.

    Parameters
    ----------
    *paths : str
        HDFS directory paths to create. ``-p`` is passed to the CLI, so
        missing parents are created and existing directories are not an error.

    Returns
    -------
    None
        The outcome for every path is reported with ``print``. A non-zero
        exit status of the CLI is reported and does not stop the loop.
    """
    for hdfs_dir in paths:
        mkdir_cmd = ["hdfs", "dfs", "-mkdir", "-p", hdfs_dir]
        try:
            # check=True turns a non-zero exit status into CalledProcessError.
            subprocess.run(mkdir_cmd, check=True)
        except subprocess.CalledProcessError as err:
            print(f"Failed to create directory {hdfs_dir} in HDFS. Error: {err}")
        else:
            print(f"Directory {hdfs_dir} created successfully in HDFS.")

# Make sure the three target directories of the data lake exist in HDFS
# (one per kind of file: branches, sales agents, sales transactions).
create_hdfs_directory("/Spark_Project/data/Q_company/branches", 
                     "/Spark_Project/data/Q_company/sales_agents",
                     "/Spark_Project/data/Q_company/sales_transactions")

Directory /Spark_Project/data/Q_company/branches created successfully in HDFS.
Directory /Spark_Project/data/Q_company/sales_agents created successfully in HDFS.
Directory /Spark_Project/data/Q_company/sales_transactions created successfully in HDFS.


Create a list of unique files to be uploaded to the Data Lake

In [101]:
def caculate_checksum(file_path, chunk_size = 4096):
    """Return the MD5 digest of a file as a hexadecimal string.

    Parameters
    ----------
    file_path : str
        Path of the file to hash.
    chunk_size : int, optional
        Number of bytes read per iteration (default 4096). The file is
        consumed piecewise so it never has to fit in memory at once.

    Returns
    -------
    str
        Hexadecimal MD5 digest of the file content.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        # iter() with a sentinel keeps reading until read() yields b"" (EOF).
        for block in iter(lambda: stream.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

In [140]:
def get_files(path):
    """Collect the files under ``path`` that have distinct content.

    Directories named ``archive`` and their direct sub-directories are
    skipped. Every remaining file is hashed with ``caculate_checksum``; when
    two files share a checksum, the first one encountered is kept and the
    later one is reported with ``print`` and ignored.

    Parameters
    ----------
    path : str
        Root directory to walk.

    Returns
    -------
    list of str
        Paths of the kept files, in the order they were first encountered.
    """
    unique_by_checksum = {}
    for current_dir, _subdirs, names in os.walk(path):
        # Skip <...>/archive itself as well as <...>/archive/<group>.
        enclosing_names = (
            os.path.basename(current_dir),
            os.path.basename(os.path.dirname(current_dir)),
        )
        if "archive" in enclosing_names:
            continue
        for name in names:
            candidate = os.path.join(current_dir, name)
            digest = caculate_checksum(candidate)
            if digest not in unique_by_checksum:
                unique_by_checksum[digest] = candidate
                continue
            # Same content as a file that was already registered.
            print(f"{name} is duplicated!\nIt is the same as the file: {unique_by_checksum[digest]}")
            print("File Ignored")
            print("--" * 20)
    return list(unique_by_checksum.values())
# Collect the unique, not-yet-archived files found under local_path.
files = get_files(local_path)

Add Quality Columns to the Data

In [111]:
def get_hdfs_path(local_source):
    """Build the HDFS destination path for a local source file.

    The target directory is chosen by the first keyword found in the
    lower-cased file name, checked in this order: 'branch', 'agent',
    'transaction'. The HDFS file name is the name of the local parent
    directory and the local file name joined with an underscore
    (presumably to keep equally named files from different directories apart).

    Parameters
    ----------
    local_source : str
        Path of the file on the local file system.

    Returns
    -------
    str or None
        Destination path in HDFS, or None when the file name contains none
        of the known keywords (a message is printed in that case).
    """
    file_name = os.path.basename(local_source)
    parent_directory = os.path.basename(os.path.dirname(local_source))
    hdfs_file_name = parent_directory + '_' + file_name

    # Checked in order; the first keyword contained in the name wins.
    keyword_to_directory = (
        ('branch', "/Spark_Project/data/Q_company/branches"),
        ('agent', "/Spark_Project/data/Q_company/sales_agents"),
        ('transaction', "/Spark_Project/data/Q_company/sales_transactions"),
    )

    lowered_name = file_name.lower()
    for keyword, hdfs_directory in keyword_to_directory:
        if keyword in lowered_name:
            # HDFS paths always use '/', whatever the local OS separator is.
            return f"{hdfs_directory}/{hdfs_file_name}"

    # No keyword matched: report it and return None explicitly. Falling
    # through to a join on an unbound directory would raise UnboundLocalError.
    print("Unknown File!!")
    return None

In [107]:
def upload_file(local_source):
    """Copy a local file to its HDFS destination with ``hdfs dfs -put -f``.

    The destination is obtained from ``get_hdfs_path``; ``-f`` overwrites a
    file that already exists at that location.

    Parameters
    ----------
    local_source : str
        Path of the file on the local file system.

    Returns
    -------
    bool
        True when the copy command succeeded; False when no destination
        could be determined or the command exited with a non-zero status.
    """
    target = get_hdfs_path(local_source)
    if target is None:
        return False

    put_cmd = ["hdfs", "dfs", "-put", "-f", local_source, target]
    try:
        # check=True turns a non-zero exit status into CalledProcessError.
        subprocess.run(put_cmd, check=True)
    except subprocess.CalledProcessError as err:
        print(f"Failed to copy {local_source} to {target} in HDFS. Error: {err}")
        return False
    else:
        print(f"Successfully copied {local_source} to {target} in HDFS.")
        return True

In [129]:
def archive_file(path):
    """Move a local file into ``<local_path>/archive/<parent directory name>``.

    The archive root and the per-group sub-directory (named after the file's
    parent directory) are created when missing. The move itself is delegated
    to the ``mv`` command.

    Parameters
    ----------
    path : str
        Path of the file to archive.

    Returns
    -------
    None
        Progress and failures are reported with ``print``. A non-zero exit
        status of ``mv`` is reported, not raised.
    """
    archive_root = os.path.join(local_path, 'archive')

    # Make sure the archive root exists and report which case applied.
    if os.path.exists(archive_root):
        print(f"Folder '{archive_root}' already exists.")
    else:
        os.makedirs(archive_root)
        print(f"Folder '{archive_root}' created successfully.")

    # Files are grouped in the archive by the name of their parent directory.
    group_name = os.path.basename(os.path.dirname(path))
    group_dir = os.path.join(archive_root, group_name)
    if not os.path.exists(group_dir):
        os.makedirs(group_dir)

    try:
        subprocess.run(["mv", path, group_dir], check=True)
    except subprocess.CalledProcessError as err:
        print(f"Failed to move {path} to Archive. Error: {err}")
    else:
        print(f"Successfully moved {path} to Archive.")

In [126]:
def move_files(files):
    """Tag each CSV file with lineage columns, upload it and archive it.

    For every path the CSV is read, three columns are added (``load_time``,
    ``source`` and ``source_path``) and the result is written back to the
    same local path, overwriting the original file. The file is then passed
    to ``upload_file``, and to ``archive_file`` only if the upload succeeded.

    Parameters
    ----------
    files : iterable of str
        Paths of the local CSV files to process.

    Returns
    -------
    None
    """
    for csv_path in files:
        # The timestamp is taken after the file has been read, once per file.
        tagged = pd.read_csv(csv_path).assign(
            load_time=str(datetime.datetime.now()),
            source='Local File System',
            source_path=csv_path,
        )
        # Overwrites the local source so the uploaded copy carries the new columns.
        tagged.to_csv(csv_path, index=False)

        if upload_file(csv_path):
            archive_file(csv_path)

Move the data to the Data Lake

In [130]:
# Tag, upload and archive every file collected in `files`.
move_files(files)

Successfully copied /data/Spark_project/group1/branches_SS_raw_1.csv to /Spark_Project/data/Q_company/branches/group1_branches_SS_raw_1.csv in HDFS.
Folder '/data/Spark_project/archive' already exists.
Successfully moved /data/Spark_project/group1/branches_SS_raw_1.csv to Archive.
Successfully copied /data/Spark_project/group1/sales_agents_SS_raw_1.csv to /Spark_Project/data/Q_company/sales_agents/group1_sales_agents_SS_raw_1.csv in HDFS.
Folder '/data/Spark_project/archive' already exists.
Successfully moved /data/Spark_project/group1/sales_agents_SS_raw_1.csv to Archive.
Successfully copied /data/Spark_project/group1/sales_transactions_SS_raw_1.csv to /Spark_Project/data/Q_company/sales_transactions/group1_sales_transactions_SS_raw_1.csv in HDFS.
Folder '/data/Spark_project/archive' already exists.
Successfully moved /data/Spark_project/group1/sales_transactions_SS_raw_1.csv to Archive.


In [136]:
# Display the collected file paths.
# NOTE(review): the saved output lists paths under archive/, which get_files
# as defined in this notebook skips; the output may come from an earlier
# version of get_files. Re-run on a fresh kernel to confirm.
files

['/data/Spark_project/archive/group1/branches_SS_raw_1.csv',
 '/data/Spark_project/archive/group1/sales_agents_SS_raw_1.csv',
 '/data/Spark_project/archive/group1/sales_transactions_SS_raw_1.csv']

<hr>