In [1]:
import os
import argparse
import glob
from azure.storage.blob import BlobServiceClient

def download_files_from_blob_storage(connection_string, folder_name, local_folder_path, container_name):
    try:
        print("Creating BlobServiceClient...")
        blob_service_client = BlobServiceClient.from_connection_string(connection_string)
        print("BlobServiceClient created successfully.")

        print("Creating ContainerClient...")
        container_client = blob_service_client.get_container_client(container_name)
        print("ContainerClient created successfully.")

        print("Listing blobs in the container...")
        blob_list = container_client.list_blobs(name_starts_with=folder_name)
        # for blob in blob_list:
        #     print(blob)
        # print(f"Found {len(blob_list)} blobs in the container.")

        print("Downloading files from Azure Blob Storage...")
        for blob in blob_list:
            blob_name = blob.name
            blob_client = container_client.get_blob_client(blob_name)
            download_file_path = os.path.join(local_folder_path, blob_name.replace("/", "\\"))
            os.makedirs(os.path.dirname(download_file_path), exist_ok=True)
            with open(download_file_path, "wb") as download_file:
                download_file.write(blob_client.download_blob().readall())
            print(f"Downloaded {blob_name} to {download_file_path}")

        print("Files downloaded from Azure Blob Storage.")

        print("Combining all text files into a single file named corpus.txt...")
        corpus_file_path = os.path.join(local_folder_path, "corpus.txt")
        with open(corpus_file_path, "w") as corpus_file:
            for text_file in glob.glob(os.path.join(local_folder_path, "*.txt")):
                with open(text_file, "r") as file:
                    corpus_file.write(file.read())
        print("All text files combined into corpus.txt.")

    except Exception as e:
        print(f"An error occurred: {e}")

# if __name__ == "__main__":
#     parser = argparse.ArgumentParser(description='Download files from Azure Blob Storage and combine text files.')
#     parser.add_argument('--connection-string', type=str, required=True, help='Azure Blob Storage connection string')
#     parser.add_argument('--folder-name', type=str, required=True, help='Folder name in Azure Blob Storage to download files from')
#     parser.add_argument('--local-folder_path', default="corpus", type=str, help='Local folder path to download files to')
#     parser.add_argument('--container-name', default="data-corpus", type=str, help='Azure Blob Storage container name')
# python download_dataset.py \
#     --connection-string "DefaultEndpointsProtocol=https;AccountName=llmaodatastore;AccountKey=J+go6cxXfi7v4Exq2ggINtxr13JICvHBoPjuKmGN5R0Ly6palZsO1RYiSOjvigxwx3jc1rOVPCI8+AStU/dpYQ==;EndpointSuffix=core.windows.net" \
#     --folder-name "2024_03_09_18_19_21_Archive" \
#     --local-folder_path "./corpus" \
#     --container-name "data-corpus"
# #     args = parser.parse_args()

download_files_from_blob_storage("DefaultEndpointsProtocol=https;AccountName=llmaodatastore;AccountKey=J+go6cxXfi7v4Exq2ggINtxr13JICvHBoPjuKmGN5R0Ly6palZsO1RYiSOjvigxwx3jc1rOVPCI8+AStU/dpYQ==;EndpointSuffix=core.windows.net"
                                 , "2024_03_09_18_19_21_Archive", 
                                 "corpus", 
                                 "data-corpus")

Creating BlobServiceClient...
BlobServiceClient created successfully.
Creating ContainerClient...
ContainerClient created successfully.
Listing blobs in the container...
Downloading files from Azure Blob Storage...
Downloaded 2024_03_09_18_19_21_Archive to corpus/2024_03_09_18_19_21_Archive
Downloaded 2024_03_09_18_19_21_Archive/code to corpus/2024_03_09_18_19_21_Archive\code
Downloaded 2024_03_09_18_19_21_Archive/code/LH.txt to corpus/2024_03_09_18_19_21_Archive\code\LH.txt
Downloaded 2024_03_09_18_19_21_Archive/code/Log.txt to corpus/2024_03_09_18_19_21_Archive\code\Log.txt
Downloaded 2024_03_09_18_19_21_Archive/code/Log_1.txt to corpus/2024_03_09_18_19_21_Archive\code\Log_1.txt
Downloaded 2024_03_09_18_19_21_Archive/code/Log_2.txt to corpus/2024_03_09_18_19_21_Archive\code\Log_2.txt
Downloaded 2024_03_09_18_19_21_Archive/code/data.txt to corpus/2024_03_09_18_19_21_Archive\code\data.txt
Downloaded 2024_03_09_18_19_21_Archive/code/placeholder to corpus/2024_03_09_18_19_21_Archive\code

In [5]:
import os

def parse_and_extract_data(folder_path, output_file="data/extracted_data.txt"):
    # Open the output file in append mode
    with open(output_file, "a", encoding="utf-8") as output_file:

        # Iterate through each file in the folder
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)

            # Check if the file is a text file
            if file_name.endswith(".txt"):

                # Read the content of the file
                with open(file_path, "r", encoding="utf-8") as input_file:
                    file_content = input_file.read().strip()

                # Check if the document is not empty
                if file_content:
                    if file_content!="hello world":
                        # Write the extracted text to the output file
                        output_file.write(file_content + "\n")

                    
                    
# Example usage:
corpus_folder_path = "corpus"
parse_and_extract_data(corpus_folder_path)
