<a href="https://colab.research.google.com/github/ancgate/cis4400-summer-2025/blob/main/pythonscripts/extract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install azure-storage-blob
!pip install pymongo

In [None]:
import os
import tempfile

import pandas as pd
import pymongo
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from pymongo import MongoClient

## Group 1

In [None]:
# Load the Group 1 settings from the JSON configuration file
import json

with open('configg1.json') as config_handle:
    config = json.load(config_handle)

# Settings describing the destination
(
    DESTINATION_CONNECTION_STRING,
    DESTINATION_CONTAINER_NAME,
    DESTINATION_FILE_NAME,
) = (
    config['DESTINATION_CONNECTION_STRING'],
    config['DESTINATION_CONTAINER_NAME'],
    config['DESTINATION_FILE_NAME'],
)

# Settings describing the MongoDB source
(
    MONGODB_CONNECTION_STRING,
    MONGO_DB_NAME,
    MONGO_COLLECTION_NAME,
) = (
    config['MONGODB_CONNECTION_STRING'],
    config['MONGO_DB_NAME'],
    config['MONGO_COLLECTION_NAME'],
)

# Show which collection was configured
print(MONGO_COLLECTION_NAME)

In [None]:
# Connect to MongoDB and verify that the deployment is reachable before
# any later cell tries to read from it.
client = MongoClient(MONGODB_CONNECTION_STRING)
try:
    # The ping command forces a round trip to the server.
    client.admin.command('ping')
except Exception as e:
    # Report the failure, then re-raise: continuing would only postpone the
    # error to the first real query in a later cell.
    print(e)
    raise
else:
    print("Pinged your deployment. You successfully connected to MongoDB!")

# Handles for the configured database and collection
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]
print(collection)

In [None]:
# Read the MongoDB collection, write it to a local CSV file, and upload that
# file to Azure Blob Storage.

# Retrieve all documents from the collection
mongo_data = list(collection.find())

# An empty result may simply mean the configured collection name is wrong.
# Stop here so that an empty CSV does not overwrite an existing destination blob.
if not mongo_data:
    raise ValueError(
        f"MongoDB collection '{MONGO_COLLECTION_NAME}' returned no documents; nothing to upload."
    )

# Convert the documents to a pandas DataFrame so they can be written as CSV
df = pd.DataFrame(mongo_data)

# Save the data locally as CSV (without the DataFrame index column)
local_file_name = "mongo_collection_data.csv"
df.to_csv(local_file_name, index=False)

# Upload to Azure Blob Storage
blob_service_client = BlobServiceClient.from_connection_string(DESTINATION_CONNECTION_STRING)
container_client = blob_service_client.get_container_client(DESTINATION_CONTAINER_NAME)

with open(local_file_name, "rb") as data:
    # overwrite=True replaces any blob that already has this name
    blob_client = container_client.upload_blob(name=DESTINATION_FILE_NAME, data=data, overwrite=True)

print(f"Data from MongoDB collection '{MONGO_COLLECTION_NAME}' uploaded to Azure Blob '{DESTINATION_FILE_NAME}' in container '{DESTINATION_CONTAINER_NAME}'.")

## Group 2

In [None]:
# Read the Group 2 settings from the JSON config file
import json
with open('configg2.json') as config_file:
    config = json.load(config_file)

# read information about the source
SOURCE_CONNECTION_STRING = config['SOURCE_CONNECTION_STRING']
SOURCE_CONTAINER_NAME = config['SOURCE_CONTAINER_NAME']
SOURCE_FILE_NAME = config['SOURCE_FILE_NAME']

# read information about the destination
DESTINATION_CONNECTION_STRING = config['DESTINATION_CONNECTION_STRING']
DESTINATION_CONTAINER_NAME = config['DESTINATION_CONTAINER_NAME']
DESTINATION_FILE_NAME = config['DESTINATION_FILE_NAME']

# Confirm the config loaded by printing a non-secret value. Never print the
# connection strings: the output is saved with the notebook and would expose
# the storage credentials.
print(DESTINATION_CONTAINER_NAME)

In [None]:
# Copy one blob from the source storage account to the destination storage
# account, staging it in a local file.

# Local staging file, used for both the download and the upload below
LOCAL_DOWNLOAD_PATH = "downloaded_file.csv"

# Instantiate BlobServiceClient for the source
source_blob_service_client = BlobServiceClient.from_connection_string(SOURCE_CONNECTION_STRING)

# Get a client for the source blob
source_blob_client = source_blob_service_client.get_blob_client(container=SOURCE_CONTAINER_NAME, blob=SOURCE_FILE_NAME)

# Download the blob to the local staging file. readinto() writes the content
# to the open file as it arrives instead of first building the whole blob in
# memory the way readall() does.
with open(LOCAL_DOWNLOAD_PATH, "wb") as my_blob:
    download_stream = source_blob_client.download_blob()
    download_stream.readinto(my_blob)

# Instantiate BlobServiceClient for the destination
destination_blob_service_client = BlobServiceClient.from_connection_string(DESTINATION_CONNECTION_STRING)

# Get a client for the destination container
destination_container_client = destination_blob_service_client.get_container_client(DESTINATION_CONTAINER_NAME)

# Upload the downloaded file to the destination, replacing any existing blob
with open(LOCAL_DOWNLOAD_PATH, "rb") as data:
    destination_blob_client = destination_container_client.upload_blob(name=DESTINATION_FILE_NAME, data=data, overwrite=True)

print(f"Downloaded file uploaded to Azure Blob '{DESTINATION_FILE_NAME}' in container '{DESTINATION_CONTAINER_NAME}'.")


## Group 3

In [None]:
# Load the Group 3 settings from the JSON configuration file
import json

with open('configg3.json') as config_handle:
    config = json.load(config_handle)

# Settings describing the source
(
    SOURCE_CONNECTION_STRING,
    SOURCE_CONTAINER_NAME,
    SOURCE_FILE_NAME,
) = (
    config['SOURCE_CONNECTION_STRING'],
    config['SOURCE_CONTAINER_NAME'],
    config['SOURCE_FILE_NAME'],
)

# Settings describing the destination
(
    DESTINATION_CONNECTION_STRING,
    DESTINATION_CONTAINER_NAME,
    DESTINATION_FILE_NAME,
) = (
    config['DESTINATION_CONNECTION_STRING'],
    config['DESTINATION_CONTAINER_NAME'],
    config['DESTINATION_FILE_NAME'],
)

# Show which destination container was configured
print(DESTINATION_CONTAINER_NAME)

In [None]:
# Copy every blob from the source container to the destination container,
# keeping each blob's name. Relies on SOURCE_CONNECTION_STRING,
# SOURCE_CONTAINER_NAME, DESTINATION_CONNECTION_STRING and
# DESTINATION_CONTAINER_NAME having been loaded from the config file.

# Instantiate BlobServiceClient for the source
source_blob_service_client = BlobServiceClient.from_connection_string(SOURCE_CONNECTION_STRING)

# Get a client for the source container
source_container_client = source_blob_service_client.get_container_client(SOURCE_CONTAINER_NAME)

# The destination clients are the same for every blob, so build them once
# instead of on every loop iteration.
destination_blob_service_client = BlobServiceClient.from_connection_string(DESTINATION_CONNECTION_STRING)
destination_container_client = destination_blob_service_client.get_container_client(DESTINATION_CONTAINER_NAME)

# List all blobs in the source container
blob_list = source_container_client.list_blobs()

# Stage downloads in a throwaway directory rather than the working directory:
# a blob whose base name matches a local file (for example the config file)
# would otherwise overwrite that file and then delete it.
with tempfile.TemporaryDirectory() as staging_dir:
    # Loop through each blob in the source container
    for blob in blob_list:
        source_blob_name = blob.name
        print(f"Processing blob: {source_blob_name}")

        # Keep only the part after the last forward slash. A name ending in
        # '/' leaves nothing, so fall back to a placeholder file name.
        local_file_name = source_blob_name.split('/')[-1] or "unnamed_blob"
        local_file_path = os.path.join(staging_dir, local_file_name)

        try:
            print(f"Downloading blob: {source_blob_name}")

            # Download the blob to the staged local file; readinto() writes to
            # the file as data arrives instead of buffering the whole blob.
            source_blob_client = source_blob_service_client.get_blob_client(container=SOURCE_CONTAINER_NAME, blob=source_blob_name)
            with open(local_file_path, "wb") as my_blob:
                download_stream = source_blob_client.download_blob()
                download_stream.readinto(my_blob)

            print(f"Uploading blob: {source_blob_name}")

            # Upload under the same blob name, replacing any existing blob
            with open(local_file_path, "rb") as data:
                destination_blob_client = destination_container_client.upload_blob(name=source_blob_name, data=data, overwrite=True)

            print(f"Uploaded blob '{source_blob_name}' to destination as '{source_blob_name}'.")
        finally:
            # Clean up the staged file even when the download or upload failed
            if os.path.exists(local_file_path):
                os.remove(local_file_path)
                print(f"Removed local file: {local_file_name}")

print("Finished processing all blobs.")