In [2]:
from dotenv import load_dotenv
import os

# Read the .env file so its entries become available through os.getenv.
load_dotenv()

# Resolve all settings in a single pass. os.getenv yields None for any
# variable that is not set, so a missing entry is not reported at this point.
(
    mongodb_username,
    mongodb_password,
    mongodb_connection_string,
    snowflake_user,
    snowflake_password,
    snowflake_account,
) = map(
    os.getenv,
    (
        'MONGODB_USERNAME',
        'MONGODB_PASSWORD',
        'MONGODB_CONNECTION_STRING',
        'SNOWFLAKE_USER',
        'SNOWFLAKE_PASSWORD',
        'SNOWFLAKE_ACCOUNT',
    ),
)


In [3]:
import snowflake.connector
from pymongo import MongoClient
import pandas as pd

# Keyword arguments later unpacked into snowflake.connector.connect():
# credentials come from the environment, the warehouse/database/schema are fixed.
snowflake_conn_params = dict(
    user=snowflake_user,
    password=snowflake_password,
    account=snowflake_account,
    warehouse='clinical_trials_warehouse',
    database='clinical_trials_db',
    schema='clinical_trials_schema',
)

# Where the extracted rows are written in MongoDB Atlas.
# NOTE(review): the database here is 'clinical_trial_db' (singular) while the
# Snowflake one is 'clinical_trials_db' (plural) - confirm this is intentional.
mongodb_conn_params = dict(
    connection_string=mongodb_connection_string,
    database='clinical_trial_db',
    collection='clinical_trial_report',
)


# Extract step: pull the whole clinical_trial table from Snowflake into `df`.
#
# The connection and the cursor each get their own try/finally so that the
# connection is still closed when creating the cursor (or the query) fails.
snowflake_conn = snowflake.connector.connect(**snowflake_conn_params)
try:
    snowflake_cur = snowflake_conn.cursor()
    try:
        # Extract data from Snowflake
        snowflake_cur.execute("SELECT * FROM clinical_trial")

        # cursor.description holds one entry per result column; its first
        # item is the column name, which becomes the DataFrame header.
        column_names = [col[0] for col in snowflake_cur.description]
        df = pd.DataFrame(snowflake_cur.fetchall(), columns=column_names)

        # Transform: no transformation is applied yet; rows are loaded as extracted.
    finally:
        # Release the cursor before the connection that owns it.
        snowflake_cur.close()
finally:
    snowflake_conn.close()




In [4]:
# Preview the first five extracted rows as a sanity check on the columns.
df.head(n=5)

Unnamed: 0,nctid,target_label,number_collaborators,number_officials,num_arms,number_countries,number_sites,no_elig_req,inclusion_lines,exclusion_lines,...,90,91,92,93,94,95,96,97,98,99
0,NCT00005943,0,1,1,0.0,1,1,0,0,0,...,-0.493583,-0.057928,2.48544,1.653217,1.189049,-0.978146,0.187826,5.0299,-0.637591,1.37859
1,NCT00549822,1,2,1,1.0,1,2,0,12,10,...,0.233789,-1.127137,1.152212,0.207231,-0.408253,-2.06163,-2.944832,2.566082,1.306969,0.254373
2,NCT02105480,0,0,1,0.0,1,1,0,2,1,...,-0.052138,-1.899454,2.748823,0.944922,-0.434395,1.071502,-0.891071,0.438525,1.770845,-0.296455
3,NCT01931956,0,0,2,4.0,1,38,0,21,36,...,0.591035,-1.141933,0.246145,2.047257,-1.056454,0.19748,-1.590381,0.00678,-2.048953,3.504484
4,NCT01547364,0,0,1,2.0,1,1,0,7,1,...,1.650844,1.102681,0.090722,1.60617,2.378569,-0.822739,-1.905433,0.803081,3.946472,-2.106883


In [5]:
# Convert the frame to a list with one {column: value} dict per row, then
# display how many rows will be sent to MongoDB.
records = df.to_dict(orient='records')
len(records)

68999

In [5]:
# Load step: write every extracted row into the target MongoDB collection.

# Fail fast when the connection string is missing: MongoClient(None) does not
# raise, it falls back to the default local host, which would send the data
# to the wrong server without any error.
connection_string = mongodb_conn_params['connection_string']
if not connection_string:
    raise ValueError(
        "MONGODB_CONNECTION_STRING is not set; refusing to fall back to the "
        "default local MongoDB host."
    )

# One document per DataFrame row, keyed by column name.
records = df.to_dict('records')

# Connect to MongoDB
mongo_client = MongoClient(connection_string)
try:
    mongo_db = mongo_client[mongodb_conn_params['database']]
    mongo_collection = mongo_db[mongodb_conn_params['collection']]

    # insert_many rejects an empty document list, so skip the call when the
    # extract returned no rows.
    # NOTE(review): re-running this cell appears to insert the same rows
    # again - confirm whether the collection should be cleared or upserted.
    if records:
        mongo_collection.insert_many(records)
finally:
    # Close the MongoDB connection even when the insert fails.
    mongo_client.close()

if records:
    print("ETL process completed successfully.")
else:
    print("No rows were extracted; nothing was written to MongoDB.")