In [55]:
from dotenv import load_dotenv
import os
load_dotenv()
# Storage location settings, read from the process environment
# (load_dotenv() above has already merged in the local .env file).
storage_account = os.environ.get("AZURE_STORAGE_ACCOUNT")
container_name = os.environ.get('AZURE_CONTAINER_NAME')
conn_string_value = os.environ.get('AZURE_DATA_LAKE_CONNECTION_STRING')

# Client-secret credentials; each is None when the variable is not set.
adls_tenant_id = os.environ.get("ADLS_TENANT_ID")
adls_client_id = os.environ.get("ADLS_CLIENT_ID")
adls_client_secret = os.environ.get("ADLS_CLIENT_SECRET")

In [5]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [56]:
from azure.identity import ClientSecretCredential
from azure.storage.blob import BlobServiceClient

# Blob-name prefix to list under, read from the environment.
directory_path = os.getenv('AZURE_DIRECTORY_PATH')

# Client-secret credential built from the values loaded from the environment.
credential = ClientSecretCredential(
    tenant_id=adls_tenant_id,
    client_id=adls_client_id,
    client_secret=adls_client_secret,
)

# Derive the account URL from AZURE_STORAGE_ACCOUNT instead of hard-coding it.
# The previous hard-coded account is kept as a fallback so an environment that
# never set the variable keeps working.
account_name = storage_account or "myvisekendatalake"
service_client = BlobServiceClient(
    account_url=f"https://{account_name}.blob.core.windows.net",
    credential=credential,
)

# Use the client created in this cell; `blob_service_client` is only defined
# by later cells, so referencing it here fails on a fresh kernel.
container_client = service_client.get_container_client(container_name)

# List the blobs under the prefix once and reuse the result below.
blob_list = list(container_client.list_blobs(name_starts_with=directory_path))

# Keep only .csv / .parquet blob names (case-insensitive match on the suffix).
files = [
    blob.name
    for blob in blob_list
    if blob.name.lower().endswith(('.csv', '.parquet'))
]

In [77]:
import io
import pandas as pd


# Loop-invariant setup, done once instead of once per file.
blob_service_client = BlobServiceClient.from_connection_string(conn_string_value)
container_name = os.getenv('AZURE_CONTAINER_NAME')
processing_date = pd.Timestamp.now().strftime('%Y%m%d')

# Print every blob whose name contains today's date stamp (YYYYMMDD).
for blob_name in files:
    if processing_date in blob_name:
        print(blob_name)

raw/carlist/city_20250120.parquet
raw/carlist/myvi_20250120.parquet
raw/carlist/vios_20250120.parquet


In [91]:
from azure.storage.blob import BlobServiceClient
import pandas as pd

import io

# Connect with the connection string and point at one specific blob.
blob_service_client = BlobServiceClient.from_connection_string(conn_string_value)
container_name = os.environ.get('AZURE_CONTAINER_NAME')
blob_name = "raw/carlist/vios_20250120.parquet"  # hard-coded file, date included
blob_client = blob_service_client.get_blob_client(
    container=container_name,
    blob=blob_name,
)

# Read the complete blob content into memory.
downloader = blob_client.download_blob()
stream = downloader.readall()

# Parse the downloaded bytes as parquet and preview the first rows.
buffer = io.BytesIO(stream)
df = pd.read_parquet(buffer)
print(df.head())

  model                                              title    year   color  \
0  Vios                         2016 Toyota Vios 1.5 Sedan  2016.0  Silver   
1  Vios  2013 Toyota Vios 1.5 G ONE OWNER VERY LOW MILEAGE  2013.0  Silver   
2  Vios                       2022 Toyota Vios 1.5 G Sedan  2022.0     Red   
3  Vios                         2019 Toyota Vios 1.5 Sedan  2019.0  Silver   
4  Vios                         2019 Toyota Vios 1.5 Sedan  2019.0   White   

    mileage    price  seller          location       state  \
0  119592.0  43000.0  Person  Kuala Terengganu  Terengganu   
1  102500.0  29800.0  Person            Cheras    Selangor   
2   72500.0  69800.0  Person            Cheras    Selangor   
3   77747.0  58900.0  Person           Kuantan      Pahang   
4   77914.0  59400.0  Person            Melaka      Melaka   

                                                 url  \
0  https://www.carlist.my/used-cars/2016-toyota-v...   
1  https://www.carlist.my/used-cars/2013-toyot

In [92]:
from sqlalchemy.engine import create_engine
from datetime import datetime
from sqlalchemy import DateTime, String, Integer, MetaData, Table, Column, text, ForeignKey
from sqlalchemy.types import TEXT


def get_db_connection():
    """Build a SQLAlchemy engine from the database settings in the environment.

    Calls ``load_dotenv()`` and then reads ``SQL_SERVER``, ``DB_NAME``,
    ``DB_USERNAME`` and ``DB_PASSWORD``. The values are expected in raw
    (not URL-encoded) form; the username and password are percent-encoded
    here so characters such as ``@``, ``/`` or ``:`` cannot corrupt the URL.

    Note that, despite the name, this returns the engine produced by
    ``create_engine`` rather than an open connection.

    Returns:
        The engine created for
        ``mssql+pyodbc://<user>:<password>@<server>.database.windows.net:1433/<db>``
        using ODBC Driver 18 with encryption enabled.

    Raises:
        RuntimeError: if any of the four settings is missing or empty.
    """
    # Local import keeps this function self-contained within the notebook.
    from urllib.parse import quote_plus

    # Load credentials from the .env file.
    load_dotenv()
    settings = {
        name: os.getenv(name)
        for name in ("SQL_SERVER", "DB_NAME", "DB_USERNAME", "DB_PASSWORD")
    }

    # Fail early with a clear message instead of building a URL around "None".
    missing = [name for name, value in settings.items() if not value]
    if missing:
        raise RuntimeError(
            "Missing database settings in environment: " + ", ".join(missing)
        )

    server_name = settings["SQL_SERVER"]
    database_name = settings["DB_NAME"]
    username = quote_plus(settings["DB_USERNAME"])
    password = quote_plus(settings["DB_PASSWORD"])

    # Connection string using SQL login
    connection_string = (
        f"mssql+pyodbc://{username}:{password}"
        f"@{server_name}.database.windows.net:1433/{database_name}"
        "?driver=ODBC+Driver+18+for+SQL+Server&encrypt=yes&TrustServerCertificate=no&timeout=30"
    )

    # Create the SQLAlchemy engine
    engine = create_engine(connection_string)
    return engine

In [93]:
# Build the cleaned frame in one chain so the cell can be re-run safely:
# nothing is modified in place, and a repeated run finds 'model' already
# renamed and 'seller' already dropped without raising.
df = (
    df
    # First element of each 'image' entry becomes the listing image.
    .assign(listing_image=lambda frame: frame['image'].str[0])
    # Keep only rows that have a state.
    .loc[lambda frame: frame['state'].notna()]
    .rename(columns={'model': 'car_model'})
    .drop(columns=['seller'], errors='ignore')
)
df.head()

Unnamed: 0,car_model,title,year,color,mileage,price,location,state,url,image,listing_id,installment,variant,transmission,listing_image
0,Vios,2016 Toyota Vios 1.5 Sedan,2016.0,Silver,119592.0,43000.0,Kuala Terengganu,Terengganu,https://www.carlist.my/used-cars/2016-toyota-v...,[https://img1.icarcdn.com/66431751/thumb-l_use...,15713466,RM 557/month,E,Automatic,https://img1.icarcdn.com/66431751/thumb-l_used...
1,Vios,2013 Toyota Vios 1.5 G ONE OWNER VERY LOW MILEAGE,2013.0,Silver,102500.0,29800.0,Cheras,Selangor,https://www.carlist.my/used-cars/2013-toyota-v...,[https://img1.icarcdn.com/91501611/thumb-l_use...,11610519,RM 386/month,G,Automatic,https://img1.icarcdn.com/91501611/thumb-l_used...
2,Vios,2022 Toyota Vios 1.5 G Sedan,2022.0,Red,72500.0,69800.0,Cheras,Selangor,https://www.carlist.my/used-cars/2022-toyota-v...,[https://img1.icarcdn.com/37692951/thumb-l_use...,15929673,RM 905/month,G,Automatic,https://img1.icarcdn.com/37692951/thumb-l_used...
3,Vios,2019 Toyota Vios 1.5 Sedan,2019.0,Silver,77747.0,58900.0,Kuantan,Pahang,https://www.carlist.my/used-cars/2019-toyota-v...,[https://img1.icarcdn.com/39930261/thumb-l_use...,16203993,RM 764/month,E,Automatic,https://img1.icarcdn.com/39930261/thumb-l_used...
4,Vios,2019 Toyota Vios 1.5 Sedan,2019.0,White,77914.0,59400.0,Melaka,Melaka,https://www.carlist.my/used-cars/2019-toyota-v...,[https://img1.icarcdn.com/56830261/thumb-l_use...,16203865,RM 770/month,E,Automatic,https://img1.icarcdn.com/56830261/thumb-l_used...


In [94]:
import json
def insert_data(df):
    """Append ``df`` to the ``test_staging`` table with a ``processing_date`` column.

    A ``processing_date`` column holding the current ``datetime.now()`` value
    is added to a copy of ``df``; the caller's frame is left unchanged. The
    rows are written inside a single transaction that is committed on success
    and rolled back if the insert raises.

    Args:
        df: DataFrame whose rows are appended to ``test_staging``.

    Returns:
        None.
    """
    engine = get_db_connection()

    # Stamp a copy so the caller's DataFrame is not mutated as a side effect.
    staged = df.assign(processing_date=datetime.now())

    # NOTE(review): the 'image' column appears to hold list-like values; it may
    # need serialising (e.g. json.dumps) before it can be inserted - confirm.
    try:
        # engine.begin() commits when the block exits normally and rolls back
        # on an exception. Passing this connection to to_sql makes the insert
        # part of that transaction rather than a separate one.
        with engine.begin() as connection:
            staged.to_sql('test_staging', con=connection, if_exists='append', index=False)
    finally:
        # get_db_connection() builds a new engine per call; release its pool.
        engine.dispose()

# insert_data(df)