# Data Migration: SQL to postgres

In [118]:
import os
import pandas as pd
import pyodbc
import psycopg2
from psycopg2.extras import execute_values
from dotenv import load_dotenv

## 1. Load credentials

In [119]:
load_dotenv()

True

In [120]:
sql_host = os.getenv("SQL_SERVER_HOST")
sql_db = os.getenv("SQL_SERVER_DB")

In [121]:
pg_host = os.getenv("POSTGRES_HOST") 
pg_port = os.getenv("POSTGRES_PORT")
pg_db = os.getenv("POSTGRES_DB")
pg_user = os.getenv("POSTGRES_USER")
pg_password = os.getenv("POSTGRES_PASSWORD")

 
 

In [122]:
print(f"POSTGRES_HOST: {pg_host}")
print(f"POSTGRES_PORT: {pg_port}")
print(f"POSTGRES_DB: {pg_db}")
print(f"POSTGRES_USER: {pg_user}")
print(f"POSTGRES_PASSWORD: {pg_password}")

POSTGRES_HOST: localhost
POSTGRES_PORT: 5432
POSTGRES_DB: transaction_uat
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres


## 2. Connect to SQL Server


In [123]:
print ("Connecting to SQL Server...")
print (f"   Server: {sql_host}")
print (f"   Database: {sql_db}")

Connecting to SQL Server...
   Server: INTELI5SSD-LAPT\SQLEXPRESS
   Database: TransactionDB_UAT


In [124]:
try:
    sql_conn_string = (
    f"DRIVER={{ODBC Driver 17 for SQL Server}};"
    f"SERVER={sql_host};"
    f"DATABASE={sql_db};"
    f"Trusted_Connection=yes;"
    )

    sql_conn = pyodbc.connect(sql_conn_string)
    sql_cursor = sql_conn.cursor()
    print ("[SUCCESS] -> Connection to SQL Server completed ")
except Exception as e:
    print(f"SQl Server connection failed {e}")

[SUCCESS] -> Connection to SQL Server completed 


## 3. Coonect to postgres

In [125]:
print ("Connecting to Postgres...")
print (f"  Server: {pg_host}")
print (f"  Database: {pg_db}")

Connecting to Postgres...
  Server: localhost
  Database: transaction_uat


In [126]:
try:
    pg_conn = psycopg2.connect(
        host=pg_host,
        port=pg_port,
        database=pg_db,
        user=pg_user,
        password=pg_password
    )

    pg_cursor=pg_conn.cursor()
    pg_cursor.execute("SELECT version();")
    pg_version = pg_cursor.fetchone()[0]

    print ("Connected to postgres")
    print (f"   Version: {pg_version[:50]}...\n")
except psycopg2.OperationalError as e:
    print(f"  Postgres connection failed: {e}")

except Exception as e:
    print (f"unexpected error {e}")
    raise

Connected to postgres
   Version: PostgreSQL 18.1 on x86_64-windows, compiled by msv...



## Define the tables to migrate

In [127]:
tables_to_migrate = ['Categories', 'Suppliers', 'Customers', 'Products']
print(f"  these are the tables: {tables_to_migrate}")

  these are the tables: ['Categories', 'Suppliers', 'Customers', 'Products']


In [128]:
for i, table in enumerate(tables_to_migrate, 1):
    print(f"  {i}.{table}")
Total_tables_to_migrate = len(tables_to_migrate)    
print (f"\n The total number of tables to migrate :{Total_tables_to_migrate}")

  1.Categories
  2.Suppliers
  3.Customers
  4.Products

 The total number of tables to migrate :4


## Run pre-migration checks

In [129]:
print("=" * 50)
print(">>> ROW COUNTS")
print("=" * 50)

>>> ROW COUNTS


In [130]:
customers = "SELECT COUNT(*) AS TotalCustomer FROM Customers;"
sql_cursor.execute(customers)
customers_count = sql_cursor.fetchone()[0]
print(f" Total customers is: {customers_count}")

 Total customers is: 900000


In [131]:
baseline_counts = {}

try:
    for table in tables_to_migrate:
        row_count_query = f"SELECT COUNT(*) FROM {table}"
        sql_cursor.execute(row_count_query)
        count = sql_cursor.fetchone()[0]

        baseline_counts[table] = count
        print(f"{table:30} {count} rows")
    print("-" * 45)    
    total_count = sum(baseline_counts.values())
    print(f"{'Total':28} {total_count:>10,} rows")
except Exception as e:
    print (f"Failed to get baseline counts: {e}")
    raise



Categories                     8 rows
Suppliers                      5000 rows
Customers                      900000 rows
Products                       150000 rows
---------------------------------------------
Total                         1,055,008 rows


In [None]:
data_quality_issues = []
print("\nCHECK 2: NULL CHECKS (CustomerName)")
try:
    customer_query = """SELECT COUNT(*) AS null_count
                        FROM Customers 
                        WHERE CustomerName is NULL"""
    sql_cursor.execute(customer_query)
    null_customer_count= sql_cursor.fetchone()[0]
    if null_customer_count > 0:
        data_quality_issues.append(f" {null_customer_count:,} Customers with null names...")
    #print (f"Total null customer is:{data_quality_issues[0]:,}...")

    print("\nCHECK 3: INVALID EMAIL FORMATS")
    sql_cursor.execute("""SELECT COUNT(*) AS invalid_email_count
                            FROM Customers
                            WHERE Email LIKE '%@invalid'  """)
    invalid_emails = sql_cursor.fetchone()[0]
    if invalid_emails > 0:
        data_quality_issues.append(f" {invalid_emails} invalid email formats...")
        #print (data_quality_issues)

    print("\nCHECK 4: NEGATIVE PRODUCT PRICES")
    sql_cursor.execute("""SELECT COUNT(*) AS negative_price_count
                            FROM Products
                            WHERE UnitPrice < 0; """)
    negative_prices = sql_cursor.fetchone()[0]
    if negative_prices > 0:
        data_quality_issues.append(f" {negative_prices:,} Negative prices...")
        #print (data_quality_issues)

    print("\nCHECK 5: NEGATIVE STOCK QUANTITY")
    sql_cursor.execute("""SELECT COUNT(*) AS negative_stock_count
                            FROM Products
                            WHERE StockQuantity < 0; """)
    negative_stock = sql_cursor.fetchone()[0]
    if negative_stock > 0:
        data_quality_issues.append(f" {negative_stock:,} Negative stocks...")
        #print (data_quality_issues)

    print("\nCHECK 6: IDENTIFYING ORPHAN RECORDS")    
    sql_cursor.execute("""SELECT COUNT(*)
                       FROM Products prod
                       WHERE NOT EXISTS (SELECT 1
                       FROM Suppliers sup
                       WHERE sup.SupplierID=prod.SupplierID) """)
    orpthan_data = sql_cursor.fetchone()[0]
    if orpthan_data > 0:
        data_quality_issues.append(f" {orpthan_data:,} Orphan records...")
        #print (data_quality_issues)

    print("\nCHECK 7: FUTURE DATE CHECK")    
    sql_cursor.execute("""SELECT COUNT(*)AS future_date_count
                       FROM Customers
                       WHERE CreatedDate > GETDATE() """)
    future_dates = sql_cursor.fetchone()[0]
    if future_dates > 0:
        data_quality_issues.append(f" {future_dates:,} future date records...")
        #print (data_quality_issues)    

    if data_quality_issues:
        print ("\nData Quality Issues Found:")
        for quality_issues in data_quality_issues:
            print(quality_issues)
    else:
        print("No data quality issues identified")

except Exception (e):
    print(f"[ERROR]====> {e}")
    raise


CHECK 2: NULL CHECKS (CustomerName)

CHECK 3: INVALID EMAIL FORMATS

CHECK 4: NEGATIVE PRODUCT PRICES

CHECK 5: NEGATIVE STOCK QUANTITY

CHECK 6: IDENTIFYING ORPHAN RECORDS

CHECK 7: FUTURE DATE CHECK
Data Quality Issues Found:
 4,514 Customers with null names...
 8844 invalid email formats...
 775 Negative prices...
 1,467 Negative stocks...
 24,700 Orphan records...
 8,567 future date records...
