### Imports

In [None]:
from concurrent.futures import ThreadPoolExecutor, TimeoutError
from hfsql_guide.settings.credentials import dsn, user, passwd
from hfsql_guide.settings.paths import DATA_DIR, PARQUET_DIR
from pathlib import Path
import pandas as pd
import threading
import pypyodbc
import warnings
import sys

# Text file that collects the tables whose export did not succeed.
FAILED_LOG = Path(DATA_DIR) / "failed_tables.txt"
# Upper bound, in seconds, on how long a single table query may run (5 minutes).
TIMEOUT_SECONDS = 300

### Code

In [5]:
# Names of the tables to export: the first column of the workbook, as a list.
tables = (
    pd.read_excel(DATA_DIR / "Tables_name.xlsx")
    .iloc[:, 0]
    .tolist()
)

In [None]:
# The DSN referenced here must already be configured in the 64-bit
# ODBC Data Source Administrator.
connection_string = f"DSN={dsn};UID={user};PWD={passwd};"

In [7]:
def log_failed(table: str, reason: str) -> None:
    """Record a table that could not be exported.

    Appends a single line of the form ``<table>  ---  <reason>`` to the
    file at ``FAILED_LOG`` (opened in append mode, UTF-8).

    Args:
        table: Name of the table whose export failed.
        reason: Short description of why it failed.
    """
    entry = f"{table}  ---  {reason}\n"
    with open(FAILED_LOG, mode="a", encoding="utf8") as log_file:
        log_file.write(entry)

def run_query_with_timeout(query: str, conn, timeout: int):
    """Run a query in a worker thread and give up waiting after ``timeout``.

    The query is executed with ``pd.read_sql(query, conn, dtype="object")``.

    Args:
        query: SQL text to execute.
        conn: Open database connection handed to ``pd.read_sql``.
        timeout: Maximum number of seconds to wait for the result.

    Returns:
        The DataFrame produced by ``pd.read_sql``.

    Raises:
        concurrent.futures.TimeoutError: If no result arrives in time.
        Exception: Whatever ``pd.read_sql`` raised in the worker thread.

    Note:
        On timeout the worker thread is *not* interrupted (Python threads
        cannot be killed); the query keeps running in the background until
        the driver returns. NOTE(review): the abandoned query still uses
        ``conn`` -- confirm the driver tolerates further use of the same
        connection while it is running.
    """
    # Do not use ``with ThreadPoolExecutor(...)`` here: leaving the ``with``
    # block calls ``shutdown(wait=True)``, which blocks until the query
    # finishes and therefore cancels out the timeout.
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(pd.read_sql, query, conn, dtype="object")
        return future.result(timeout=timeout)
    finally:
        # Release the executor without waiting for a still-running query.
        executor.shutdown(wait=False)

In [None]:
# Export every table listed in `tables` to a Parquet file in PARQUET_DIR.
# Tables that time out or raise are logged via log_failed() and skipped, so
# one bad table does not abort the whole run.
warnings.filterwarnings('ignore', category=UserWarning)

conn = None
failed_tables = []  # names of the tables that timed out or raised

try:
    conn = pypyodbc.connect(connection_string, autocommit=True)
    print("Database connection established.")

    total = len(tables)
    for i, table in enumerate(tables, start=1):
        print(f"Table {i} of {total}: {table}")
        try:
            # The table name is interpolated directly because identifiers
            # cannot be passed as bound parameters; the names come from the
            # spreadsheet loaded into `tables`.
            query = f"SELECT * FROM {table}"

            try:
                df = run_query_with_timeout(query, conn, TIMEOUT_SECONDS)
            except TimeoutError:
                print(f"Timeout for {table}", file=sys.stderr)
                log_failed(table, "timeout")
                failed_tables.append(table)
                continue

            # Treat empty strings, single spaces and the literal 'NULL' as missing.
            df = df.replace(['', ' ', 'NULL'], pd.NA)
            df.to_parquet(PARQUET_DIR / f"{table}.parquet", index=False)

        except Exception as e:
            # Per-table boundary: record the failure and move on to the next table.
            print(f"An error occurred: {e}", file=sys.stderr)
            log_failed(table, str(e))
            failed_tables.append(table)
            continue

    # Only claim success when every table was actually exported.
    if failed_tables:
        print(
            f"Data export finished with {len(failed_tables)} of {total} "
            f"tables failed (see {FAILED_LOG}).",
            file=sys.stderr,
        )
    else:
        print("Data export completed successfully.")

except pypyodbc.Error as e:
    print(f"Database connection error: {e}", file=sys.stderr)

except Exception as e:
    print(f"An unexpected error occurred: {e}", file=sys.stderr)

finally:
    # Close the connection whether or not the export ran to completion.
    if conn:
        conn.close()
        print("Database connection closed.")

Table 1 of 169
Table 2 of 169
Table 3 of 169
Table 4 of 169
Table 5 of 169
Table 6 of 169
Table 7 of 169
Table 8 of 169
Table 9 of 169


An error occurred: invalid literal for int() with base 10: ''


Table 10 of 169


An error occurred: invalid literal for int() with base 10: ''


Table 11 of 169
Table 12 of 169


An error occurred: invalid literal for int() with base 10: ''


Table 13 of 169
Table 14 of 169
Table 15 of 169
Table 16 of 169
Table 17 of 169
Table 18 of 169
Table 19 of 169
