### Imports

In [3]:
from hfsql_guide.settings.paths import DATA_DIR, PARQUET_DIR, BLACKLISTED_TABLES, FAILED_TABLES
from concurrent.futures import ThreadPoolExecutor, TimeoutError
from hfsql_guide.settings.credentials import dsn, user, passwd
import pandas as pd
import pypyodbc
import warnings
import sys

TIMEOUT_SECONDS = 3 * 60   # 3 minutes

### Code

In [4]:
# list of tables to export
tables = pd.read_excel(DATA_DIR / 'Tables_name.xlsx').iloc(axis=1)[0].tolist()

In [5]:
# remember to set up the connection in ODBC Data Source Administrator 64 bit
connection_string = (
    f"DSN={dsn};"
    f"UID={user};"
    f"PWD={passwd};"
)

In [6]:
def log_failed(table: str, reason: str):
    """Append failed table info to the log."""
    with FAILED_TABLES.open("a", encoding="utf8") as f:
        f.write(f"{table}  ---  {reason}\n")

def run_query_with_timeout(query: str, connection_string: str, timeout: int):
    """
    Runs a query in a separate thread, creating its OWN connection
    to prevent "zombie" hangs.
    """
    
    def _run_query_in_thread():
        """This function runs in the isolated thread."""
        conn_thread = None
        try:
            conn_thread = pypyodbc.connect(connection_string, autocommit=True)
            

            df = pd.read_sql(query, conn_thread, dtype="object")  # pyright: ignore[reportArgumentType]
            return df
        finally:
            if conn_thread:
                conn_thread.close()

    with ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(_run_query_in_thread)
        return future.result(timeout=timeout)

with open(BLACKLISTED_TABLES, 'r', encoding='utf8') as f:
    blacklist = [line.strip() for line in f if line.strip()]

with open(FAILED_TABLES, 'r', encoding='utf8') as f:
    already_failed = [line.split('  ---  ')[0] for line in f if line.strip()]

try:
    existing_files = set(f.stem for f in PARQUET_DIR.glob("*.parquet"))
    print(f"{len(existing_files)} .parquet files found.")
except FileNotFoundError:
    print("Nonexistent PARQUET_DIR, all files will be created.")
    existing_files = set()

125 .parquet files found.


In [7]:
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

conn = None

try:
    conn = pypyodbc.connect(connection_string, autocommit=True)
    print("Database connection established.")

    for i, table in enumerate(tables, start=1):
        print(f"Table {i} of {len(tables)}")
        if table in existing_files:
            print(f"    Skipping table {i}, .parquet file already exists.")
            continue
        if table in already_failed:
            print(f"    Skipping previously failed table: {i}")
            continue
        if table in blacklist:
            print(f"    Skipping blacklisted table: {i}")
            continue
        try:  
            query = f"SELECT * FROM {table}"

            try:
                df = run_query_with_timeout(query, connection_string, TIMEOUT_SECONDS)
            except TimeoutError:
                print(f"    Timeout for {table}", file=sys.stderr)
                log_failed(table, "timeout")
                continue

            df = df.astype('string')  # convert all columns to string dtype to avoid type issues
            df = df.replace(['', ' ', 'NULL'], pd.NA) # replace empty strings and 'NULL' with NaN
            df.to_parquet(PARQUET_DIR / f"{table}.parquet", index=False)          

        except Exception as e:
            print(f"    An error occurred: {e}", file=sys.stderr)
            log_failed(table, str(e))
            continue

    print("Data export completed successfully.")

except pypyodbc.Error as e:
    print(f"Database connection error: {e}", file=sys.stderr)

except Exception as e:
    print(f"    An unexpected error occurred: {e}", file=sys.stderr)        

finally:
    if conn:
        conn.close()
        print("Database connection closed.")

Database connection established.
Table 1 of 169
    Skipping table 1, .parquet file already exists.
Table 2 of 169
    Skipping table 2, .parquet file already exists.
Table 3 of 169
    Skipping table 3, .parquet file already exists.
Table 4 of 169
    Skipping table 4, .parquet file already exists.
Table 5 of 169
    Skipping table 5, .parquet file already exists.
Table 6 of 169
    Skipping table 6, .parquet file already exists.
Table 7 of 169
    Skipping table 7, .parquet file already exists.
Table 8 of 169
    Skipping table 8, .parquet file already exists.
Table 9 of 169


    Timeout for ADM_IngresosEgresos


Table 10 of 169
    Skipping previously failed table: 10
Table 11 of 169
    Skipping table 11, .parquet file already exists.
Table 12 of 169
    Skipping previously failed table: 12
Table 13 of 169
    Skipping table 13, .parquet file already exists.
Table 14 of 169
    Skipping table 14, .parquet file already exists.
Table 15 of 169
    Skipping table 15, .parquet file already exists.
Table 16 of 169
    Skipping table 16, .parquet file already exists.
Table 17 of 169
    Skipping table 17, .parquet file already exists.
Table 18 of 169
    Skipping table 18, .parquet file already exists.
Table 19 of 169
    Skipping blacklisted table: 19
Table 20 of 169
    Skipping table 20, .parquet file already exists.
Table 21 of 169
    Skipping table 21, .parquet file already exists.
Table 22 of 169
    Skipping table 22, .parquet file already exists.
Table 23 of 169
    Skipping table 23, .parquet file already exists.
Table 24 of 169
    Skipping table 24, .parquet file already exists.
Table 2

    An error occurred: Execution failed on sql 'SELECT * FROM nan': ('HY090', '[HY090] ¿Qué sucedió?\r\nError en el código SQL de la consulta <SQLODBC>. No se puede inicializar la consulta.\r\nArchivo nan desconocido\r\nError detectado:\r\nSELECT * FROM >>>>nan<<<<\r\n\r\nCódigo de error: 28000002\r\nNivel: error fatal\r\n\r\nMódulo: wd300sql64.dll (01A305067  - 30.0.562.0)\r\n\r\nInformación de depuración:\r\nIEWD300SQL=1.21\r\nModule=<WD300SQL>\r\nVersion=<30.0.562.0>\r\nInformación adicional:\r\nEIT_LOGICALTABLENAME : <nan>\r\nEIT_POSITION_ERREUR : <0\t14\t0\t16>')


Data export completed successfully.
Database connection closed.
