# Install libraries

In [1]:
#!pip3 install psycopg2-binary

# Import libraries

In [2]:
import os
import psycopg2
from psycopg2.extras import execute_values
import polars as pl
import json
from psycopg2 import OperationalError, sql
from contextlib import contextmanager
#from dotenv import load_dotenv

# Functions

In [3]:
# Connection settings are read from the environment so real credentials never
# have to be written into the notebook. The fallbacks are the local development
# defaults, so the notebook still runs unchanged when no variables are set.
DB_NAME = os.environ.get("DB_NAME", "testdb")
DB_USER = os.environ.get("DB_USER", "admin")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "your_password")
DB_HOST = os.environ.get("DB_HOST", "localhost")
# Kept as a string, exactly as before; it is passed straight to psycopg2.connect().
DB_PORT = os.environ.get("DB_PORT", "6432")

In [4]:
@contextmanager
def get_db_connection():
    """Context manager that yields an open PostgreSQL connection.

    The connection is opened with the module-level ``DB_NAME``, ``DB_USER``,
    ``DB_PASSWORD``, ``DB_HOST`` and ``DB_PORT`` settings and is always closed
    when the ``with`` block exits. This function never calls ``commit()``;
    callers that modify data must commit themselves before leaving the block.

    Yields:
        The open psycopg2 connection.

    Raises:
        psycopg2.OperationalError: If the connection cannot be established.
            The error is printed and then re-raised.
    """
    try:
        conn = psycopg2.connect(
            dbname=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD,
            host=DB_HOST,
            port=DB_PORT
        )
    except OperationalError as e:
        print(f"An error occurred while connecting to the database: {e}")
        raise

    # Only the connection attempt is guarded above. Exceptions raised by the
    # caller's own statements inside the ``with`` block propagate untouched
    # instead of being misreported as a connection failure.
    try:
        yield conn
    finally:
        conn.close()

In [5]:
def test_db_connection():
    """Smoke-test the database by running ``SELECT 1`` and printing the outcome.

    Nothing is returned and nothing is raised: any exception from connecting
    or querying is caught and reported as ``Test failed: ...``.
    """
    try:
        with get_db_connection() as connection, connection.cursor() as cursor:
            # A trivial query is enough to prove the round trip works.
            cursor.execute(sql.SQL("SELECT 1"))
            row = cursor.fetchone()
            outcome = (
                "Database connection successful."
                if row
                else "Failed to retrieve data from the database."
            )
            print(outcome)
    except Exception as exc:
        print(f"Test failed: {exc}")

In [6]:
def load_and_prepare_data(parquet_file_path: str) -> pl.DataFrame:
    """
    Read a Parquet file and flatten its `full_vehicleInfo` column.

    The `full_vehicleInfo` column is cast to a Struct and then unnested, so
    each of its fields becomes a column of the returned DataFrame.
    """
    raw_frame = pl.read_parquet(parquet_file_path)

    # Build the cast expression first, then apply and unnest it step by step.
    vehicle_info_as_struct = pl.col("full_vehicleInfo").cast(pl.Struct)
    frame_with_struct = raw_frame.with_columns(vehicle_info_as_struct)
    return frame_with_struct.unnest("full_vehicleInfo")

In [7]:
def insert_data_into_db(df: pl.DataFrame, table_name: str):
    """
    Insert every row of a Polars DataFrame into a PostgreSQL table.

    The DataFrame's column names are used as the target column list, so they
    must match columns of ``table_name``. All rows are sent with
    ``execute_values`` and committed in a single transaction.

    Args:
        df: DataFrame whose rows are inserted.
        table_name: Name of the destination table.

    Returns:
        None. Success or failure is reported on stdout; exceptions raised
        while connecting or inserting are caught and printed, not re-raised.
    """
    # An empty frame has nothing to insert; skip the database round trip
    # (the previous implementation crashed here with an IndexError).
    if df.height == 0:
        print(f"No records to insert into {table_name}.")
        return

    # Column names and row tuples come straight from the frame; there is no
    # need to go through per-row dictionaries.
    columns = df.columns
    rows = df.rows()

    # NOTE(review): table and column names are interpolated into the SQL text
    # unquoted, so they must come from a trusted source. They are deliberately
    # left unquoted to keep matching tables created by create_table_from_df.
    insert_query = f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES %s"

    try:
        with get_db_connection() as conn:
            with conn.cursor() as cur:
                execute_values(cur, insert_query, rows)
            conn.commit()
        print(f"{len(rows)} records successfully inserted into {table_name}.")
    except Exception as e:
        print(f"An error occurred while inserting data: {e}")

In [8]:
def map_polars_to_postgres_types(polars_dtype):
    """
    Map a Polars data type to the name of a PostgreSQL column type.

    Args:
        polars_dtype: A Polars dtype, either the bare class (``pl.Datetime``)
            or an instance as found in ``DataFrame.dtypes``
            (``pl.Datetime("us")``, ``pl.List(pl.Int64)``).

    Returns:
        The PostgreSQL type name as a string. Dtypes without an entry in the
        table below fall back to ``"TEXT"``.
    """
    type_mapping = (
        (pl.Int32, "INTEGER"),
        (pl.Int64, "BIGINT"),
        (pl.Float32, "REAL"),
        (pl.Float64, "DOUBLE PRECISION"),
        (pl.Utf8, "TEXT"),
        (pl.Boolean, "BOOLEAN"),
        (pl.Date, "DATE"),
        (pl.Datetime, "TIMESTAMP"),
        # NOTE(review): psycopg2 adapts Python lists to SQL arrays by default;
        # values bound for a JSONB column may need psycopg2.extras.Json.
        # Confirm against the insert path.
        (pl.List, "JSONB"),
    )

    # Match with == instead of a dict lookup. Parametrised dtype instances
    # (Datetime with a time unit, List with an inner type) compare equal to
    # their bare class but do not hash like it, so a dict keyed on the class
    # silently missed them and returned the TEXT fallback.
    for candidate, postgres_type in type_mapping:
        if polars_dtype == candidate:
            return postgres_type
    return "TEXT"  # Default to TEXT for unknown types

In [9]:
def create_table_from_df(table_name: str, df: pl.DataFrame):
    """
    Create a PostgreSQL table whose columns mirror a Polars DataFrame.

    Each DataFrame column becomes a table column whose type is chosen by
    ``map_polars_to_postgres_types``. The statement uses
    ``CREATE TABLE IF NOT EXISTS``, so an existing table with the same name is
    left untouched.

    Args:
        table_name: Name of the table to create.
        df: DataFrame whose schema defines the table's columns.

    Returns:
        None. Success or failure is reported on stdout; exceptions raised
        while connecting or executing are caught and printed, not re-raised.
    """
    # One "<name> <type>" definition per DataFrame column, in schema order.
    # NOTE(review): names are interpolated unquoted, so they must come from a
    # trusted source; PostgreSQL folds unquoted identifiers to lower case.
    columns_sql = ", ".join(
        f"{col_name} {map_polars_to_postgres_types(dtype)}"
        for col_name, dtype in df.schema.items()
    )

    create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        {columns_sql}
    );
    """

    try:
        # Use the shared connection helper instead of repeating the
        # connect/close boilerplate here.
        with get_db_connection() as conn:
            with conn.cursor() as cur:
                cur.execute(create_table_query)
            conn.commit()
        print(f"Table '{table_name}' created successfully.")
    except Exception as e:
        print(f"An error occurred while creating the table: {e}")

# Main

## Test connection

In [10]:
# Check that the database is reachable before loading or writing any data.
test_db_connection()

Database connection successful.


## Prepare data

In [11]:
# Input Parquet file (relative path) and the name of the PostgreSQL table that
# the cells below create and fill.
parquet_file_path = "../Data/Transform/Small/data.parquet"
table_name = "vehicles"

In [12]:
# Load the Parquet file and expand `full_vehicleInfo` into top-level columns.
df = load_and_prepare_data(parquet_file_path)

## Create table

In [13]:
# Create the target table from the DataFrame schema (no-op if it already exists).
create_table_from_df(table_name, df)

Table 'vehicles' created successfully.


## Insert data

In [14]:
# Insert every DataFrame row into the table. This is a plain INSERT, so
# re-running the cell sends the same rows again.
insert_data_into_db(df, table_name)

1000 records successfully inserted into vehicles.
