# Pipeline

1. Maak SQL scripts voor schema's: RAW, ARCHIVED, CLEANSED
2. Importeer source data in RAW
3. Data cleaning => RAW naar ARCHIVED en CLEANSED
4. Maak SQL scripts voor Data Warehouse / Ster schema
5. Import van CLEANSED naar DWH
6. Prep Data lake: export tabellen naar Parquet files
7. Upload Parquet files naar S3 (eerst bucket aanmaken)
8. Maak Athena tables
9. Gebruik Athena in BI tool naar keuze

In [1]:
%pip install -q pandas sqlalchemy psycopg2-binary

Note: you may need to restart the kernel to use updated packages.


## Stap 1: SQL scripts

In [15]:
import psycopg2

# Connection settings for the PostgreSQL instance.
# NOTE(review): the credentials are hard-coded in the notebook; consider
# reading them from the environment instead.
host = "db"
dbname = "postgres"
user = "postgres"
password = "Newpassword"
port = "5432"  # Default PostgreSQL port

# Open the connection
conn = psycopg2.connect(
    host=host,
    dbname=dbname,
    user=user,
    password=password,
    port=port
)

# Cursor used to run the schema scripts
cur = conn.cursor()

# SQL scripts to run, in this order, inside one transaction.
schema_scripts = (
    './sql_scripts/raw.sql',
    './sql_scripts/archived.sql',
    './sql_scripts/cleansed.sql',
)

try:
    for script_path in schema_scripts:
        # Read the whole SQL file and execute it as a single statement batch
        with open(script_path, 'r') as file:
            sql_script = file.read()
        cur.execute(sql_script)
except Exception:
    # A failed statement leaves the transaction aborted; roll back so the
    # connection stays usable and no partial set of scripts is committed.
    conn.rollback()
    raise
else:
    # Commit only when every script ran successfully
    conn.commit()



## Stap 2: raw importeren

In [3]:
import pandas as pd
from sqlalchemy import create_engine, types as sqlalchemytypes

# SQLAlchemy connection URL in the form
# postgresql://<user>:<password>@<host>:<port>/<database>;
# replace these parts with your own database settings.
DATABASE_URL = 'postgresql://postgres:Newpassword@host.docker.internal:5432/postgres'

# Engine handed to DataFrame.to_sql when loading the source files
engine = create_engine(DATABASE_URL)

In [16]:
SCHEMA = 'raw'
TABEL = 'aankomst'

# Columns that are stored with an explicit SQLAlchemy String type.
string_columns = [
    "Vluchtid",
    "Vliegtuigcode",
    "Terminal",
    "Gate",
    "Baan",
    "Bezetting",
    "Vracht",
    "Aankomsttijd",
]

# Tab-separated export; every column is read as text (dtype=str).
# Adjust the path if your source file lives elsewhere.
file_path = f'./source_data/export_{TABEL}.txt'
df = pd.read_csv(file_path, sep='\t', dtype=str, encoding='raw_unicode_escape')

# Write the frame to SCHEMA.TABEL, replacing the table when it already exists.
df.to_sql(
    name=TABEL,
    con=engine,
    schema=SCHEMA,
    if_exists='replace',
    index=False,
    dtype=dict.fromkeys(string_columns, sqlalchemytypes.String),
)


61

In [17]:
TABEL = 'banen'

# Columns that are stored with an explicit SQLAlchemy String type.
string_columns = ["Baannummer", "Code", "Naam", "Lengte"]

# Semicolon-separated export; every column is read as text (dtype=str).
file_path = f'./source_data/export_{TABEL}.csv'
df = pd.read_csv(file_path, sep=';', dtype=str)

# Write the frame to SCHEMA.TABEL, replacing the table when it already exists.
df.to_sql(
    name=TABEL,
    con=engine,
    schema=SCHEMA,
    if_exists='replace',
    index=False,
    dtype=dict.fromkeys(string_columns, sqlalchemytypes.String),
)

6

In [18]:
TABEL = 'klant'

# Columns that are stored with an explicit SQLAlchemy String type.
string_columns = ["Vluchtid", "Operatie", "Faciliteiten", "Shops"]

# Semicolon-separated export; every column is read as text (dtype=str).
file_path = f'./source_data/export_{TABEL}.csv'
df = pd.read_csv(file_path, sep=';', dtype=str)

# Write the frame to SCHEMA.TABEL, replacing the table when it already exists.
df.to_sql(
    name=TABEL,
    con=engine,
    schema=SCHEMA,
    if_exists='replace',
    index=False,
    dtype=dict.fromkeys(string_columns, sqlalchemytypes.String),
)

110

In [19]:
TABEL = 'luchthavens'

# Columns that are stored with an explicit SQLAlchemy String type.
# "TZ" and "Tz" are two distinct keys (names differ only in case).
string_columns = [
    "Airport",
    "City",
    "Country",
    "IATA",
    "ICAO",
    "Lat",
    "Lon",
    "Alt",
    "TZ",
    "DST",
    "Tz",
]

# Tab-separated export; every column is read as text (dtype=str).
file_path = f'./source_data/export_{TABEL}.txt'
df = pd.read_csv(file_path, sep='\t', dtype=str, encoding='raw_unicode_escape')

# Write the frame to SCHEMA.TABEL, replacing the table when it already exists.
df.to_sql(
    name=TABEL,
    con=engine,
    schema=SCHEMA,
    if_exists='replace',
    index=False,
    dtype=dict.fromkeys(string_columns, sqlalchemytypes.String),
)

107

In [20]:
TABEL = 'maatschappijen'

# Columns that are stored with an explicit SQLAlchemy String type.
string_columns = ["Name", "IATA", "ICAO"]

# Tab-separated export; every column is read as text (dtype=str).
file_path = f'./source_data/export_{TABEL}.txt'
df = pd.read_csv(file_path, sep='\t', dtype=str, encoding='raw_unicode_escape')

# Write the frame to SCHEMA.TABEL, replacing the table when it already exists.
df.to_sql(
    name=TABEL,
    con=engine,
    schema=SCHEMA,
    if_exists='replace',
    index=False,
    dtype=dict.fromkeys(string_columns, sqlalchemytypes.String),
)

166

In [21]:
TABEL = 'planning'

# Columns that are stored with an explicit SQLAlchemy String type.
string_columns = [
    "Vluchtnr",
    "Airlinecode",
    "Destcode",
    "Planterminal",
    "Plangate",
    "Plantijd",
]

# Tab-separated export; every column is read as text (dtype=str).
file_path = f'./source_data/export_{TABEL}.txt'
df = pd.read_csv(file_path, sep='\t', dtype=str, encoding='raw_unicode_escape')

# Write the frame to SCHEMA.TABEL, replacing the table when it already exists.
df.to_sql(
    name=TABEL,
    con=engine,
    schema=SCHEMA,
    if_exists='replace',
    index=False,
    dtype=dict.fromkeys(string_columns, sqlalchemytypes.String),
)

693

In [22]:
TABEL = 'vertrek'

# Columns that are stored with an explicit SQLAlchemy String type.
string_columns = [
    "Vluchtid",
    "Vliegtuigcode",
    "Terminal",
    "Gate",
    "Baan",
    "Bezetting",
    "Vracht",
    "Vertrektijd",
]

# Tab-separated export; every column is read as text (dtype=str).
file_path = f'./source_data/export_{TABEL}.txt'
df = pd.read_csv(file_path, sep='\t', dtype=str, encoding='raw_unicode_escape')

# Write the frame to SCHEMA.TABEL, replacing the table when it already exists.
df.to_sql(
    name=TABEL,
    con=engine,
    schema=SCHEMA,
    if_exists='replace',
    index=False,
    dtype=dict.fromkeys(string_columns, sqlalchemytypes.String),
)

447

In [23]:
TABEL = 'vliegtuig'
file_path = f'./source_data/export_{TABEL}.txt'

# Tab-separated export; every column is read as text (dtype=str).
df = pd.read_csv(file_path, sep='\t', dtype=str, encoding='raw_unicode_escape')

# The explicit mapping that used to be here was a copy of the 'vertrek'
# column list (Vluchtid ... Vertrektijd). Keys that do not match a column of
# this frame have no effect, so the String type was presumably not applied to
# the aircraft columns. Building the mapping from the file's own header
# guarantees that every column is stored as String.
string_dtypes = {column: sqlalchemytypes.String for column in df.columns}

# Write the frame to SCHEMA.TABEL, replacing the table when it already exists.
df.to_sql(
    TABEL,
    con=engine,
    schema=SCHEMA,
    if_exists='replace',
    index=False,
    dtype=string_dtypes,
)

557

In [24]:
TABEL = 'vliegtuigtype'

# Columns that are stored with an explicit SQLAlchemy String type.
string_columns = [
    "IATA",
    "ICAO",
    "Merk",
    "Type",
    "Wake",
    "Cat",
    "Capaciteit",
    "Vracht",
]

# Semicolon-separated export; every column is read as text (dtype=str).
file_path = f'./source_data/export_{TABEL}.csv'
df = pd.read_csv(file_path, sep=';', dtype=str)

# Write the frame to SCHEMA.TABEL, replacing the table when it already exists.
df.to_sql(
    name=TABEL,
    con=engine,
    schema=SCHEMA,
    if_exists='replace',
    index=False,
    dtype=dict.fromkeys(string_columns, sqlalchemytypes.String),
)

327

In [25]:
TABEL = 'vlucht'

# Columns that are stored with an explicit SQLAlchemy String type.
string_columns = [
    "Vluchtid",
    "Vluchtnr",
    "Airlinecode",
    "Destcode",
    "Vliegtuigcode",
    "Datum",
]

# Tab-separated export; every column is read as text (dtype=str).
file_path = f'./source_data/export_{TABEL}.txt'
df = pd.read_csv(file_path, sep='\t', dtype=str, encoding='raw_unicode_escape')

# Write the frame to SCHEMA.TABEL, replacing the table when it already exists.
df.to_sql(
    name=TABEL,
    con=engine,
    schema=SCHEMA,
    if_exists='replace',
    index=False,
    dtype=dict.fromkeys(string_columns, sqlalchemytypes.String),
)

512

In [26]:
TABEL = 'weer'

# Columns that are stored with an explicit SQLAlchemy String type
# (the date column followed by the measurement codes of the weather export).
string_columns = [
    "Datum",
    "DDVEC", "FHVEC", "FG", "FHX", "FHXH", "FHN", "FHNH", "FXX", "FXXH",
    "TG", "TN", "TNH", "TX", "TXH", "T10N", "T10NH",
    "SQ", "SP", "Q", "DR", "RH", "RHX", "RHXH",
    "PG", "PX", "PXH", "PN", "PNH",
    "VVN", "VVNH", "VVX", "VVXH",
    "NG", "UG", "UX", "UXH", "UN", "UNH",
    "EV2",
]

# Tab-separated export; every column is read as text (dtype=str).
file_path = f'./source_data/export_{TABEL}.txt'
df = pd.read_csv(file_path, sep='\t', dtype=str, encoding='raw_unicode_escape')

# Write the frame to SCHEMA.TABEL, replacing the table when it already exists.
df.to_sql(
    name=TABEL,
    con=engine,
    schema=SCHEMA,
    if_exists='replace',
    index=False,
    dtype=dict.fromkeys(string_columns, sqlalchemytypes.String),
)

644

TODO: `awswrangler.s3.to_parquet` zeker uitzoeken!