In [58]:
from pyspark.sql import SparkSession

# Configuration Spark optimale
spark = (SparkSession.builder
    .appName("TPCH Data Pipeline")
    .config("spark.sql.shuffle.partitions", "40")
    .config("spark.executor.memory", "4g")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.7.3")
    .getOrCreate()
)

def load_tpch_table(table_name: str, partition_col: str):
    """Charge une table TPC-H avec partitionnement intelligent"""
    jdbc_url = "jdbc:postgresql://upstream_data:5432/tpchdb"
    props = {
        'user': 'tpchuser',
        'password': 'tpchpass',
        'driver': 'org.postgresql.Driver'
    }
    
    # 1. Détection dynamique des limites
    bounds = spark.read.jdbc(
        jdbc_url,
        f"(SELECT MIN({partition_col}) as min, MAX({partition_col}) as max FROM {table_name}) tmp",
        properties=props
    ).first()
    
    # 2. Ajustement pour les clés composites
    upper_bound = bounds['max'] + (1 if table_name == 'public.lineitem' else 0)
    
    # 3. Lecture avec vérification intégrée
    return spark.read.jdbc(
        url=jdbc_url,
        table=table_name,
        column=partition_col,
        lowerBound=bounds['min'],
        upperBound=upper_bound,
        numPartitions=40 if table_name == 'public.lineitem' else 10,
        properties=props
    ).cache()

try:
    # Chargement des données
    orders = load_tpch_table("public.orders", "o_orderkey")
    lineitems = load_tpch_table("public.lineitem", "l_orderkey")
    
    # Validation des résultats
    print("=== Orders ===")
    orders.select("o_orderkey", "o_totalprice").show(2, truncate=False)
    
    print("\n=== Lineitems ===")
    lineitems.select("l_orderkey", "l_linenumber", "l_quantity").show(2, vertical=True)

finally:
    if spark:
        spark.stop()

                                                                                

=== Orders ===


                                                                                

+----------+------------+
|o_orderkey|o_totalprice|
+----------+------------+
|1         |173665.47   |
|2         |46929.18    |
+----------+------------+
only showing top 2 rows


=== Lineitems ===


[Stage 3:>                                                          (0 + 1) / 1]

-RECORD 0-------------
 l_orderkey   | 1     
 l_linenumber | 1     
 l_quantity   | 17.00 
-RECORD 1-------------
 l_orderkey   | 1     
 l_linenumber | 2     
 l_quantity   | 36.00 
only showing top 2 rows



                                                                                

In [59]:
lineitem_data.show()

Py4JJavaError: An error occurred while calling o273.showString.
: java.lang.IllegalStateException: SparkContext has been shutdown
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2230)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2259)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2278)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:506)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:459)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:48)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3868)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:3858)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:510)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3856)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3856)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3084)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:288)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:327)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.base/java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Unknown Source)


In [54]:
from pyspark.sql import SparkSession

# Création de la session Spark optimisée
spark = (SparkSession.builder
    .appName("FCT_Orders_Test Optimisé")
    .config("spark.sql.shuffle.partitions", "40")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

def get_table_from_db(table_name: str, partition_column: str, spark: SparkSession):
    """Lecture des données PostgreSQL avec partitionnement"""
    host = "upstream_data"
    port = "5432"
    db = "tpchdb"
    jdbc_url = f'jdbc:postgresql://{host}:{port}/{db}'

    connection_properties = {
        'user': 'tpchuser',
        'password': 'tpchpass',
        'driver': 'org.postgresql.Driver',
    }

    return spark.read.jdbc(
        url=jdbc_url,
        table=table_name,
        column=partition_column,  # Utilisation du paramètre de colonne
        lowerBound=1,
        upperBound=1000000,
        numPartitions=10,
        properties=connection_properties
    )

# Utilisation avec les bonnes colonnes de partitionnement
orders_data = get_table_from_db('public.orders', 'o_orderkey', spark)
#lineitem_data = get_table_from_db('public.lineitem', 'l_orderkey', spark)

In [55]:
orders_data.show()

[Stage 0:>                                                          (0 + 1) / 1]

+----------+---------+-------------+------------+-----------+---------------+---------------+--------------+--------------------+
|o_orderkey|o_custkey|o_orderstatus|o_totalprice|o_orderdate|o_orderpriority|        o_clerk|o_shippriority|           o_comment|
+----------+---------+-------------+------------+-----------+---------------+---------------+--------------+--------------------+
|         1|    36901|            O|   173665.47| 1996-01-02|5-LOW          |Clerk#000000951|             0|nstructions sleep...|
|         2|    78002|            O|    46929.18| 1996-12-01|1-URGENT       |Clerk#000000880|             0| foxes. pending a...|
|         3|   123314|            F|   193846.25| 1993-10-14|5-LOW          |Clerk#000000955|             0|sly final account...|
|         4|   136777|            O|    32151.78| 1995-10-11|5-LOW          |Clerk#000000124|             0|sits. slyly regul...|
|         5|    44485|            F|   144659.20| 1994-07-30|5-LOW          |Clerk#0000009

                                                                                

25/02/21 22:45:01 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
25/02/21 22:45:01 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:218)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:923)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:154)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:262)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:169)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce