In [1]:
import os

from datetime import datetime, date
from pyspark.sql import SparkSession, Row

import pandas as pd

In [2]:
# Connection settings are taken from the environment so no credentials live in
# the notebook. Either value is None when its variable is not set.
SPARK_MASTER = os.environ.get("SPARK_MASTER")
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")

In [3]:
# Create (or reuse) the Spark session for this notebook. The PostgreSQL JDBC
# driver jar is passed as `spark.jars` and as the executor/driver
# `extraClassPath` settings, in that order.
postgres_jdbc_jar = "/opt/spark/jars/postgresql-42.7.3.jar"

session_builder = SparkSession.builder.appName("app").master(SPARK_MASTER)
for conf_key in (
    "spark.jars",
    "spark.executor.extraClassPath",
    "spark.driver.extraClassPath",
):
    session_builder = session_builder.config(conf_key, postgres_jdbc_jar)

spark = session_builder.getOrCreate()

24/04/15 17:27:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Load `information_schema.tables` from the PostgreSQL instance over JDBC and
# print the schema Spark derived for it. The password comes from the
# POSTGRES_PASSWORD environment variable read earlier in the notebook.
jdbc_reader = spark.read.format("jdbc").options(
    driver="org.postgresql.Driver",
    url="jdbc:postgresql://docker-postgres:5432/postgres",
    dbtable="information_schema.tables",
    user="postgres",
    password=POSTGRES_PASSWORD,
)
df = jdbc_reader.load()

df.printSchema()

root
 |-- table_catalog: string (nullable = true)
 |-- table_schema: string (nullable = true)
 |-- table_name: string (nullable = true)
 |-- table_type: string (nullable = true)
 |-- self_referencing_column_name: string (nullable = true)
 |-- reference_generation: string (nullable = true)
 |-- user_defined_type_catalog: string (nullable = true)
 |-- user_defined_type_schema: string (nullable = true)
 |-- user_defined_type_name: string (nullable = true)
 |-- is_insertable_into: string (nullable = true)
 |-- is_typed: string (nullable = true)
 |-- commit_action: string (nullable = true)



In [5]:
# Preview the JDBC-backed frame; the arguments spell out show()'s defaults.
df.show(n=20, truncate=True)

                                                                                

+-------------+------------+--------------------+----------+----------------------------+--------------------+-------------------------+------------------------+----------------------+------------------+--------+-------------+
|table_catalog|table_schema|          table_name|table_type|self_referencing_column_name|reference_generation|user_defined_type_catalog|user_defined_type_schema|user_defined_type_name|is_insertable_into|is_typed|commit_action|
+-------------+------------+--------------------+----------+----------------------------+--------------------+-------------------------+------------------------+----------------------+------------------+--------+-------------+
|     postgres|      public|                test|BASE TABLE|                        NULL|                NULL|                     NULL|                    NULL|                  NULL|               YES|      NO|         NULL|
|     postgres|  pg_catalog|        pg_statistic|BASE TABLE|                        NULL|   

In [6]:
# Run SHOW DATABASES through Spark SQL and print the resulting frame.
databases_df = spark.sql("SHOW DATABASES")
databases_df.show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [5]:
# Build a three-row pandas frame with one column per basic type (int, float,
# string, date, timestamp) and convert it to a Spark DataFrame.
sample_columns = {
    "a": [1, 2, 3],
    "b": [2.0, 3.0, 4.0],
    "c": ["string1", "string2", "string3"],
    # First day of January, February and March 2000.
    "d": [date(2000, month, 1) for month in (1, 2, 3)],
    # Noon on 1, 2 and 3 January 2000.
    "e": [datetime(2000, 1, day, 12, 0) for day in (1, 2, 3)],
}
pandas_df = pd.DataFrame(sample_columns)
df = spark.createDataFrame(pandas_df)
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [6]:
# Print the sample frame; the arguments spell out show()'s defaults.
df.show(n=20, truncate=True)

                                                                                

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+



In [7]:
# Materialise all rows of `df` as a Python list of Row objects. This is fine
# here because the frame holds only the three sample rows.
df.collect()

                                                                                

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)),
 Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0)),
 Row(a=3, b=4.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))]

In [8]:
# Build the sample Spark DataFrame directly from Row objects, without going
# through pandas. Each tuple holds the values for columns (a, b, c, d, e).
sample_records = [
    (1, 2.0, "string1", date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3.0, "string2", date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (4, 5.0, "string3", date(2000, 3, 1), datetime(2000, 1, 3, 12, 0)),
]
df = spark.createDataFrame(
    [Row(a=a, b=b, c=c, d=d, e=e) for a, b, c, d, e in sample_records]
)
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]