In [1]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Maven coordinates of the MySQL JDBC driver needed by the JDBC reads in this
# notebook (driver class com.mysql.cj.jdbc.Driver). Connector/J releases from
# 8.0.31 onwards are published under com.mysql:mysql-connector-j.
# NOTE(review): confirm the exact version against Maven Central.
MYSQL_CONNECTOR_PACKAGE = "com.mysql:mysql-connector-j:9.2.0"

# Delta Lake requires its SQL extension and catalog to be registered on the
# session builder.
builder = (
    SparkSession.builder
    .appName("pytest-pyspark-local-testing")
    .config('spark.sql.extensions', 'io.delta.sql.DeltaSparkSessionExtension')
    .config('spark.sql.catalog.spark_catalog', 'org.apache.spark.sql.delta.catalog.DeltaCatalog')
)

# Extra JVM packages must be declared before the session (and its JVM) is
# started; requesting them afterwards has no effect on the running session.
# configure_spark_with_delta_pip adds the Delta artifact itself and appends
# anything listed in extra_packages to spark.jars.packages.
spark = configure_spark_with_delta_pip(
    builder, extra_packages=[MYSQL_CONNECTOR_PACKAGE]
).getOrCreate()


In [2]:
import os
import sys

# Set both PySpark interpreter variables to the Python executable that is
# running this kernel.
# NOTE(review): the Spark session is created in the first cell; these
# variables are presumably only consulted when Spark starts, so confirm this
# cell runs early enough to have an effect.
os.environ.update(
    dict.fromkeys(("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"), sys.executable)
)

In [3]:
import findspark

# Let findspark locate the Spark installation, then request the MySQL JDBC
# connector as an additional package.
# NOTE(review): add_packages appears to work by extending PYSPARK_SUBMIT_ARGS,
# which is only read when the JVM is launched. `spark` already exists at this
# point, so verify the connector really ends up on the classpath. Also confirm
# the coordinates: Connector/J 9.x may only be published as
# com.mysql:mysql-connector-j.
findspark.init()
findspark.add_packages('mysql:mysql-connector-java:9.2.0')

# Report the version of the running Spark session.
print(spark.version)

3.5.4


In [4]:
# Small in-memory fixture: each (name, age) pair is expanded into one dict per
# row. The spacing inside the names is kept exactly as given.
sample_data = [
    {"name": person, "age": years}
    for person, years in (
        ("John    D.", 30),
        ("Alice   G.", 25),
        ("Bob  T.", 35),
        ("Eve   A.", 28),
    )
]

# Build a DataFrame from the dict rows and print it.
df = spark.createDataFrame(sample_data)
df.show()

+---+----------+
|age|      name|
+---+----------+
| 30|John    D.|
| 25|Alice   G.|
| 35|   Bob  T.|
| 28|  Eve   A.|
+---+----------+



In [13]:
import configparser

# Connection settings live outside the notebook in an .ini file. The keys read
# below are: [settings] username, password, host, port and [database] schema.
SETTINGS_PATH = 'settings.ini'

config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that result so a wrong path fails here
# with a clear message instead of as a KeyError on the first section lookup.
if not config.read(SETTINGS_PATH):
    raise FileNotFoundError(
        f"Could not read configuration file {SETTINGS_PATH!r}"
    )

# MySQL credentials and endpoint.
username = config['settings']['username']
password = config['settings']['password']
host = config['settings']['host']
port = config['settings']['port']

# Database (schema) name used in the JDBC URL.
schema = config['database']['schema']
print('loaded ini')


loaded ini


In [14]:

# JDBC endpoint assembled from the host, port and schema loaded from the
# .ini file; printed so the target database is visible in the output.
url = f"jdbc:mysql://{host}:{port}/{schema}"
print(url)

# Credentials (from the .ini file) and the JDBC driver class passed to the
# JDBC reader.
properties = {
    "user": username,
    "password": password,
    "driver": "com.mysql.cj.jdbc.Driver",
}

# Read the `flights` table over JDBC into a DataFrame and preview five rows.
df = spark.read.jdbc(url, "flights", properties=properties)
df.show(5)

jdbc:mysql://localhost:3306/datastaging
+---+----+-----+---+--------+--------------+---------+--------+--------------+---------+-------+------+-------+------+----+--------+--------+----+------+-------------------+--------------------+
| id|year|month|day|dep_time|sched_dep_time|dep_delay|arr_time|sched_arr_time|arr_delay|carrier|flight|tailnum|origin|dest|air_time|distance|hour|minute|          time_hour|                name|
+---+----+-----+---+--------+--------------+---------+--------+--------------+---------+-------+------+-------+------+----+--------+--------+----+------+-------------------+--------------------+
|  0|2013|    1|  1|   517.0|           515|      2.0|   830.0|           819|     11.0|     UA|  1545| N14228|   EWR| IAH|   227.0|    1400|   5|    15|2013-01-01 05:00:00|United Air Lines ...|
|  1|2013|    1|  1|   533.0|           529|      4.0|   850.0|           830|     20.0|     UA|  1714| N24211|   LGA| IAH|   227.0|    1416|   5|    29|2013-01-01 05:00:00|United 

In [10]:

# Target directory for the Delta copy of the DataFrame currently held in `df`.
delta_path = "data/flights/"

# Write `df` in Delta format; "overwrite" replaces any data already stored at
# the path.
df.write.save(delta_path, format="delta", mode="overwrite")

In [21]:
# Read the `flights` table over JDBC again and expose it to Spark SQL under
# the temporary view name `flights`.
df = spark.read.jdbc(url=url, table="flights", properties=properties)
df.createOrReplaceTempView("flights")

# Preview the source table (this is the unfiltered data, not the query result).
df.show(5)

# Select only the rows whose month is 1.
query_sql = "SELECT * FROM flights WHERE month = 1"
result_df = spark.sql(query_sql)

# Write the filtered rows in Delta format, replacing any data already stored
# at the target path.
delta_path = "data/flights_merge/"
result_df.write.save(delta_path, format="delta", mode="overwrite")

+---+----+-----+---+--------+--------------+---------+--------+--------------+---------+-------+------+-------+------+----+--------+--------+----+------+-------------------+--------------------+
| id|year|month|day|dep_time|sched_dep_time|dep_delay|arr_time|sched_arr_time|arr_delay|carrier|flight|tailnum|origin|dest|air_time|distance|hour|minute|          time_hour|                name|
+---+----+-----+---+--------+--------------+---------+--------+--------------+---------+-------+------+-------+------+----+--------+--------+----+------+-------------------+--------------------+
|  0|2013|    1|  1|   517.0|           515|      2.0|   830.0|           819|     11.0|     UA|  1545| N14228|   EWR| IAH|   227.0|    1400|   5|    15|2013-01-01 05:00:00|United Air Lines ...|
|  1|2013|    1|  1|   533.0|           529|      4.0|   850.0|           830|     20.0|     UA|  1714| N24211|   LGA| IAH|   227.0|    1416|   5|    29|2013-01-01 05:00:00|United Air Lines ...|
|  2|2013|    1|  1|   54

In [None]:

# Count the rows with month = 1 and print the single-row result.
# The query runs against the name `flights`, which an earlier cell registers
# as a temporary view, so that cell must have been executed first.
query_sql = "SELECT count(*) FROM flights WHERE month = 1"
result_df = spark.sql(query_sql)
result_df.show()