In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import *

# Build (or reuse) the notebook-wide Spark session: local mode on all
# available cores, with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("Spark Census")
    .master("local[*]")
    .enableHiveSupport()
    # The property name is `spark.driver.host`. The previous key,
    # `spark.drive.host`, was a typo, so the driver host was never set.
    .config("spark.driver.host", "localhost")
    .getOrCreate()
)

In [None]:
# Notebook parameters.

# Presumably a print/verbosity toggle; it is not referenced in the visible cells.
b_print = True

# Location of the input CSV (the file name suggests 2010 Census surname data).
# Raw string, so the Windows backslashes are taken literally.
input_data = r"F:\DataSamples\DataExplorer\Census\Census_Surnames\Names_2010Census.csv"

In [20]:
# Swap every backslash in the path for a forward slash (later cells
# interpolate this path into SQL text), then echo the result.
input_data = "/".join(input_data.split("\\"))
print("Input data: " + input_data)

Input data: F:/DataSamples/DataExplorer/Census/Census_Surnames/Names_2010Census.csv


In [None]:
# Register the CSV as the temporary view `census_raw`, with the first row
# read as the header and column types inferred by Spark.
sql = """
create or replace temporary view census_raw
using csv
options (
    path "{csv_path}",
    inferSchema "true",
    header "true"
)""".format(csv_path=input_data)

spark.sql(sql)

DataFrame[]

In [24]:
# Load the inferred-schema view and preview its first 10 rows.
df = spark.table("census_raw")
df.show(n=10)

+---------+----+-------+--------+------------+--------+--------+------+-------+---------+-----------+
|     name|rank|  count|prop100k|cum_prop100k|pctwhite|pctblack|pctapi|pctaian|pct2prace|pcthispanic|
+---------+----+-------+--------+------------+--------+--------+------+-------+---------+-----------+
|    SMITH|   1|2442977|  828.19|      828.19|    70.9|   23.11|   0.5|   0.89|     2.19|        2.4|
|  JOHNSON|   2|1932812|  655.24|     1483.42|   58.97|   34.63|  0.54|   0.94|     2.56|       2.36|
| WILLIAMS|   3|1625252|  550.97|     2034.39|   45.75|   47.68|  0.46|   0.82|     2.81|       2.49|
|    BROWN|   4|1437026|  487.16|     2521.56|   57.95|    35.6|  0.51|   0.87|     2.55|       2.52|
|    JONES|   5|1425470|  483.24|      3004.8|   55.19|   38.48|  0.44|      1|     2.61|       2.29|
|   GARCIA|   6|1166120|  395.32|     3400.12|    5.38|    0.45|  1.41|   0.47|     0.26|      92.03|
|   MILLER|   7|1161437|  393.74|     3793.86|   84.11|   10.76|  0.54|   0.66|   

In [32]:
# Rebuild the `census` table over the same CSV, this time with an explicit
# column list instead of schema inference. The pct* columns are declared
# as strings.
spark.sql("drop table if exists census")

sql = """
create table if not exists census
(
    name string,
    rank int,
    count int,
    prop100k decimal(12, 5),
    cum_prop100k decimal(12, 5),
    pctwhite string,
    pctblack string,
    pctapi string,
    pctaian string,
    pct2prace string,
    pcthispanic string
)
using csv
options (
    path "{csv_path}",
    header "true"
)""".format(csv_path=input_data)

spark.sql(sql)

DataFrame[]

In [33]:
# Load the explicit-schema table and preview its first 10 rows.
df2 = spark.table("census")
df2.show(n=10)

+---------+----+-------+---------+------------+--------+--------+------+-------+---------+-----------+
|     name|rank|  count| prop100k|cum_prop100k|pctwhite|pctblack|pctapi|pctaian|pct2prace|pcthispanic|
+---------+----+-------+---------+------------+--------+--------+------+-------+---------+-----------+
|    SMITH|   1|2442977|828.19000|   828.19000|    70.9|   23.11|   0.5|   0.89|     2.19|        2.4|
|  JOHNSON|   2|1932812|655.24000|  1483.42000|   58.97|   34.63|  0.54|   0.94|     2.56|       2.36|
| WILLIAMS|   3|1625252|550.97000|  2034.39000|   45.75|   47.68|  0.46|   0.82|     2.81|       2.49|
|    BROWN|   4|1437026|487.16000|  2521.56000|   57.95|    35.6|  0.51|   0.87|     2.55|       2.52|
|    JONES|   5|1425470|483.24000|  3004.80000|   55.19|   38.48|  0.44|      1|     2.61|       2.29|
|   GARCIA|   6|1166120|395.32000|  3400.12000|    5.38|    0.45|  1.41|   0.47|     0.26|      92.03|
|   MILLER|   7|1161437|393.74000|  3793.86000|   84.11|   10.76|  0.54| 

In [4]:
# Look up the session's current value of the `spark.sql.sources.default`
# setting (the default data source format).
spark.conf.get("spark.sql.sources.default")

'parquet'

In [5]:
# Start from a clean slate so this cell can be re-run: a plain CREATE TABLE
# raises once `sales` already exists. This mirrors the drop-then-create
# pattern the notebook uses for the `census` table.
spark.sql("DROP TABLE IF EXISTS sales")

# Parquet-backed table, partitioned by the `date` column.
spark.sql("""
CREATE TABLE sales (product STRING, date DATE, amount DECIMAL(10,2))
USING PARQUET
PARTITIONED BY (date);
""")

DataFrame[]

In [9]:
# Five sample rows, one per day, so each row lands in its own `date` partition.
sales_rows_sql = """INSERT INTO sales (product, date, amount)  VALUES
('Widget', date('2023-01-01'), 19.99), 
('Gadget', date('2023-01-02'), 29.99),
('Widget', date('2023-01-03'), 19.99),
('Gadget', date('2023-01-04'), 29.99),
('Widget', date('2023-01-05'), 19.99);
"""

spark.sql(sales_rows_sql)

DataFrame[]

In [24]:
# Print the DDL that Spark reports for `sales`, without truncating the text.
create_stmt_df = spark.sql("SHOW CREATE TABLE sales")
create_stmt_df.show(truncate=False)

# List the partitions currently registered for `sales`.
partitions_df = spark.sql("SHOW PARTITIONS sales")
partitions_df.show()

+--------------------------------------------------------------------------------------------------------------------------------------------+
|createtab_stmt                                                                                                                              |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|CREATE TABLE spark_catalog.default.sales (\n  product STRING,\n  amount DECIMAL(10,2),\n  date DATE)\nUSING PARQUET\nPARTITIONED BY (date)\n|
+--------------------------------------------------------------------------------------------------------------------------------------------+

+---------------+
|      partition|
+---------------+
|date=2023-01-01|
|date=2023-01-02|
|date=2023-01-03|
|date=2023-01-04|
|date=2023-01-05|
+---------------+



In [13]:
# Keep the DESCRIBE FORMATTED result in `df_format` for the pivot cells
# below, and print it without truncating long values.
df_format = spark.sql("describe formatted sales")
df_format.show(n=20, truncate=False)

+----------------------------+------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                           |comment|
+----------------------------+------------------------------------------------------------------------------------+-------+
|product                     |string                                                                              |NULL   |
|amount                      |decimal(10,2)                                                                       |NULL   |
|date                        |date                                                                                |NULL   |
|# Partition Information     |                                                                                    |       |
|# col_name                  |data_type                                                                           |comment|
|date   

In [15]:
# Pivot `df_format` into a single row: each distinct `col_name` value becomes
# a column, and its cell holds the list of `data_type` values for that name.
pivot_df = (
    df_format
    .groupBy()
    .pivot("col_name")
    .agg(expr("collect_list(data_type) as columns"))
)
pivot_df.show(truncate=False)

+---+----------------------------+-----------------------+-----------+---------------+-------------+------------------------------+---------+---------------------------------------------------------------+-----------+--------------------------------------------------------------------------------------+----------------------------------------------------------------+-------+------------------+---------+-------------------------------------------------------------+-------+---------+---------------+------------+--------+
|   |# Detailed Table Information|# Partition Information|# col_name |Catalog        |Created By   |Created Time                  |Database |InputFormat                                                    |Last Access|Location                                                                              |OutputFormat                                                    |Owner  |Partition Provider|Provider |Serde Library                                                |Table  

In [18]:
# Same single-row pivot on `col_name`, but each cell keeps one scalar: the
# minimum `data_type` string for that name instead of a list.
pivot_df = (
    df_format
    .groupBy()
    .pivot("col_name")
    .agg(expr("min(data_type) as columns"))
)
pivot_df.show(truncate=False)

+---+----------------------------+-----------------------+----------+-------------+-----------+----------------------------+--------+-------------------------------------------------------------+-----------+------------------------------------------------------------------------------------+--------------------------------------------------------------+-----+------------------+--------+-----------------------------------------------------------+-----+-------+-------------+----+-------+
|   |# Detailed Table Information|# Partition Information|# col_name|Catalog      |Created By |Created Time                |Database|InputFormat                                                  |Last Access|Location                                                                            |OutputFormat                                                  |Owner|Partition Provider|Provider|Serde Library                                              |Table|Type   |amount       |date|product|
+---+-------------

In [2]:
# Table mixing scalar columns with complex ARRAY / MAP / STRUCT columns.
# Drop any earlier copy first so the cell can be re-run: a plain CREATE TABLE
# raises when `employees` already exists. This mirrors the drop-then-create
# pattern the notebook uses for the `census` table.
spark.sql("DROP TABLE IF EXISTS employees")

sql = """
CREATE TABLE employees (
    id BIGINT,
    name STRING,
    salary DECIMAL(10,2),
    hire_date DATE,
    is_active BOOLEAN,
    skills ARRAY<STRING>,
    metadata MAP<STRING, STRING>,
    address STRUCT<street: STRING, city: STRING, zipcode: STRING>
);"""

spark.sql(sql)

DataFrame[]