# Create a Spark DataFrame
## A small sample DataFrame for learning and experimentation

In [1]:
import sys
sys.path.append("/usr/local/spark/python")
sys.path.append("/usr/local/spark/python/lib/py4j-0.10.7-src.zip")

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.types import *

from datetime import datetime, date
from decimal import Decimal
import numpy as np

# Build (or reuse) a Spark session running locally under master 'local[4]',
# with Hive support enabled.
spark = (
    SparkSession.builder
    .master('local[4]')
    .appName('spark_create_df')
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# Explicit schema for the sample DataFrame; every column is declared nullable.
schema = (
    StructType()
    .add("Name", StringType(), True)
    .add("id", IntegerType(), True)
    .add("Age", ShortType(), True)
    .add("entry_score", FloatType(), True)
    .add("update_score", FloatType(), True)
    .add("Food", StringType(), True)
    .add("Balance", DecimalType(precision=12, scale=2), True)
    .add("VIP", BooleanType(), True)
    .add("sign_up_date", DateType(), True)
    .add("last_check_out", TimestampType(), True)
)

In [8]:
# Sample rows, one tuple per record, ordered to match the schema columns:
# (Name, id, Age, entry_score, update_score, Food, Balance, VIP,
#  sign_up_date, last_check_out).
#
# The float columns deliberately mix np.nan and None: the two are distinct
# (the displayed output shows them as NaN and null respectively).
#
# Decimal values are built from strings, not float literals. Decimal(111246.87)
# would capture the binary floating-point approximation of the literal rather
# than the exact decimal value written here.
df_value = [
    ("Li Lei", 278584, 35, np.nan, 400.2312, "Chocolate", None, True,
     date(2005, 6, 23), datetime(2018, 12, 23, 22, 10, 24)),
    ("Han Meimei", 342887, 33, 443.9234, None, "Ice Cream", Decimal("111246.87"), True,
     date(2010, 12, 10), datetime(2018, 9, 30, 10, 34, 16)),
    ("Niu Ren", 588269, 28, None, 995.362547, None, Decimal("65897412.5677"), False,
     date(2006, 1, 1), datetime(2019, 1, 4, 12, 56, 45)),
    ("Jay Chou", 785445, 45, np.nan, None, "Donut", None, True,
     date(2001, 5, 5), datetime(2017, 8, 4, 6, 33, 43)),
]

In [9]:
# Build the DataFrame from the sample rows using the explicit schema,
# then print its rows followed by its column types.
df = spark.createDataFrame(df_value, schema)
df.show()
df.printSchema()

+----------+------+---+-----------+------------+---------+-----------+-----+------------+-------------------+
|      Name|    id|Age|entry_score|update_score|     Food|    Balance|  VIP|sign_up_date|     last_check_out|
+----------+------+---+-----------+------------+---------+-----------+-----+------------+-------------------+
|    Li Lei|278584| 35|        NaN|    400.2312|Chocolate|       null| true|  2005-06-23|2018-12-23 22:10:24|
|Han Meimei|342887| 33|   443.9234|        null|Ice Cream|  111246.87| true|  2010-12-10|2018-09-30 10:34:16|
|   Niu Ren|588269| 28|       null|   995.36255|     null|65897412.57|false|  2006-01-01|2019-01-04 12:56:45|
|  Jay Chou|785445| 45|        NaN|        null|    Donut|       null| true|  2001-05-05|2017-08-04 06:33:43|
+----------+------+---+-----------+------------+---------+-----------+-----+------------+-------------------+

root
 |-- Name: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- Age: short (nullable = true)
 |-- entr

In [10]:
# Write the DataFrame to the "sample_parquet" path as gzip-compressed Parquet,
# using the "overwrite" save mode.
df.write.parquet(
    "sample_parquet",
    mode="overwrite",
    compression="gzip",
)