In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that every later cell reads and writes through.
spark = (
    SparkSession.builder
    .appName("Working with Different Types of Files")
    .getOrCreate()
)

23/06/08 09:01:26 WARN Utils: Your hostname, FM-PC-LT-323 resolves to a loopback address: 127.0.1.1; using 172.16.5.219 instead (on interface wlp0s20f3)
23/06/08 09:01:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/08 09:01:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/06/08 09:01:28 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/06/08 09:01:28 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


# CSV

In [2]:
# Load the 2015 flight summary from CSV. The first row supplies the column
# names, column types are inferred from the data, and FAILFAST mode is set so
# that a malformed record raises instead of being silently dropped.
summery_csv = (
    spark.read.format("csv")
    .option("header", "true")
    .option("mode", "FAILFAST")
    .option("inferSchema", "true")
    .load("data/csv/2015-summary.csv")
)
summery_csv.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [3]:
from pyspark.sql.types import StructField, StructType, StringType, LongType
# Explicit schema for the flight summary, used instead of schema inference:
# two string columns and a long "count" column declared non-nullable.
myManualSchema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    StructField("count", LongType(), False)
])

# Keep the DataFrame itself in summery_csv1. show() only prints and returns
# None, so chaining it onto the assignment would bind the name to None.
summery_csv1 = (
    spark.read.format("csv")
    .option("header", "true")
    .option("mode", "FAILFAST")
    .schema(myManualSchema)
    .load("data/csv/2015-summary.csv")
)
summery_csv1.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [4]:
# Write the CSV-sourced frame back out twice, replacing any earlier output:
# first comma-separated, then tab-separated.
(
    summery_csv.write.format("csv")
    .mode("overwrite")
    .option("sep", ",")
    .save("data/csv/my-csv-file.csv")
)
(
    summery_csv.write.format("csv")
    .mode("overwrite")
    .option("sep", "\t")
    .save("data/csv/my-tsv-file.tsv")
)

# JSON

In [5]:
# Read the JSON flight summary; FAILFAST mode is set so that a malformed
# record raises instead of being silently dropped.
# No "inferSchema" option here: that is a CSV reader setting. The JSON reader
# infers the schema on its own whenever no explicit schema is supplied, so the
# option had no effect.
summary_json = (
    spark.read.format("json")
    .option("mode", "FAILFAST")
    .load("data/json/2015-summary.json")
)
summary_json.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [6]:
# Persist the JSON-sourced frame as JSON again, replacing any earlier output.
(
    summary_json.write
    .format("json")
    .mode("overwrite")
    .save("data/json/my-json-file.json")
)

# Parquet

In [7]:
# Load the Parquet sample and preview the first five rows.
parquet_data = (
    spark.read
    .format("parquet")
    .load("data/parquet/parquet_data.parquet")
)
parquet_data.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [8]:
# Write the Parquet-sourced frame back out as Parquet, replacing any earlier output.
(
    parquet_data.write
    .format("parquet")
    .mode("overwrite")
    .save("data/parquet/my-parquet-file.parquet")
)

# ORC Files

In [9]:
# Load the ORC sample and preview the first five rows.
orc_data = (
    spark.read
    .format("orc")
    .load("data/orc/orc_data.orc")
)
orc_data.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [10]:
# Write the ORC-sourced frame back out as ORC, replacing any earlier output.
(
    orc_data.write
    .format("orc")
    .mode("overwrite")
    .save("data/orc/my-orc-file.orc")
)

# Text Files

In [20]:
# Read the plain-text sample into a DataFrame.
text_data = spark.read.text("data/text/sample_text_data.txt")

# Split the "value" column at ':' and preview the resulting arrays.
(
    text_data
    .selectExpr("split(value, ':') as rows")
    .show()
)

+--------------------+
|                rows|
+--------------------+
|[appName,  Sets t...|
|[Example,  .appNa...|
|                  []|
|[master,  Specifi...|
|[Example,  .maste...|
|                  []|
|[config,  Allows ...|
|[Example,  .confi...|
|                  []|
|[enableHiveSuppor...|
|[Example,  .enabl...|
|                  []|
|[spark.executor.m...|
|[Example,  .confi...|
|                  []|
|[spark.driver.mem...|
|[Example,  .confi...|
|                  []|
|[spark.sql.shuffl...|
|[Example,  .confi...|
+--------------------+
only showing top 20 rows



In [13]:
# Overwrite on re-run, matching the other write cells. Without an explicit
# save mode the writer errors out when the target path already exists, so the
# notebook could not be executed a second time.
text_data.write.mode("overwrite").text("data/text/my-simple-text-files.txt")

In [15]:
# Write the first ten rows as text, partitioned by "count". partitionBy moves
# "count" into the output directory layout, leaving only the string column
# DEST_COUNTRY_NAME to be written into the files themselves.
# mode("overwrite") matches the other write cells; without it a re-run fails
# because the target path already exists.
(
    summery_csv.limit(10)
    .select("DEST_COUNTRY_NAME", "count")
    .write.mode("overwrite")
    .partitionBy("count")
    .text("data/text/five-csv-partition-filespy.csv")
)

In [21]:
# Print the schema of the DataFrame that was read from the text file.
text_data.printSchema()

root
 |-- value: string (nullable = true)

