# Create a DataFrame from a CSV file

In [1]:
# findspark.init() is called before `import pyspark` on purpose - keep this order.
# NOTE(review): presumably init() locates the local Spark installation and makes
# the pyspark package importable - confirm against the findspark documentation.
import findspark
findspark.init()

import pyspark

In [2]:
from pyspark.sql import SparkSession

# Build the Spark session (or reuse one that is already running) that the
# rest of the notebook reads data through.
spark = (
    SparkSession.builder
    .appName("CSV to Dataset")
    .getOrCreate()
)

In [3]:
# Read the CSV data without passing any reader options, to see what Spark
# produces when it is given only a path.
# The path uses forward slashes throughout: the previous mix of '/' and single
# backslashes contained invalid escape sequences (\D, \B, \S, \d) that only
# worked because Python currently leaves unknown escapes untouched.
df = spark.read.csv('C:/Users/usuario/Documents/Blanca/Spark/data')

In [4]:
# Preview the first five rows. The fields are separated by ';', so each input
# line ends up whole in a single column (_c0).
df.show(n=5)

+--------------------+
|                 _c0|
+--------------------+
|"id_local";"id_di...|
|"285016136";"8";"...|
|"285016137";"8";"...|
|"285016140";"8";"...|
|"285016143";"8";"...|
+--------------------+
only showing top 5 rows



In [4]:
# Print the column names and types Spark assigned to the option-less read.
df.printSchema()

root
 |-- _c0: string (nullable = true)



In [5]:
# Read the CSV data again, this time with explicit reader options:
#   delimiter   - field separator
#   header      - take the column names from the first line
#   inferSchema - let Spark infer the column types
# Booleans are passed as real booleans instead of the strings 'True'.
# The path uses forward slashes throughout; the previous mix of '/' and single
# backslashes relied on invalid escape sequences (\D, \B, \S, \d).
# NOTE(review): the rows shown earlier for this same path are ';'-separated,
# so delimiter=',' may not be the right separator - confirm against the data.
df2 = (
    spark.read
    .options(delimiter=',', header=True, inferSchema=True)
    .csv('C:/Users/usuario/Documents/Blanca/Spark/data')
)

# Show the first five rows without truncating long cell values.
df2.show(5, truncate=False)

+----------------------+
|WAR AND PEACE         |
+----------------------+
|By Leo Tolstoy/Tolstoi|
|CONTENTS              |
|BOOK ONE: 1805        |
|CHAPTER I             |
|CHAPTER II            |
+----------------------+
only showing top 5 rows



In [6]:
# Print the column names and types produced by the read with options.
df2.printSchema()

root
 |-- WAR AND PEACE: string (nullable = true)



## Reading CSV files with a custom schema

In [44]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DoubleType

In [47]:
# Explicit schema: three nullable string columns, applied to the file's
# columns in order.
# NOTE(review): judging by the rows this cell prints, the labels do not match
# the data - 'name' holds identifier codes (00A, 00AK), 'local_code' holds the
# airport type (heliport, small_airport) and 'category' holds the airport
# name. The names are kept unchanged so that any later cell referring to them
# keeps working; consider renaming them (e.g. ident / type / name).
schema = StructType([
    StructField('name', StringType(), True),
    StructField('local_code', StringType(), True),
    StructField('category', StringType(), True),
])

# header=True makes Spark treat the first line as a header, not as data.
# The path uses forward slashes throughout; the previous mix of '/' and single
# backslashes relied on invalid escape sequences (\D, \B, \S).
df3 = spark.read.csv(
    'C:/Users/usuario/Documents/Blanca/Spark/practica_pyspark/airport_codes.csv',
    schema=schema,
    header=True,
)
df3.printSchema()
df3.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- local_code: string (nullable = true)
 |-- category: string (nullable = true)

+----+-------------+----------------------------------+
|name|local_code   |category                          |
+----+-------------+----------------------------------+
|00A |heliport     |Total Rf Heliport                 |
|00AK|small_airport|Lowell Field                      |
|00AL|small_airport|Epps Airpark                      |
|00AR|heliport     |Newport Hospital & Clinic Heliport|
|00AZ|small_airport|Cordes Airport                    |
|00CA|small_airport|Goldstone /Gts/ Airport           |
|00CO|small_airport|Cass Field                        |
|00FA|small_airport|Grass Patch Airport               |
|00FD|heliport     |Ringhaver Heliport                |
|00FL|small_airport|River Oak Airport                 |
|00GA|small_airport|Lt World Airport                  |
|00GE|heliport     |Caffrey Heliport                  |
|00HI|heliport     |Kaupulehu Helipo