In [0]:
# `spark` is used below without being created in this cell, so the runtime is
# presumably expected to provide the session. Where it does not, create one first:
#   from pyspark.sql import SparkSession
#   spark = SparkSession.builder.appName("Spark Read").getOrCreate()
#
# The read location is a URI (uniform resource identifier); the scheme prefix
# names the storage system, e.g. dbfs:/, hive:/, hdfs:/, gs:/, adls:/, s3:/
custs_uri = "dbfs:/Volumes/workspace/default/tblcustomer/custs"

# No reader options are passed, so the CSV reader runs with its defaults.
df = spark.read.csv(custs_uri)
df.display()

In [0]:
# Default read again, this time from a path written without a URI scheme.
# `header` is not set here, so the first line of the file is loaded as an
# ordinary data row and the columns receive auto-generated names.
population_csv = "/Volumes/inceptez_catalog/inputdb/customerdata/countries_population.csv"
df = spark.read.csv(population_csv)
df.display()


In [0]:
# header=True: take the column names from the first line of the file
# instead of loading that line as data.
population_csv = "/Volumes/inceptez_catalog/inputdb/customerdata/countries_population.csv"
df = spark.read.csv(population_csv, header=True)
df.display()

In [0]:
# Print the column names and types of the current `df`.
# If `df` still comes from the header-only read above (no inferSchema and no
# explicit schema), expect every column to be reported as string.
df.printSchema()

In [0]:
# Read with explicit keyword options:
#   header=True      -> the first line supplies the column names
#   inferSchema=True -> Spark inspects the data to choose column types
#   sep=","          -> field delimiter
df = spark.read.csv(
    "/Volumes/inceptez_catalog/inputdb/customerdata/countries_population.csv",
    sep=",",
    inferSchema=True,
    header=True,
)

# Show the resulting structure (column names and types) of the DataFrame.
df.printSchema()


In [0]:
# Same read as the keyword-argument form, but the settings are supplied
# through the reader's options() builder before csv() is called.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/inceptez_catalog/inputdb/customerdata/countries_population.csv")
)

df.printSchema()



### Generic way to read and load data into a DataFrame

In [0]:
# format(...).load(...) is the generic form of the read: the source type is
# passed as a string instead of calling a format-specific method such as csv().
df = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load("/Volumes/inceptez_catalog/inputdb/customerdata/countries_population.csv")
)

# count() is an action: it runs the read and returns the number of rows.
cnt = df.count()

# Leave the count as the cell's last expression so the notebook renders it;
# otherwise the job runs and its result is never shown.
cnt


### Reading data from multiple files

In [0]:

# Two ways to load several CSV files into one DataFrame. Each assignment
# replaces `df`; the second form is shown as an alternative to the first.

# 1) Glob pattern: every *.csv file directly under the customerdata directory.
df = spark.read.option("header", True).csv("dbfs:/Volumes/inceptez_catalog/inputdb/customerdata/*.csv")

# 2) Explicit list of files. Each entry is a complete path, i.e. the
#    customerdata directory, a "/" separator, then the file name.
df = spark.read.csv(
    [
        "dbfs:/Volumes/inceptez_catalog/inputdb/customerdata/countries_population.csv",
        "dbfs:/Volumes/inceptez_catalog/inputdb/customerdata/countries_population1.csv",
    ],
    header=True,
    inferSchema=True,
)

In [0]:
# The same multi-file reads expressed with the generic format(...).load(...)
# API. Each assignment replaces `df`; the second is an alternative to the first.

# 1) Glob pattern: every *.csv file directly under the customerdata directory.
df = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load("dbfs:/Volumes/inceptez_catalog/inputdb/customerdata/*.csv")
)

# 2) Explicit list of files. Each entry is a complete path, i.e. the
#    customerdata directory, a "/" separator, then the file name.
df = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load(
        [
            "dbfs:/Volumes/inceptez_catalog/inputdb/customerdata/countries_population.csv",
            "dbfs:/Volumes/inceptez_catalog/inputdb/customerdata/countries_population1.csv",
        ]
    )
)


### Provide the schema as a SQL DDL string

[PySpark SQL Datatypes](https://spark.apache.org/docs/latest/sql-ref-datatypes.html)

In [0]:
# Schema written as a DDL-style string: comma-separated "<column> <type>" pairs.
# Joining with ", " produces one single-line definition string.
strschema = ", ".join([
    "country_id int",
    "name string",
    "nationality string",
    "country_code string",
    "iso_alpha2 string",
    "capital string",
    "population int",
    "area_km2 double",
    "region_id int",
    "sub_region_id int",
])

# The schema is supplied explicitly via schema(), so inferSchema is not used.
df = (
    spark.read.format("csv")
    .options(header=True)
    .schema(strschema)
    .load("/Volumes/inceptez_catalog/inputdb/customerdata/countries_population.csv")
)

df.printSchema()

### Define schema programmatically
[Data Types](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/data_types.html)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# (column name, Spark data type) pairs; the order presumably mirrors the
# column order of the CSV file -- verify against the data.
country_columns = [
    ('COUNTRY_ID', IntegerType()),
    ('NAME', StringType()),
    ('NATIONALITY', StringType()),
    ('COUNTRY_CODE', StringType()),
    ('ISO_ALPHA2', StringType()),
    ('CAPITAL', StringType()),
    ('POPULATION', IntegerType()),
    ('AREA_KM2', DoubleType()),
    ('REGION_ID', IntegerType()),
    ('SUB_REGION_ID', IntegerType()),
]

# Build the schema object; the third StructField argument marks every column nullable.
countryschema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in country_columns]
)

# Read the CSV with the programmatic schema applied.
df = (
    spark.read.format("csv")
    .option("delimiter", ",")
    .option("header", "true")
    .schema(countryschema)
    .load("dbfs:/Volumes/workspace/default/countries/countries_population.csv")
)


In [0]:
# To read 10 records
# Keep only 10 records: limit(10) is applied on top of the loaded DataFrame.
df = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load("/Volumes/inceptez_catalog/inputdb/customerdata/countries_population.csv")
    .limit(10)
)

# Last expression of the cell: renders the schema as a StructType object.
df.schema
