In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder \
        .master("local[*]") \
        .appName("IMPORTING_CSV") \
        .getOrCreate()

In [27]:
people = spark.read.format("csv")\
        .option("header","true") \
        .option("path", "csv_files/mall_customers.csv")\
        .load()

In [28]:
people.show()

+----------+------+----+-------------+--------------+
|CustomerID|Gender| Age|Annual_Income|Spending_Score|
+----------+------+----+-------------+--------------+
|         1|  Male|  19|           15|            39|
|         2|  Male|  21|           15|            81|
|         3|Female|NULL|           16|             6|
|         4|Female|  23|           16|            77|
|         5|Female|  31|           17|            40|
|         6|Female|  22|           17|            76|
|         7|Female|  35|           18|             6|
|         8|Female|  23|           18|            94|
|         9|  Male|  64|           19|             3|
|        10|Female|  30|           19|            72|
|        11|  Male|  67|           19|            14|
|        12|Female|  35|           19|            99|
|        13|Female|  58|           20|            15|
|        14|Female|  24|           20|            77|
|        15|  Male|  37|           20|            13|
|        16|  Male|  22|    

### here, it just creates DAG and instantiates the path of execution but not actual transformations, since it is lazily evaluated

In [29]:
output = people.select(people.Age).orderBy(people.Spending_Score) 

In [31]:
output.count()

200

In [32]:
output.show()

+---+
|Age|
+---+
| 37|
| 34|
| 19|
| 36|
| 59|
| 25|
| 37|
| 40|
| 52|
| 67|
| 54|
| 49|
| 59|
| 58|
| 58|
| 46|
| 47|
| 47|
| 42|
| 43|
+---+
only showing top 20 rows



In [33]:
## creating temporary view
output.createOrReplaceTempView("peopless")

In [35]:
spark.sql("select * from peopless").show()

+---+
|Age|
+---+
| 37|
| 34|
| 19|
| 36|
| 59|
| 25|
| 37|
| 40|
| 52|
| 67|
| 54|
| 49|
| 59|
| 58|
| 58|
| 46|
| 47|
| 47|
| 42|
| 43|
+---+
only showing top 20 rows



### DataFrame Reader

In [38]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType

myschema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("friends", IntegerType(), True)])
    

In [40]:
friends = spark.read.format("csv")\
        .schema(myschema)\
        .option("header","true") \
        .option("path", "csv_files/mall_customers.csv")\
        .load()

In [41]:
friends.printSchema()

root
 |-- userID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- friends: integer (nullable = true)



### DataFrame Writer
It saves or writes data to a specific datasource

In [47]:
import pyspark.sql.functions as func
from pyspark.sql.functions import col

output = friends.select("*").where(col("age")>30).withColumn('insert_ts', func.current_timestamp()) \
    .orderBy(col("userID")).cache()

In [None]:
output.write\
.format("csv")\
.option("path","file:///data_sources/output_files/")\
.partitionBy("age")\
.save()

In [53]:
!pwd

/home/dev/release
