In [1]:
import pyspark
from datetime import datetime, date
from pyspark.sql import Row
from pyspark.sql import Column
from pyspark.sql import SparkSession

<h1><center>Reading and writing a csv file</center></h1>

In [2]:
# Create the SparkSession (or reuse one that is already running) under the
# application name 'Practice Pyspark'.
session_builder = SparkSession.builder.appName('Practice Pyspark')
spark = session_builder.getOrCreate()

23/07/09 17:48:43 WARN Utils: Your hostname, nMACHINE resolves to a loopback address: 127.0.1.1; using 192.168.0.133 instead (on interface wlp2s0)
23/07/09 17:48:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/07/09 17:48:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Variant 1: chain individual .option() calls, then use the csv() shortcut.
# The first row is treated as the header and column types are inferred.
df1 = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("delimiter", ",")
    .csv('Data/student.csv')
)

In [4]:
# Variant 2: name the format explicitly, pass all options in one .options()
# call, and finish with the generic load().
csv_reader = spark.read.format("csv").options(
    header='True',
    inferSchema='True',
    delimiter=',',
)
df2 = csv_reader.load('Data/student.csv')

In [5]:
# Preview the first two rows read via the option()/csv() variant.
df1.show(n=2)

+---+--------+-----+----+------+
| id|    name|class|mark|gender|
+---+--------+-----+----+------+
|  1|John Deo| Four|  75|female|
|  2|Max Ruin|Three|  85|  male|
+---+--------+-----+----+------+
only showing top 2 rows



In [6]:
# Preview the first two rows read via the format()/options()/load() variant.
df2.show(n=2)

+---+--------+-----+----+------+
| id|    name|class|mark|gender|
+---+--------+-----+----+------+
|  1|John Deo| Four|  75|female|
|  2|Max Ruin|Three|  85|  male|
+---+--------+-----+----+------+
only showing top 2 rows



In [7]:
# Print the column names, types and nullability that Spark inferred for df2.
df2.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- class: string (nullable = true)
 |-- mark: integer (nullable = true)
 |-- gender: string (nullable = true)



### `inferSchema` requires an extra pass over the data, so supplying an explicit schema is recommended

In [8]:
from pyspark.sql.types import StructType, StringType, IntegerType

In [9]:
# Declare the column types up front instead of asking Spark to infer them.
# Each entry is (column name, Spark type); every column is declared nullable.
schema = StructType()
for column_name, column_type in [
    ("id", IntegerType()),
    ("name", StringType()),
    ("class", StringType()),
    ("mark", IntegerType()),
    ("gender", StringType()),
]:
    # StructType.add appends the field to the schema in place.
    schema.add(column_name, column_type, True)

# Same reader as before, minus inferSchema, plus the explicit schema.
csv_reader = spark.read.format("csv").options(header='True', delimiter=',')
df3 = csv_reader.schema(schema).load('Data/student.csv')

In [10]:
# Preview the first two rows read with the explicit schema.
df3.show(n=2)

+---+--------+-----+----+------+
| id|    name|class|mark|gender|
+---+--------+-----+----+------+
|  1|John Deo| Four|  75|female|
|  2|Max Ruin|Three|  85|  male|
+---+--------+-----+----+------+
only showing top 2 rows



In [11]:
# Print df3's schema; it reflects the explicitly supplied StructType.
df3.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- class: string (nullable = true)
 |-- mark: integer (nullable = true)
 |-- gender: string (nullable = true)



## Other Features while reading csv
- Read Multiple CSV Files ==> 
df = spark.read.csv(["path1", "path2", "path3"])
- Read all CSV Files in a Directory ==> 
df = spark.read.csv("Folder path")
- quote
When a column value contains the delimiter that is used to split the columns,
use the quote option to specify the quote character. By default it is " and
delimiters inside quotes are ignored, but with this option you can set any character.
df = spark.read.option("quote", "'").csv(csv_path)
- nullValue
Using the nullValue option you can specify a string in the CSV that should be read as null. For example,
if you want a date column with the value "1900-01-01" to be set to null on the DataFrame:
df = spark.read.option("nullValue", "1900-01-01").csv(csv_path)

## Writing DF to csv

In [14]:
# Write df2 out as CSV with a header row, replacing any previous output
# at Data/written_student_csv.
(
    df2.write
    .options(header='True', delimiter=',')
    .mode('overwrite')
    .csv("Data/written_student_csv")
)

### Saving modes
overwrite – replaces any data that already exists at the output path.

append – adds the new data to the data that already exists at the output path.

ignore – silently skips the write operation when the output path already exists.

error (or errorifexists) – the default mode; raises an error when the output path already exists.

<h1><center>Filter Operations</center></h1>

In [16]:
# Equality filter using attribute-style column access (df3.gender).
is_male = df3.gender == 'male'
df3.filter(is_male).show(n=2)

+---+--------+-----+----+------+
| id|    name|class|mark|gender|
+---+--------+-----+----+------+
|  2|Max Ruin|Three|  85|  male|
|  3|  Arnold|Three|  55|  male|
+---+--------+-----+----+------+
only showing top 2 rows



In [17]:
# Inequality filter: rows whose gender is not 'male', written with !=.
not_male = df3.gender != 'male'
df3.filter(not_male).show(n=2)

+---+----------+-----+----+------+
| id|      name|class|mark|gender|
+---+----------+-----+----+------+
|  1|  John Deo| Four|  75|female|
|  4|Krish Star| Four|  60|female|
+---+----------+-----+----+------+
only showing top 2 rows



In [18]:
# Negate an equality condition with ~ to select rows whose gender is not 'male'.
is_male = df3.gender == 'male'
df3.filter(~is_male).show(n=2)

+---+----------+-----+----+------+
| id|      name|class|mark|gender|
+---+----------+-----+----+------+
|  1|  John Deo| Four|  75|female|
|  4|Krish Star| Four|  60|female|
+---+----------+-----+----+------+
only showing top 2 rows



In [20]:
from pyspark.sql.functions import col
# Equality filter using col(), which refers to the column by name only
# rather than through a specific DataFrame variable.
gender_column = col("gender")
df3.filter(gender_column == "male").show(n=2)

+---+--------+-----+----+------+
| id|    name|class|mark|gender|
+---+--------+-----+----+------+
|  2|Max Ruin|Three|  85|  male|
|  3|  Arnold|Three|  55|  male|
+---+--------+-----+----+------+
only showing top 2 rows



In [21]:
# Equality filter expressed as a SQL expression string.
male_predicate = "gender == 'male'"
df3.filter(male_predicate).show(n=2)

+---+--------+-----+----+------+
| id|    name|class|mark|gender|
+---+--------+-----+----+------+
|  2|Max Ruin|Three|  85|  male|
|  3|  Arnold|Three|  55|  male|
+---+--------+-----+----+------+
only showing top 2 rows



In [27]:
# Combine two conditions with & (logical AND).
is_male = df3.gender == 'male'
# `class` is a Python keyword, so this column needs bracket access.
in_class_four = df3['class'] == 'Four'
df3.filter(is_male & in_class_four).show(n=2)

+---+---------+-----+----+------+
| id|     name|class|mark|gender|
+---+---------+-----+----+------+
|  6|Alex John| Four|  55|  male|
| 15| Tade Row| Four|  88|  male|
+---+---------+-----+----+------+
only showing top 2 rows



In [29]:
# Membership filter: keep rows whose class is one of the listed values.
li = ["Three", "Four"]
class_column = df3['class']
df3.filter(class_column.isin(li)).show(n=2)

+---+--------+-----+----+------+
| id|    name|class|mark|gender|
+---+--------+-----+----+------+
|  1|John Deo| Four|  75|female|
|  2|Max Ruin|Three|  85|  male|
+---+--------+-----+----+------+
only showing top 2 rows



In [31]:
# String predicates on the class column: prefix, suffix and substring match.
# Each filtered result is shown in turn, in the same order as listed here.
class_column = df3['class']
string_predicates = (
    class_column.startswith("T"),
    class_column.endswith("ee"),
    class_column.contains("Th"),
)
for predicate in string_predicates:
    df3.filter(predicate).show()

+---+--------+-----+----+------+
| id|    name|class|mark|gender|
+---+--------+-----+----+------+
|  2|Max Ruin|Three|  85|  male|
|  3|  Arnold|Three|  55|  male|
| 27|Big Nose|Three|  81|female|
+---+--------+-----+----+------+

+---+--------+-----+----+------+
| id|    name|class|mark|gender|
+---+--------+-----+----+------+
|  2|Max Ruin|Three|  85|  male|
|  3|  Arnold|Three|  55|  male|
| 27|Big Nose|Three|  81|female|
+---+--------+-----+----+------+

+---+--------+-----+----+------+
| id|    name|class|mark|gender|
+---+--------+-----+----+------+
|  2|Max Ruin|Three|  85|  male|
|  3|  Arnold|Three|  55|  male|
| 27|Big Nose|Three|  81|female|
+---+--------+-----+----+------+



In [32]:
# SQL LIKE filter: % matches any run of characters, so this keeps names
# that contain "Ruin".
name_has_ruin = df3.name.like("%Ruin%")
df3.filter(name_has_ruin).show(n=2)

+---+--------+-----+----+------+
| id|    name|class|mark|gender|
+---+--------+-----+----+------+
|  2|Max Ruin|Three|  85|  male|
+---+--------+-----+----+------+



In [33]:
# Regex filter: names ending in "ruin", ignoring case via the (?i) flag.
# rlike succeeds when the pattern is found anywhere in the value, so no leading
# wildcard is needed. The previous pattern "(?i)^*ruin$" applied a quantifier to
# the ^ anchor, which is malformed and only worked because the regex engine
# happened to tolerate it.
df3.filter(df3.name.rlike("(?i)ruin$")).show(2)

+---+--------+-----+----+------+
| id|    name|class|mark|gender|
+---+--------+-----+----+------+
|  2|Max Ruin|Three|  85|  male|
+---+--------+-----+----+------+



In [34]:
# Rows where name is null (none in this dataset, so the result is empty).
name_missing = df3.name.isNull()
df3.filter(name_missing).show(n=2)

+---+----+-----+----+------+
| id|name|class|mark|gender|
+---+----+-----+----+------+
+---+----+-----+----+------+



In [35]:
# Rows where name is present (not null).
name_present = df3.name.isNotNull()
df3.filter(name_present).show(n=2)

+---+--------+-----+----+------+
| id|    name|class|mark|gender|
+---+--------+-----+----+------+
|  1|John Deo| Four|  75|female|
|  2|Max Ruin|Three|  85|  male|
+---+--------+-----+----+------+
only showing top 2 rows



In [37]:
# Build a small in-memory DataFrame with a nested struct column (name) and an
# array column (languages), then filter on an array element and a struct field.
from pyspark.sql.functions import array_contains
from pyspark.sql.types import ArrayType
from pyspark.sql.types import StructField

# Each row: ((firstname, middlename, lastname), languages, state, gender).
data = [
    (("James", "", "Smith"), ["Java", "Scala", "C++"], "OH", "M"),
    (("Anna", "Rose", ""), ["Spark", "Java", "C++"], "NY", "F"),
    (("Julia", "", "Williams"), ["CSharp", "VB"], "OH", "F"),
    (("Maria", "Anne", "Jones"), ["CSharp", "VB"], "NY", "M"),
    (("Jen", "Mary", "Brown"), ["CSharp", "VB"], "NY", "M"),
    (("Mike", "Mary", "Williams"), ["Python", "VB"], "OH", "M"),
]

# The name column is itself a struct of three nullable strings.
name_struct = StructType([
    StructField('firstname', StringType(), True),
    StructField('middlename', StringType(), True),
    StructField('lastname', StringType(), True),
])

schema = StructType([
    StructField('name', name_struct),
    StructField('languages', ArrayType(StringType()), True),
    StructField('state', StringType(), True),
    StructField('gender', StringType(), True),
])

df4 = spark.createDataFrame(data, schema)
df4.printSchema()
df4.show(n=2)

# Array filter: rows whose languages array contains "Java".
knows_java = array_contains(df4.languages, "Java")
df4.filter(knows_java).show(n=2)

# Struct filter: rows whose nested name.lastname field equals "Williams".
lastname_is_williams = df4.name.lastname == "Williams"
df4.filter(lastname_is_williams).show(n=2)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)

+----------------+------------------+-----+------+
|            name|         languages|state|gender|
+----------------+------------------+-----+------+
|{James, , Smith}|[Java, Scala, C++]|   OH|     M|
|  {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
+----------------+------------------+-----+------+
only showing top 2 rows

+----------------+------------------+-----+------+
|            name|         languages|state|gender|
+----------------+------------------+-----+------+
|{James, , Smith}|[Java, Scala, C++]|   OH|     M|
|  {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
+----------------+------------------+-----+------+

+------------------

<h1><center>Add, Rename and Drop Columns, Change datatype of Column</center></h1>

# Column operations, run against df3 (columns: id, name, class, mark, gender).
# The earlier version used an undefined `df`, a non-existent `salary` column and
# an un-imported `lit`, so it could not run on a fresh kernel.
from pyspark.sql.functions import col, lit

# Add a column derived from an existing one.
df3.withColumn("CopiedColumn", col("mark") * -1).show()

# Add a column holding the same constant value in every row.
df3.withColumn("Country", lit("USA")).show()

# Chain withColumn calls to add several columns in one expression.
df6 = df3.withColumn("Country", lit("USA")).withColumn("anotherColumn", lit("anotherValue"))

# Rename a column.
df3.withColumnRenamed("gender", "sex").show()

# Drop a column.
df3.drop("mark").show()

# Change a column's datatype. mark is already an integer in df3, so cast it to
# double to make the type change observable.
df3.withColumn("mark", col("mark").cast("double")).show()