In [1]:
from datetime import datetime, date

import pandas as pd
import pyspark
import pyspark.pandas as ps
from pyspark.ml.feature import Imputer
from pyspark.sql import Column
from pyspark.sql import Row
from pyspark.sql import SparkSession



In [2]:
# Create the SparkSession for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('Practice Pyspark')
    .getOrCreate()
)

23/07/09 19:40:21 WARN Utils: Your hostname, nMACHINE resolves to a loopback address: 127.0.1.1; using 192.168.0.133 instead (on interface wlp2s0)
23/07/09 19:40:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/07/09 19:40:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/07/09 19:40:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/07/09 19:40:22 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# Evaluate the session as the cell's last expression so the notebook displays its repr.
spark

In [4]:
# Load the student CSV: the first line is the header, and column types are
# inferred from the data instead of defaulting everything to string.
df = spark.read.csv('Data/student.csv', header=True, inferSchema=True)

In [5]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- class: string (nullable = true)
 |-- mark: integer (nullable = true)
 |-- gender: string (nullable = true)



In [6]:
# Preview only the first two rows rather than the whole frame.
df.show(n=2)

+---+--------+-----+----+------+
| id|    name|class|mark|gender|
+---+--------+-----+----+------+
|  1|John Deo| Four|  75|female|
|  2|Max Ruin|Three|  85|  male|
+---+--------+-----+----+------+
only showing top 2 rows



In [7]:
# Project just the name and class columns and preview two rows.
df.select("name", "class").show(n=2)

+--------+-----+
|    name|class|
+--------+-----+
|John Deo| Four|
|Max Ruin|Three|
+--------+-----+
only showing top 2 rows



In [8]:
# List the (column name, type) pairs of the DataFrame.
df.dtypes

[('id', 'int'),
 ('name', 'string'),
 ('class', 'string'),
 ('mark', 'int'),
 ('gender', 'string')]

In [9]:
# Summary statistics (count, mean, stddev, min, max) for every column;
# mean and stddev come back as null for the string columns.
df.describe().show()

+-------+------------------+---------+-----+------------------+------+
|summary|                id|     name|class|              mark|gender|
+-------+------------------+---------+-----+------------------+------+
|  count|                35|       35|   35|                35|    35|
|   mean|              18.0|     null| null| 74.65714285714286|  null|
| stddev|10.246950765959598|     null| null|16.401116994139826|  null|
|    min|                 1|Alex John|Eight|                18|female|
|    max|                35|    Tumyu|Three|                96|  male|
+-------+------------------+---------+-----+------------------+------+



# Adding columns to a PySpark DataFrame

In [10]:
# Add a derived column holding the mark as a fraction of 100.
# withColumn returns a new DataFrame, so the result is bound back to df.
df = df.withColumn(
    colName="marks out of 1",
    col=df["mark"] / 100,
)

In [11]:
# Confirm the new "marks out of 1" column is present.
df.show(n=2)

+---+--------+-----+----+------+--------------+
| id|    name|class|mark|gender|marks out of 1|
+---+--------+-----+----+------+--------------+
|  1|John Deo| Four|  75|female|          0.75|
|  2|Max Ruin|Three|  85|  male|          0.85|
+---+--------+-----+----+------+--------------+
only showing top 2 rows



# Dropping columns from a PySpark DataFrame

In [12]:
# drop() returns a new DataFrame without the column; the result is only
# displayed here, so df itself still carries "marks out of 1".
(
    df
    .drop("marks out of 1")
    .show(n=2)
)

+---+--------+-----+----+------+
| id|    name|class|mark|gender|
+---+--------+-----+----+------+
|  1|John Deo| Four|  75|female|
|  2|Max Ruin|Three|  85|  male|
+---+--------+-----+----+------+
only showing top 2 rows



# Renaming columns in a PySpark DataFrame

In [13]:
# Show the frame with 'name' renamed to 'NAME'. The renamed frame is not
# assigned back, so df keeps the original column name.
(
    df
    .withColumnRenamed('name', 'NAME')
    .show(n=2)
)

+---+--------+-----+----+------+--------------+
| id|    NAME|class|mark|gender|marks out of 1|
+---+--------+-----+----+------+--------------+
|  1|John Deo| Four|  75|female|          0.75|
|  2|Max Ruin|Three|  85|  male|          0.85|
+---+--------+-----+----+------+--------------+
only showing top 2 rows



# Pandas API conversion

In [14]:
# Switch to the pandas-on-Spark API. NOTE: from here until the to_spark() call,
# `df` is a pandas-on-Spark DataFrame rather than a Spark SQL DataFrame.
df = df.pandas_api()

In [15]:
# pandas-style preview of the first two rows.
df.head(n=2)

Unnamed: 0,id,name,class,mark,gender,marks out of 1
0,1,John Deo,Four,75,female,0.75
1,2,Max Ruin,Three,85,male,0.85


In [16]:
# Convert back to a Spark SQL DataFrame and print the schema to check that the
# column types survived the round trip.
df = df.to_spark()
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- class: string (nullable = true)
 |-- mark: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- marks out of 1: double (nullable = true)





In [17]:
# Preview the first two rows of the round-tripped Spark DataFrame.
df.show(n=2)

+---+--------+-----+----+------+--------------+
| id|    name|class|mark|gender|marks out of 1|
+---+--------+-----+----+------+--------------+
|  1|John Deo| Four|  75|female|          0.75|
|  2|Max Ruin|Three|  85|  male|          0.85|
+---+--------+-----+----+------+--------------+
only showing top 2 rows



# Dealing with NULL Values

In [18]:
# Rebind df to a second dataset that contains missing values.
# The header row supplies the column names; types are inferred from the data.
df = spark.read.csv(
    'Data/student_nullvalues.csv',
    header=True,
    inferSchema=True,
)

In [19]:
# show() without arguments prints at most the first 20 rows.
df.show()

+---+-----------+-----+----+------+----------+
| id|       name|class|mark|gender|experience|
+---+-----------+-----+----+------+----------+
|  1|   John Deo| Four|  75|female|         9|
|  2|   Max Ruin|Three|  85|  male|         8|
|  3|     Arnold|Three|  55|  male|         7|
|  4| Krish Star| null|  60|female|         6|
|  5|  John Mike| Four|  60|female|         5|
|  6|  Alex John| Four|  55|  male|         4|
|  7|My John Rob|Fifth|  78|  male|         3|
|  8|     Asruid| Five|  85|  male|         2|
|  9|    Tes Qry|  Six|  78|  male|         1|
| 10|   Big John| Four|  55|female|         2|
| 11|     Ronald|  Six|  89|female|         3|
| 12|      Recky|  Six|  94|female|         4|
| 13|       null|Seven|  88|female|         5|
| 14|       Bigy|Seven|  88|female|      null|
| 15|   Tade Row| Four|  88|  male|      null|
| 16|      Gimmy| Four|  88|  male|         8|
| 17|      Tumyu|  Six|null|  male|         9|
| 18|      Honny| Five|  75|  male|         8|
| 19|      Ti

In [20]:
### Drop rows that contain nulls.
# `how` accepts 'any' (drop a row if any column is null) or 'all' (drop a row
# only if every column is null). When `thresh` is supplied it overrides `how`,
# so passing both is misleading; only `thresh` is given here.
# thresh=6 keeps rows with at least 6 non-null values. The frame has 6 columns,
# so this removes every row that has a null anywhere.
df.na.drop(thresh=6).show()

+---+-----------+-----+----+------+----------+
| id|       name|class|mark|gender|experience|
+---+-----------+-----+----+------+----------+
|  1|   John Deo| Four|  75|female|         9|
|  2|   Max Ruin|Three|  85|  male|         8|
|  3|     Arnold|Three|  55|  male|         7|
|  5|  John Mike| Four|  60|female|         5|
|  6|  Alex John| Four|  55|  male|         4|
|  7|My John Rob|Fifth|  78|  male|         3|
|  8|     Asruid| Five|  85|  male|         2|
|  9|    Tes Qry|  Six|  78|  male|         1|
| 10|   Big John| Four|  55|female|         2|
| 11|     Ronald|  Six|  89|female|         3|
| 12|      Recky|  Six|  94|female|         4|
| 16|      Gimmy| Four|  88|  male|         8|
| 18|      Honny| Five|  75|  male|         8|
| 19|      Tinny| Nine|  18|  male|         7|
| 21| Babby John| Four|  69|female|         5|
| 22|     Reggid|Seven|  55|female|         4|
| 23|      Herod|Eight|  79|  male|         3|
| 24|  Tiddy Now|Seven|  78|  male|         2|
| 25|   Giff 

In [21]:
# Only the `experience` column is inspected: rows where it is null are dropped,
# while rows with nulls in other columns are kept.
df.dropna(how='any', subset=['experience']).show()

+---+-----------+-----+----+------+----------+
| id|       name|class|mark|gender|experience|
+---+-----------+-----+----+------+----------+
|  1|   John Deo| Four|  75|female|         9|
|  2|   Max Ruin|Three|  85|  male|         8|
|  3|     Arnold|Three|  55|  male|         7|
|  4| Krish Star| null|  60|female|         6|
|  5|  John Mike| Four|  60|female|         5|
|  6|  Alex John| Four|  55|  male|         4|
|  7|My John Rob|Fifth|  78|  male|         3|
|  8|     Asruid| Five|  85|  male|         2|
|  9|    Tes Qry|  Six|  78|  male|         1|
| 10|   Big John| Four|  55|female|         2|
| 11|     Ronald|  Six|  89|female|         3|
| 12|      Recky|  Six|  94|female|         4|
| 13|       null|Seven|  88|female|         5|
| 16|      Gimmy| Four|  88|  male|         8|
| 17|      Tumyu|  Six|null|  male|         9|
| 18|      Honny| Five|  75|  male|         8|
| 19|      Tinny| Nine|  18|  male|         7|
| 20|     Jackly| Nine|  65|  null|         6|
| 21| Babby J

In [22]:
### Fill missing values
# An integer fill value is applied to the numeric columns only; nulls in the
# string columns (e.g. name, class) are left as they are.
df.fillna(1_000_000_000).show()

+---+-----------+-----+----------+------+----------+
| id|       name|class|      mark|gender|experience|
+---+-----------+-----+----------+------+----------+
|  1|   John Deo| Four|        75|female|         9|
|  2|   Max Ruin|Three|        85|  male|         8|
|  3|     Arnold|Three|        55|  male|         7|
|  4| Krish Star| null|        60|female|         6|
|  5|  John Mike| Four|        60|female|         5|
|  6|  Alex John| Four|        55|  male|         4|
|  7|My John Rob|Fifth|        78|  male|         3|
|  8|     Asruid| Five|        85|  male|         2|
|  9|    Tes Qry|  Six|        78|  male|         1|
| 10|   Big John| Four|        55|female|         2|
| 11|     Ronald|  Six|        89|female|         3|
| 12|      Recky|  Six|        94|female|         4|
| 13|       null|Seven|        88|female|         5|
| 14|       Bigy|Seven|        88|female|1000000000|
| 15|   Tade Row| Four|        88|  male|1000000000|
| 16|      Gimmy| Four|        88|  male|     

In [23]:
### Handle missing values with the Imputer estimator (strategy: mean, median or mode)
# `Imputer` is imported in the notebook's top import cell.
# Each input column gets a companion "<column>_imputed" output column, so the
# original columns are left untouched for comparison.
imputer = Imputer(
    strategy="mean",
    inputCols=["mark", "experience"],
    outputCols=["mark_imputed", "experience_imputed"],
)

In [24]:
# Fit the imputer on df, then apply it to the same frame and display the result.
(
    imputer
    .fit(df)        # learn the fill value (mean) for each input column
    .transform(df)  # append the mark_imputed / experience_imputed columns
    .show()
)

+---+-----------+-----+----+------+----------+------------+------------------+
| id|       name|class|mark|gender|experience|mark_imputed|experience_imputed|
+---+-----------+-----+----+------+----------+------------+------------------+
|  1|   John Deo| Four|  75|female|         9|          75|                 9|
|  2|   Max Ruin|Three|  85|  male|         8|          85|                 8|
|  3|     Arnold|Three|  55|  male|         7|          55|                 7|
|  4| Krish Star| null|  60|female|         6|          60|                 6|
|  5|  John Mike| Four|  60|female|         5|          60|                 5|
|  6|  Alex John| Four|  55|  male|         4|          55|                 4|
|  7|My John Rob|Fifth|  78|  male|         3|          78|                 3|
|  8|     Asruid| Five|  85|  male|         2|          85|                 2|
|  9|    Tes Qry|  Six|  78|  male|         1|          78|                 1|
| 10|   Big John| Four|  55|female|         2|      

# Filter Operations

In [25]:
# Filter with a SQL expression string: keep rows whose mark is below 50.
df.where("mark<50").show()

+---+-----+-----+----+------+----------+
| id| name|class|mark|gender|experience|
+---+-----+-----+----+------+----------+
| 19|Tinny| Nine|  18|  male|         7|
+---+-----+-----+----+------+----------+



In [26]:
# Filter with Column expressions: keep marks strictly between 50 and 80.
# Each comparison needs its own parentheses because `&` binds tighter than `>` / `<`.
df.where(
    (df["mark"] > 50)
    & (df["mark"] < 80)
).show()

+---+-----------+-----+----+------+----------+
| id|       name|class|mark|gender|experience|
+---+-----------+-----+----+------+----------+
|  1|   John Deo| Four|  75|female|         9|
|  3|     Arnold|Three|  55|  male|         7|
|  4| Krish Star| null|  60|female|         6|
|  5|  John Mike| Four|  60|female|         5|
|  6|  Alex John| Four|  55|  male|         4|
|  7|My John Rob|Fifth|  78|  male|         3|
|  9|    Tes Qry|  Six|  78|  male|         1|
| 10|   Big John| Four|  55|female|         2|
| 18|      Honny| Five|  75|  male|         8|
| 20|     Jackly| Nine|  65|  null|         6|
| 21| Babby John| Four|  69|female|         5|
| 22|     Reggid|Seven|  55|female|         4|
| 23|      Herod|Eight|  79|  male|         3|
| 24|  Tiddy Now|Seven|  78|  male|         2|
| 29|Tess Played|Seven|  55|  male|         5|
| 30|  Reppy Red|  Six|  79|female|         6|
| 34|   Gain Toe|Seven|  69|  male|         8|
+---+-----------+-----+----+------+----------+



# GroupBy and Aggregate Functions

In [27]:
# Average every numeric column within each class.
# NOTE(review): this also averages `id`, which is an identifier rather than a
# measurement; pass column names to avg() if only mark/experience are wanted.
(
    df
    .groupBy('class')
    .avg()
    .show()
)

+-----+------------------+-----------------+-----------------+
|class|           avg(id)|        avg(mark)|  avg(experience)|
+-----+------------------+-----------------+-----------------+
| Five|              13.0|             80.0|              5.0|
|  Six|              21.0|87.33333333333333|5.571428571428571|
|Eight|              23.0|             79.0|              3.0|
| Four|            13.125|            72.25|5.714285714285714|
| null|               4.0|             60.0|              6.0|
|Seven|              24.7|77.44444444444444|4.333333333333333|
| Nine|              19.5|             41.5|              6.5|
|Three|10.666666666666666|73.66666666666667|              6.0|
|Fifth|               7.0|             78.0|              3.0|
+-----+------------------+-----------------+-----------------+



In [28]:
# Aggregate over the whole frame (no grouping): the average of `mark`.
df.agg({'mark': 'mean'}).show()

+-----------------+
|        avg(mark)|
+-----------------+
|75.15151515151516|
+-----------------+

