## Skillshare: Big Data Analysis with Apache Spark - PySpark (Python)

### Session

In [1]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session if there is one,
# otherwise it starts a new session under the given application name.
spark = (
    SparkSession.builder
    .appName('SparkTutorial')
    .getOrCreate()
)
type(spark)

pyspark.sql.session.SparkSession

In [2]:
# Bare expression: the notebook displays the session object as the cell output.
spark

### Reading JSON

In [3]:
# Load the JSON file into a DataFrame. No schema is passed, so the column
# types are inferred from the data.
df = spark.read.json('student.json')
# Bare expression: the cell output is the DataFrame's repr, which lists the
# column names and types but no rows.
df

DataFrame[grade: bigint, name: string]

In [4]:
# Print the rows as a text table.
df.show()

+-----+-----+
|grade| name|
+-----+-----+
|    4| John|
|    9|Marry|
|    7|Peter|
+-----+-----+



In [5]:
# Print the schema as a tree: one line per column with its type and nullability.
df.printSchema()

root
 |-- grade: long (nullable = true)
 |-- name: string (nullable = true)



In [6]:
# Column names as a plain Python list of strings.
df.columns

['grade', 'name']

In [7]:
# Number of rows in the DataFrame.
df.count()

3

In [8]:
# Summary statistics (count, mean, stddev, min, max) for each column.
# For the string column `name`, mean and stddev come back as null.
df.describe().show()

+-------+-----------------+-----+
|summary|            grade| name|
+-------+-----------------+-----+
|  count|                3|    3|
|   mean|6.666666666666667| null|
| stddev|2.516611478423583| null|
|    min|                4| John|
|    max|                9|Peter|
+-------+-----------------+-----+



In [9]:
# First two rows, returned to Python as a list of Row objects.
df.head(2)

[Row(grade=4, name='John'), Row(grade=9, name='Marry')]

### Define Custom Schema

In [11]:
from pyspark.sql.types import (
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Explicit schema: column order and types are stated here instead of being
# inferred, so `grade` is read as an integer and `name` comes first.
schema = StructType([
    StructField('name', StringType()),
    StructField('grade', IntegerType()),
])
df = spark.read.json('student.json', schema=schema)
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- grade: integer (nullable = true)



### DataFrame as SQL Table

In [12]:
df.createOrReplaceTempView('student')  # registers df as a temporary view named 'student' (replacing any existing view of that name) so it can be queried via spark.sql

In [15]:
# Query the 'student' view with plain SQL and display the result.
spark.sql("SELECT * FROM student").show()

+-----+-----+
| name|grade|
+-----+-----+
| John|    4|
|Marry|    9|
|Peter|    7|
+-----+-----+



In [14]:
# Same view, filtered in SQL: only the rows whose grade is above 5.
spark.sql("SELECT * FROM student WHERE grade > 5").show()

+-----+-----+
| name|grade|
+-----+-----+
|Marry|    9|
|Peter|    7|
+-----+-----+



### DataFrame Operations

* Select column
* Create new column
* Rename column

In [16]:
# select() returns a new DataFrame holding only the requested column; without
# .show() the cell output is just its repr (column name and type).
df.select('grade')

DataFrame[grade: int]

In [17]:
# Display the rows of the single selected column.
df.select('grade').show()

+-----+
|grade|
+-----+
|    4|
|    9|
|    7|
+-----+



In [18]:
# Several columns can be selected by passing a list of names.
df.select(['name', 'grade']).show()

+-----+-----+
| name|grade|
+-----+-----+
| John|    4|
|Marry|    9|
|Peter|    7|
+-----+-----+



In [21]:
# Add a derived column holding grade squared. The result is a new DataFrame,
# and the ** expression produces a double column even though grade is an int.
df.withColumn('new_grade', df['grade']**2)

DataFrame[name: string, grade: int, new_grade: double]

In [22]:
# Same derived column, this time displayed with its rows.
df.withColumn('new_grade', df['grade']**2).show()

+-----+-----+---------+
| name|grade|new_grade|
+-----+-----+---------+
| John|    4|     16.0|
|Marry|    9|     81.0|
|Peter|    7|     49.0|
+-----+-----+---------+



In [23]:
# df itself has no 'new_grade' column: the withColumn result was never
# assigned back, so the original DataFrame is unchanged.
df.show()

+-----+-----+
| name|grade|
+-----+-----+
| John|    4|
|Marry|    9|
|Peter|    7|
+-----+-----+



In [24]:
# Rename column 'name' to 'my_name'; the result is a new DataFrame.
df.withColumnRenamed('name', 'my_name')

DataFrame[my_name: string, grade: int]

In [27]:
# Keep the renamed result under its own name so it can be displayed and reused.
df1=df.withColumnRenamed('name', 'my_name')
df1.show()

+-------+-----+
|my_name|grade|
+-------+-----+
|   John|    4|
|  Marry|    9|
|  Peter|    7|
+-------+-----+



In [26]:
# The original df still has the column called 'name'; only df1 carries the new name.
df.show()

+-----+-----+
| name|grade|
+-----+-----+
| John|    4|
|Marry|    9|
|Peter|    7|
+-----+-----+



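A DataFrame is immutable: `withColumn` and `withColumnRenamed` return a new DataFrame and leave the original unchanged, which is why `df` above still has its original columns.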

### DataFrame Operations

* Group By
* Order By
* Special functions

In [36]:
# Read the tab-separated Titanic file. The first line supplies the column
# names (header=True) and the column types are inferred (inferSchema=True).
# Note: this rebinds `df`, which until now held the student data.
df = spark.read.csv(
    'titanic.csv',
    header=True,
    sep='\t',
    inferSchema=True,
)
df.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [37]:
# Inspect the column types produced by inferSchema for the Titanic data.
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [38]:
# groupBy(...).count() is itself a DataFrame: print() shows only its repr
# (column names and types), while .show() prints the per-group row counts.
print(df.groupBy('Sex').count())
df.groupBy('Sex').count().show()

DataFrame[Sex: string, count: bigint]
+------+-----+
|   Sex|count|
+------+-----+
|  male|  100|
|female|   56|
+------+-----+



In [42]:
# agg() accepts a {column: aggregate-name} dict. Each .show() is its own
# statement; joining them with a comma would only build a throwaway
# (None, None) tuple.
df.groupBy('Sex').agg({'Age': 'mean'}).show()
df.groupBy('Sex').agg({'Survived': 'mean'}).show()

# Several aggregates can be requested in a single agg() call by adding
# entries to the dict.
df.groupBy('Sex').agg({'Age': 'mean', 'Survived': 'mean'}).show()

+------+------------------+
|   Sex|          avg(Age)|
+------+------------------+
|  male|30.326962025316455|
|female| 24.46808510638298|
+------+------------------+

+------+------------------+
|   Sex|     avg(Survived)|
+------+------------------+
|  male|              0.14|
|female|0.7142857142857143|
+------+------------------+

+------+------------------+------------------+
|   Sex|     avg(Survived)|          avg(Age)|
+------+------------------+------------------+
|  male|              0.14|30.326962025316455|
|female|0.7142857142857143| 24.46808510638298|
+------+------------------+------------------+



In [37]:
# mean() is called without column names on the grouped data; select() then
# keeps just the group key and the Age average.
df.groupBy('Sex').mean().select('Sex', 'Avg(Age)').show()

+------+------------------+
|   Sex|          Avg(Age)|
+------+------------------+
|female| 24.46808510638298|
|  male|30.326962025316455|
+------+------------------+



In [44]:
# Sort by Fare in ascending order (lowest fares first) and show five rows.
df.orderBy('Fare').show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+------------------+------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|            Ticket|  Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+------------------+------+-----+--------+
|        144|       0|     3| Burke, Mr. Jeremiah|  male|19.0|    0|    0|            365222|  6.75| null|       Q|
|        130|       0|     3|  Ekstrom, Mr. Johan|  male|45.0|    0|    0|            347061| 6.975| null|       S|
|        132|       0|     3|Coelho, Mr. Domin...|  male|20.0|    0|    0|SOTON/O.Q. 3101307|  7.05| null|       S|
|        128|       1|     3|Madsen, Mr. Fridt...|  male|24.0|    0|    0|           C 17369|7.1417| null|       S|
|         20|       1|     3|Masselmani, Mrs. ...|female|null|    0|    0|              2649| 7.225| null|       C|
+-----------+--------+------+--------------------+------+----+-----+----

In [45]:
# Descending order: call .desc() on the column object instead of passing the name.
df.orderBy(df['Fare'].desc()).show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+-----------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|  Ticket|    Fare|      Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+--------+-----------+--------+
|         89|       1|     1|Fortune, Miss. Ma...|female|23.0|    3|    2|   19950|   263.0|C23 C25 C27|       S|
|         28|       0|     1|Fortune, Mr. Char...|  male|19.0|    3|    2|   19950|   263.0|C23 C25 C27|       S|
|        119|       0|     1|Baxter, Mr. Quigg...|  male|24.0|    0|    1|PC 17558|247.5208|    B58 B60|       C|
|         32|       1|     1|Spencer, Mrs. Wil...|female|null|    1|    0|PC 17569|146.5208|        B78|       C|
|         63|       0|     1|Harris, Mr. Henry...|  male|45.0|    1|    0|   36973|  83.475|        C83|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+--------+-----

In [46]:
from pyspark.sql.functions import mean, countDistinct

In [47]:
# Mean of Age over the whole DataFrame; alias() sets the output column name.
df.select(mean('Age').alias('mean AGE')).show()

+------------------+
|          mean AGE|
+------------------+
|28.141507936507935|
+------------------+



In [48]:
# Number of distinct values in the Sex column.
df.select(countDistinct('Sex').alias('distinct SEX')).show()

+------------+
|distinct SEX|
+------------+
|           2|
+------------+



### Filter Data

In [49]:
# Keep only the rows where Sex equals 'male' and show the first eight.
# The column is spelled as in the schema ('Sex') so the lookup does not rely
# on case-insensitive column-name resolution.
df.filter(df['Sex'] == 'male').show(8)

+-----------+--------+------+--------------------+----+----+-----+-----+---------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name| Sex| Age|SibSp|Parch|   Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+----+----+-----+-----+---------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|male|22.0|    1|    0|A/5 21171|   7.25| null|       S|
|          5|       0|     3|Allen, Mr. Willia...|male|35.0|    0|    0|   373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|male|null|    0|    0|   330877| 8.4583| null|       Q|
|          7|       0|     1|McCarthy, Mr. Tim...|male|54.0|    0|    0|    17463|51.8625|  E46|       S|
|          8|       0|     3|Palsson, Master. ...|male| 2.0|    3|    1|   349909| 21.075| null|       S|
|         13|       0|     3|Saundercock, Mr. ...|male|20.0|    0|    0|A/5. 2151|   8.05| null|       S|
|         14|       0|     3|Andersson, Mr. An

In [53]:
# Combine conditions with &. Each comparison needs its own parentheses
# because & binds more tightly than == and > in Python.
# The column is spelled as in the schema ('Sex') so the lookup does not rely
# on case-insensitive column-name resolution.
df.filter((df['Sex'] == 'male') & (df['Age'] > 60)).show(5)

+-----------+--------+------+--------------------+----+----+-----+-----+----------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name| Sex| Age|SibSp|Parch|    Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+----+----+-----+-----+----------+-------+-----+--------+
|         34|       0|     2|Wheadon, Mr. Edwa...|male|66.0|    0|    0|C.A. 24579|   10.5| null|       S|
|         55|       0|     1|Ostby, Mr. Engelh...|male|65.0|    0|    1|    113509|61.9792|  B30|       C|
|         97|       0|     1|Goldschmidt, Mr. ...|male|71.0|    0|    0|  PC 17754|34.6542|   A5|       C|
|        117|       0|     3|Connors, Mr. Patrick|male|70.5|    0|    0|    370369|   7.75| null|       Q|
+-----------+--------+------+--------------------+----+----+-----+-----+----------+-------+-----+--------+



### Missing Data

In [54]:
# Load the missing-values sample. The file is read without a header row
# (header=False), so the columns get the generated names _c0, _c1, _c2.
# Note: this rebinds `df` again.
df = spark.read.csv(
    'missing-data.csv',
    inferSchema=True,
    header=False,
)
df.show()

+----+----+----+
| _c0| _c1| _c2|
+----+----+----+
| 5.0|   3| 2.5|
| 2.6|null| 4.0|
|null|null| 4.3|
| 7.0|   3|null|
+----+----+----+



In [55]:
# With no arguments, dropna() removes every row that contains at least one
# null; only the fully populated row remains.
df.dropna().show()

+---+---+---+
|_c0|_c1|_c2|
+---+---+---+
|5.0|  3|2.5|
+---+---+---+



In [58]:
# thresh=2 keeps only the rows that have at least two non-null values,
# i.e. it drops rows with fewer than two non-null values.
# Here the row (null, null, 4.3) is removed because it has a single non-null value.
df.dropna(thresh=2).show()

+---+----+----+
|_c0| _c1| _c2|
+---+----+----+
|5.0|   3| 2.5|
|2.6|null| 4.0|
|7.0|   3|null|
+---+----+----+



In [60]:
df.dropna(how='all').show() # how='all': drop a row only if every value in it is null (no such row here, so nothing is removed)

+----+----+----+
| _c0| _c1| _c2|
+----+----+----+
| 5.0|   3| 2.5|
| 2.6|null| 4.0|
|null|null| 4.3|
| 7.0|   3|null|
+----+----+----+



In [61]:
# Replace every null with 0 (displayed as 0.0 in the double columns).
df.fillna(0).show()

+---+---+---+
|_c0|_c1|_c2|
+---+---+---+
|5.0|  3|2.5|
|2.6|  0|4.0|
|0.0|  0|4.3|
|7.0|  3|0.0|
+---+---+---+



### Datetime Data

In [62]:
# Load the stock prices. The first line supplies the column names and the
# column types are inferred from the data.
# Note: this rebinds `df` again.
df = spark.read.csv(
    'stock-data.csv',
    inferSchema=True,
    header=True,
)
df.show(5)

+----------+----------+----------+---------+-----------+
|      Date|Open Price|High Price|Low Price|Close Price|
+----------+----------+----------+---------+-----------+
|2018-02-28|     142.0|     151.7|    142.0|     148.55|
|2018-02-27|    149.15|     154.0|    149.1|      150.0|
|2018-02-26|     151.6|     152.0|    151.2|     151.35|
|2018-02-25|     151.3|     154.0|    151.1|      153.2|
|2018-02-24|     152.0|     153.6|    152.0|     152.65|
+----------+----------+----------+---------+-----------+
only showing top 5 rows



In [53]:
# Date was inferred as a timestamp and the four price columns as doubles.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open Price: double (nullable = true)
 |-- High Price: double (nullable = true)
 |-- Low Price: double (nullable = true)
 |-- Close Price: double (nullable = true)



In [63]:
from pyspark.sql.functions import dayofmonth, dayofyear, weekofyear, hour, month, year

In [65]:
# Extract the year from each timestamp.
# NOTE(review): the column is named 'Date' in the schema; 'date' resolves here
# presumably because column-name lookup is case-insensitive by default. The
# output column label mirrors the spelling used in this call.
df.select(year('date')).show(5)

+----------+
|year(date)|
+----------+
|      2018|
|      2018|
|      2018|
|      2018|
|      2018|
+----------+
only showing top 5 rows



In [66]:
# Extract the day of the month from each timestamp.
# NOTE(review): the column is named 'Date' in the schema; 'date' resolves here
# presumably because column-name lookup is case-insensitive by default. The
# output column label mirrors the spelling used in this call.
df.select(dayofmonth('date')).show(5)

+----------------+
|dayofmonth(date)|
+----------------+
|              28|
|              27|
|              26|
|              25|
|              24|
+----------------+
only showing top 5 rows



In [67]:
# Append the day-of-year number as an extra column named 'day of year'.
# The source column is spelled as in the schema ('Date') so the lookup does
# not rely on case-insensitive column-name resolution.
df.withColumn('day of year', dayofyear('Date')).show(5)

+----------+----------+----------+---------+-----------+-----------+
|      Date|Open Price|High Price|Low Price|Close Price|day of year|
+----------+----------+----------+---------+-----------+-----------+
|2018-02-28|     142.0|     151.7|    142.0|     148.55|         59|
|2018-02-27|    149.15|     154.0|    149.1|      150.0|         58|
|2018-02-26|     151.6|     152.0|    151.2|     151.35|         57|
|2018-02-25|     151.3|     154.0|    151.1|      153.2|         56|
|2018-02-24|     152.0|     153.6|    152.0|     152.65|         55|
+----------+----------+----------+---------+-----------+-----------+
only showing top 5 rows



In [68]:
# Keep only the rows dated before the 50th day of the year.
# The source column is spelled as in the schema ('Date') so the lookup does
# not rely on case-insensitive column-name resolution.
df.filter(dayofyear('Date') < 50).show()

+----------+----------+----------+---------+-----------+
|      Date|Open Price|High Price|Low Price|Close Price|
+----------+----------+----------+---------+-----------+
|2018-02-18|     161.9|    164.95|    160.0|      160.9|
|2018-02-17|     161.4|     171.1|    160.9|     163.75|
+----------+----------+----------+---------+-----------+

