In [1]:
sc

In [3]:
spark

In [5]:
# Load the HR employee CSV from the local filesystem (file:// scheme) into a DataFrame.
# header=True takes the column names from the first row; inferSchema=True asks Spark to
# infer column types from the data instead of reading every column as a string.
hr_employeee = spark.read.csv('file:///home/hadoop/Downloads/HR_Employee.csv' , inferSchema=True, header=True)

In [6]:
hr_employeee.show(3)

+----------+--------------------+--------------------+---------+------+---+-------------+-------------+--------------+-----------------+--------------+--------+---------------+----------+------+----------+--------+------+-----------------------+---------------+---------------------+---------------+------------------+
|EmployeeID|          Department|             JobRole|Attrition|Gender|Age|MaritalStatus|    Education|EducationField|   BusinessTravel|JobInvolvement|JobLevel|JobSatisfaction|Hourlyrate|Income|Salaryhike|OverTime|Workex|YearsSinceLastPromotion|EmpSatisfaction|TrainingTimesLastYear|WorkLifeBalance|Performance_Rating|
+----------+--------------------+--------------------+---------+------+---+-------------+-------------+--------------+-----------------+--------------+--------+---------------+----------+------+----------+--------+------+-----------------------+---------------+---------------------+---------------+------------------+
|         1|               Sales|     Sales

In [7]:
hr_employeee.printSchema()

root
 |-- EmployeeID: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- JobRole: string (nullable = true)
 |-- Attrition: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- EducationField: string (nullable = true)
 |-- BusinessTravel: string (nullable = true)
 |-- JobInvolvement: string (nullable = true)
 |-- JobLevel: integer (nullable = true)
 |-- JobSatisfaction: string (nullable = true)
 |-- Hourlyrate: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Salaryhike: integer (nullable = true)
 |-- OverTime: string (nullable = true)
 |-- Workex: integer (nullable = true)
 |-- YearsSinceLastPromotion: integer (nullable = true)
 |-- EmpSatisfaction: string (nullable = true)
 |-- TrainingTimesLastYear: integer (nullable = true)
 |-- WorkLifeBalance: string (nullable = true)
 |-- Performance_Rating: string (nul

### 1. Big Data File Types
        * Parquet file format - Records are stored in a columnar layout. Converting a structured dataset such as a .csv file to Parquet compresses it, and the columnar layout makes Parquet well suited to analytical queries that read only a subset of the columns.
        * There are other file formats such as Avro, ORC, etc.

In [9]:
hr_employeee.rdd.getNumPartitions()

1

In [None]:
#hr_employeee.repartition()hd

In [10]:
# Save the DataFrame as Parquet on the local filesystem.
# mode('overwrite') replaces output left by a previous run; with the default save mode
# the write fails when the target path already exists, so the cell could not be re-run.
hr_employeee.write.mode('overwrite').parquet('file:///home/hadoop/Downloads/HR_Parquet')

In [102]:
# Save the DataFrame in ORC format. The path carries no scheme, so it is resolved
# against the session's default filesystem.
# mode('overwrite') lets the cell be re-run; the default save mode fails when the
# target path already exists.
# NOTE(review): the saved error output under this cell mentions a different path
# (file:/home/hadoop/Downloads/HR_Orc), so it appears to come from an earlier version
# of this cell.
hr_employeee.write.mode('overwrite').orc('/HR_Orc')

IllegalArgumentException: 'Pathname /file:/home/hadoop/Downloads/HR_Orc from hdfs://localhost:9000/file:/home/hadoop/Downloads/HR_Orc is not a valid DFS filename.'

In [13]:
spark.read.orc('/HR_Orc').show(10)

+----------+--------------------+--------------------+---------+------+---+-------------+-------------+--------------+-----------------+--------------+--------+---------------+----------+------+----------+--------+------+-----------------------+---------------+---------------------+---------------+------------------+
|EmployeeID|          Department|             JobRole|Attrition|Gender|Age|MaritalStatus|    Education|EducationField|   BusinessTravel|JobInvolvement|JobLevel|JobSatisfaction|Hourlyrate|Income|Salaryhike|OverTime|Workex|YearsSinceLastPromotion|EmpSatisfaction|TrainingTimesLastYear|WorkLifeBalance|Performance_Rating|
+----------+--------------------+--------------------+---------+------+---+-------------+-------------+--------------+-----------------+--------------+--------+---------------+----------+------+----------+--------+------+-----------------------+---------------+---------------------+---------------+------------------+
|         1|               Sales|     Sales

### Optimization Techniques
        * Optimizing Spark jobs can significantly improve the performance of Spark queries and applications.

### 2. Partitioning 
    * Partitioning divides data into smaller chunks, which can be processed in parallel.

In [15]:
hr_employeee.rdd.getNumPartitions()

1

In [16]:
# repartition(3) returns a new DataFrame spread over 3 partitions;
# hr_employeee itself keeps its original partitioning.
partitioned_df = hr_employeee.repartition(3)

In [17]:
partitioned_df.rdd.getNumPartitions()

3

In [19]:
# Write the repartitioned DataFrame as Parquet.
# mode('overwrite') makes the cell re-runnable; the default save mode fails when
# '/HR_Partition' already exists from a previous run.
partitioned_df.write.mode('overwrite').parquet('/HR_Partition')

### 3. Caching & Persistence
    * Managing the different storage levels.

In [21]:
# cache() is shorthand for persist() with the default storage level, which for
# DataFrames is a memory-and-disk level rather than memory-only.
# It is lazy: nothing is stored until an action runs on the DataFrame.
hr_employeee.cache()

DataFrame[EmployeeID: int, Department: string, JobRole: string, Attrition: string, Gender: string, Age: int, MaritalStatus: string, Education: string, EducationField: string, BusinessTravel: string, JobInvolvement: string, JobLevel: int, JobSatisfaction: string, Hourlyrate: int, Income: int, Salaryhike: int, OverTime: string, Workex: int, YearsSinceLastPromotion: int, EmpSatisfaction: string, TrainingTimesLastYear: int, WorkLifeBalance: string, Performance_Rating: string]

In [22]:
# Persistence of a DataFrame with an explicit storage level, e.g. MEMORY_ONLY,
# MEMORY_AND_DISK or DISK_ONLY.
from pyspark import StorageLevel

# Asking Spark to persist data that is already cached is ignored (it only logs a
# warning), so release any existing cache entry first; otherwise the level requested
# here would never take effect. unpersist() is harmless if nothing is cached.
hr_employeee.unpersist()
hr_employeee = hr_employeee.persist(StorageLevel.MEMORY_AND_DISK)

DataFrame[EmployeeID: int, Department: string, JobRole: string, Attrition: string, Gender: string, Age: int, MaritalStatus: string, Education: string, EducationField: string, BusinessTravel: string, JobInvolvement: string, JobLevel: int, JobSatisfaction: string, Hourlyrate: int, Income: int, Salaryhike: int, OverTime: string, Workex: int, YearsSinceLastPromotion: int, EmpSatisfaction: string, TrainingTimesLastYear: int, WorkLifeBalance: string, Performance_Rating: string]

In [23]:
# Switch the DataFrame to a memory-only storage level.
# unpersist() first: persisting an already-persisted DataFrame does not change its level.
# MEMORY_ONLY is used instead of MEMORY_ONLY_SER: the *_SER constants were deprecated in
# PySpark 2.0 (Python records are always serialized, so MEMORY_ONLY_SER was just an alias
# of MEMORY_ONLY) and are not available in newer PySpark releases.
hr_employeee.unpersist()
hr_employeee = hr_employeee.persist(StorageLevel.MEMORY_ONLY)

### 4. Serialization
    * Efficient serialization reduces the time needed to read/write data and to transfer it over the network. Kryo serialization is a popular choice that performs better than the default Java serialization.
    * Java serialization is the default method. It is easy to use, but its drawback is that it slows down reads and writes.

In [24]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [25]:
# getOrCreate() returns the session that is already running in this notebook if there
# is one, so this call on its own does not start a new application.
spark = SparkSession.builder.appName("Java Serialization").getOrCreate()

In [26]:
# Stop the running session so that the next builder call creates a fresh one
# with its own settings.
# NOTE(review): DataFrames created from the stopped session (e.g. hr_employeee)
# presumably cannot be used afterwards - confirm before reusing them below.
spark.stop()

In [27]:
# Build a new session that explicitly selects Java serialization.
# The property name is "spark.serializer"; "spark-serializer" (with a hyphen) is not a
# Spark property, so with that key the serializer setting was silently ignored.
spark = (
    SparkSession.builder
    .appName("PySpark Serialization")
    .config("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    .getOrCreate()
)

In [28]:
spark

In [29]:
# spark = SparkSession.builder\
# .config("spark.serializer","org.apache.spark.serializer.KryoSerializer")\
# .config("spark.kryo.registrationRequired","true")\
# .config("spark.kryo.classesToRegister","org.apache.spark.example.Person")\
# .appName("Kryo Serialization").getOrCreate()

### 5. Broadcast Join
    * Broadcasting a small dataset to every executor improves join performance, because the large dataset does not have to be shuffled.

In [31]:
small_df = spark.read.csv('file:///home/hadoop/Downloads/airports.csv',inferSchema=True,header=True)
large_df = spark.read.csv('file:///home/hadoop/Downloads/raw_flight_data.csv',inferSchema=True,header=True)

In [32]:
from pyspark.sql.functions import broadcast
# broadcast() marks the DataFrame as small enough to be used in a broadcast join;
# it is only a hint attached to the DataFrame, no data is moved at this point.
broadcast_df = broadcast(small_df)

In [34]:
broadcast_df = broadcast_df.cache()
small_df = small_df.cache()

In [39]:
airport_df = large_df.join(broadcast_df, large_df.OriginAirportID == broadcast_df.airport_id)

In [40]:
airport_df.show(2)

+----------+---------+-------+---------------+-------------+--------+--------+----------+--------------+-----+--------------------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|airport_id|          city|state|                name|
+----------+---------+-------+---------------+-------------+--------+--------+----------+--------------+-----+--------------------+
|        19|        5|     DL|          11433|        13303|      -3|       1|     11433|       Detroit|   MI|Detroit Metro Way...|
|        19|        5|     DL|          14869|        12478|       0|      -8|     14869|Salt Lake City|   UT|Salt Lake City In...|
+----------+---------+-------+---------------+-------------+--------+--------+----------+--------------+-----+--------------------+
only showing top 2 rows



### 6. Level of Parallelism

In [41]:
# Adjust the level of parallelism based on your cluster size.
# spark.default.parallelism is a SparkContext-level setting that has to be supplied
# before the context is created (SparkSession.builder.config(...) or spark-submit --conf);
# setting it through spark.conf.set on a running session does not change the parallelism
# of RDD operations. spark.sql.shuffle.partitions is the runtime-settable counterpart: it
# controls how many partitions DataFrame shuffles (joins, aggregations) produce.
spark.conf.set("spark.sql.shuffle.partitions", 100)

### 7. Avoid GroupByKey
    * Use reduceByKey() or aggregateByKey() instead of groupByKey() to reduce the amount of data shuffled: they combine the values of each key within a partition before the shuffle.

In [45]:
# Sample (dish, quantity) pairs distributed as an RDD; some dishes appear twice
# so the per-key aggregations below have something to combine.
rdd = spark.sparkContext.parallelize(
    [
        ('dosa', 2),
        ('idly', 3),
        ('vada', 5),
        ('rice', 1),
        ('rice', 2),
        ('coffee', 5),
        ('idly', 4),
    ]
)

In [54]:
rdd.groupByKey().mapValues(sum).collect()

[('idly', 7), ('vada', 5), ('rice', 3), ('coffee', 5), ('dosa', 2)]

In [58]:
rdd.reduceByKey(lambda x,y : x+y ).collect()

[('idly', 7), ('vada', 5), ('rice', 3), ('coffee', 5), ('dosa', 2)]

In [56]:
# aggregateByKey(zeroValue, seqFunc, combFunc): seqFunc folds each value into a
# per-partition running total that starts at zeroValue; combFunc merges the totals
# coming from different partitions. sorted() makes the displayed order deterministic.
sorted(rdd.aggregateByKey(0, lambda total, qty: total + qty, lambda left, right: left + right).collect())

[('coffee', 5), ('dosa', 2), ('idly', 7), ('rice', 3), ('vada', 5)]

In [73]:
# The same dish/quantity sample as a DataFrame with named columns.
df = spark.createDataFrame(
    [
        ('dosa', 2),
        ('idly', 3),
        ('vada', 5),
        ('rice', 1),
        ('rice', 2),
        ('coffee', 5),
        ('idly', 4),
    ],
    schema=['dish', 'quantity'],
)

In [75]:
# Import the functions module under an alias rather than `from ... import sum`:
# importing the bare name would shadow Python's built-in sum() for every cell run
# afterwards (e.g. rdd.groupByKey().mapValues(sum) would then call the Spark column
# function instead of the built-in).
from pyspark.sql import functions as F
df.groupBy("dish").agg(F.sum('quantity')).show()

+------+-------------+
|  dish|sum(quantity)|
+------+-------------+
|  vada|            5|
|  dosa|            2|
|  idly|            7|
|  rice|            3|
|coffee|            5|
+------+-------------+



In [78]:
df.rdd.reduceByKey(lambda x,y : x+y).collect()

[('idly', 7), ('vada', 5), ('rice', 3), ('coffee', 5), ('dosa', 2)]

### 8. Reduce Shuffle
    * Reduce the number of shuffles by optimizing the transformations.
    * Use reduceByKey over groupByKey().
    * Use map() and reduce() over groupBy

### 9. Repartition() and Coalesce()
    * repartition() can increase or decrease the number of partitions; it always performs a full shuffle.
    * coalesce() only decreases the number of partitions and avoids a full shuffle.

### 10. Accumulators
    * Use accumulators to aggregate information such as counts and sums across all executors that run tasks in parallel on multiple worker nodes.
    

In [91]:
# Accumulator with initial value 0, created on the driver. Tasks add to it with
# acc.add(...); the total is read back on the driver through acc.value.
acc = spark.sparkContext.accumulator(0)

In [92]:
type(acc)

pyspark.accumulators.Accumulator

In [93]:
rdd = spark.sparkContext.parallelize([1,2,3,4,5,6,7,8,9])

In [94]:
# Plain Python function applied to every element via rdd.foreach (not a Spark SQL UDF).
def addition(x):
    """Add ``x`` to the module-level accumulator ``acc``; returns None."""
    acc.add(x)

In [98]:
# Runs addition() on every element, so each run adds the sum of the RDD to acc.
# The accumulator keeps its value between runs: the elements 1..9 add up to 45 per run,
# so the saved value of 90 shown below suggests this cell was executed twice.
rdd.foreach(addition)

In [99]:
acc.value

90

In [100]:
def counter(x):
    """Count one processed element on the accumulator ``acc`` and pass ``x`` through.

    ``acc`` is only used (its ``add`` method is called), never rebound, so no
    ``global`` declaration is needed.

    NOTE: this function is used inside a transformation (``rdd.map``). Spark may
    apply accumulator updates made in transformations more than once when a task
    or stage is re-executed, so the resulting count is not guaranteed to be exact.

    Returns:
        ``x`` unchanged.
    """
    acc.add(1)
    return x

In [101]:
rdd.map(counter).count()

9

### 11. Bucketing
    * Use Bucketing to create buckets of large datasets for efficient query and joins.