**Setting up Hadoop and Pyspark **

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar -xvzf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install pyspark


**First let's understand about data skewness**

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("Data_Skewness").getOrCreate()
df = spark.read.csv("/content/drive/MyDrive/Colab Notebooks/skewed_data.csv", header=True)

df.show()

+------+--------------+----------+-------+
|emp_id|      emp_name|emp_salary|dept_id|
+------+--------------+----------+-------+
|     1|Burton Camacho|     59958|      1|
|     1|Burton Camacho|     59958|      1|
|     1|Burton Camacho|     59958|      1|
|     1|   Elvis Giles|     44313|      2|
|     1|   Elvis Giles|     44313|      2|
|     1|   Elvis Giles|     44313|      2|
|     1|   Elton Wolfe|     59464|      3|
|     1| Neil Townsend|     24340|      3|
|     1|   Elton Wolfe|     59464|      3|
|     1| Neil Townsend|     24340|      3|
|     1|   Elton Wolfe|     59464|      3|
|     1| Neil Townsend|     24340|      3|
|     1|    Troy Ochoa|     62251|      4|
|     1|    Troy Ochoa|     62251|      4|
|     1|    Troy Ochoa|     62251|      4|
|     1|  Cruz Griffin|     42975|      5|
|     1|  Trevor Oneal|     88365|      5|
|     1|  Cruz Griffin|     42975|      5|
|     1|  Trevor Oneal|     88365|      5|
|     1|  Cruz Griffin|     42975|      5|
+------+---

In [4]:
from pyspark.sql.functions import *

agg_df = df.groupBy('emp_id')\
        .agg(count('emp_id').alias('total_records'))
agg_df.show()

+------+-------------+
|emp_id|total_records|
+------+-------------+
|     7|            9|
|     8|           12|
|     5|            6|
|     6|           17|
|     9|            9|
|     1|           85|
|    10|            8|
+------+-------------+



In [5]:
dept_df = spark.createDataFrame([(1,'A'), (2,'B'), (3,'C'), (4,'D'), (5,'E'), (6,'F'), (7,'G'), (8,'H'), (9,'I'), (10,'J')], ['dept_id', 'dept_name'])
dept_df.show()

+-------+---------+
|dept_id|dept_name|
+-------+---------+
|      1|        A|
|      2|        B|
|      3|        C|
|      4|        D|
|      5|        E|
|      6|        F|
|      7|        G|
|      8|        H|
|      9|        I|
|     10|        J|
+-------+---------+



In [6]:
# created a skewed distribution of records here 
df = df.repartition(10, 'emp_id')

print('num partitions : ', df.rdd.getNumPartitions())

df = df.withColumn('partition_id', spark_partition_id())

agg_df = df.groupBy('partition_id')\
        .agg(count('emp_id').alias('total_records'))\
        
agg_df.orderBy(desc(col('total_records'))).show()

# join will take more time in real scenario because df is highly skewed on emp_id for value 1
# see partition_id 4 has 85 records...
df_joined = df.join(dept_df, df.dept_id==dept_df.dept_id,'left')\
              .drop(dept_df.dept_id)

df_joined.show()

num partitions :  10
+------------+-------------+
|partition_id|total_records|
+------------+-------------+
|           4|           85|
|           5|           25|
|           9|           15|
|           6|           12|
|           3|            9|
+------------+-------------+

+------+-----------------+----------+-------+------------+---------+
|emp_id|         emp_name|emp_salary|dept_id|partition_id|dept_name|
+------+-----------------+----------+-------+------------+---------+
|     1|      Philip Cole|     33245|      7|           4|        G|
|     1|  Patrick Sweeney|     59835|      7|           4|        G|
|     1|      Philip Cole|     33245|      7|           4|        G|
|     1|  Patrick Sweeney|     59835|      7|           4|        G|
|     1|      Philip Cole|     33245|      7|           4|        G|
|     1|  Patrick Sweeney|     59835|      7|           4|        G|
|    10|    Ian Donaldson|     47112|      7|           5|        G|
|     8|     Rahim Franks| 

**Actual logic begins here for handling Data Skewness Issue...**

In [7]:
# creating data skewness scenario 
df = df.repartition(10, 'emp_id')
df = df.withColumn('partition_id', spark_partition_id())

agg_df = df.groupBy('partition_id')\
        .agg(count('emp_id').alias('total_records'))\
        
agg_df.orderBy(desc(col('total_records'))).show()

+------------+-------------+
|partition_id|total_records|
+------------+-------------+
|           4|           85|
|           5|           25|
|           9|           15|
|           6|           12|
|           3|            9|
+------------+-------------+



In [8]:
# generate a column having random values between a range 
df = df.withColumn('random_int', (rand()*10).cast('int'))

print("You can see random_int values are distributed properly for emp_id=1")
df.filter(col('emp_id')==1).show()

# create a salted key for source table 

df = df.withColumn('salted_key_src', concat(col('dept_id'), col('random_int')))\
      .drop('random_int')

df.show(10)

You can see random_int values are distributed properly for emp_id=1
+------+--------------+----------+-------+------------+----------+
|emp_id|      emp_name|emp_salary|dept_id|partition_id|random_int|
+------+--------------+----------+-------+------------+----------+
|     1|Burton Camacho|     59958|      1|           4|         1|
|     1|Burton Camacho|     59958|      1|           4|         1|
|     1|Burton Camacho|     59958|      1|           4|         4|
|     1|   Elvis Giles|     44313|      2|           4|         1|
|     1|   Elvis Giles|     44313|      2|           4|         5|
|     1|   Elvis Giles|     44313|      2|           4|         2|
|     1|   Elton Wolfe|     59464|      3|           4|         9|
|     1| Neil Townsend|     24340|      3|           4|         0|
|     1|   Elton Wolfe|     59464|      3|           4|         4|
|     1| Neil Townsend|     24340|      3|           4|         4|
|     1|   Elton Wolfe|     59464|      3|           4|      

In [11]:
# create salted key for dept_df (this second df typically has less records)

dept_df.show()

l = [lit(i) for i in range(10)]
print("list: ", l)

print("Add array of range values")
dept_df = dept_df.withColumn('range_val', array(l))
dept_df.show()

print("After explode operation")
dept_df = dept_df.select('dept_id', 'dept_name', explode('range_val').alias('range_val'))
dept_df.show()

# create salted key for dept_df
dept_df = dept_df.withColumn('salted_key_dept', concat(col('dept_id'), col('range_val')))

print("After creating salted key")
dept_df.show()

+-------+---------+---------+---------------+
|dept_id|dept_name|range_val|salted_key_dept|
+-------+---------+---------+---------------+
|      1|        A|        1|             11|
|      1|        A|        2|             12|
|      1|        A|        3|             13|
|      1|        A|        4|             14|
|      1|        A|        5|             15|
|      1|        A|        6|             16|
|      1|        A|        7|             17|
|      1|        A|        8|             18|
|      1|        A|        9|             19|
|      2|        B|        1|             21|
|      2|        B|        2|             22|
|      2|        B|        3|             23|
|      2|        B|        4|             24|
|      2|        B|        5|             25|
|      2|        B|        6|             26|
|      2|        B|        7|             27|
|      2|        B|        8|             28|
|      2|        B|        9|             29|
|      3|        C|        1|     

In [12]:
df_joined = df.join(dept_df, df.salted_key_src==dept_df.salted_key_dept,'left')\
              .drop(dept_df.dept_id)

print("total records: ", df_joined.count())
df_joined.show()

total records:  1314
+------+--------------+----------+-------+------------+--------------+---------+---------+---------------+
|emp_id|      emp_name|emp_salary|dept_id|partition_id|salted_key_src|dept_name|range_val|salted_key_dept|
+------+--------------+----------+-------+------------+--------------+---------+---------+---------------+
|     1|  Trevor Oneal|     88365|      5|           4|            51|        E|        1|             51|
|     1|  Trevor Oneal|     88365|      5|           4|            51|        E|        1|             51|
|     1|  Trevor Oneal|     88365|      5|           4|            51|        E|        1|             51|
|     1|  Trevor Oneal|     88365|      5|           4|            51|        E|        1|             51|
|     1|  Trevor Oneal|     88365|      5|           4|            51|        E|        1|             51|
|     1|  Trevor Oneal|     88365|      5|           4|            51|        E|        1|             51|
|     1|  Trevor