<a href="https://colab.research.google.com/github/buaindra/gcp_utility/blob/main/bigdata/colab/PySpark_Programming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PySpark

#### Install pyspark

In [None]:
!pip install pyspark

#### import pyspark and create **spark session**

In [None]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

#### Read csv file

In [52]:
diamonds_df = spark.read.format("csv") \
                .option("header", "true") \
                .option("inferSchema", "true") \
                .load("/content/diamonds.csv")

diamonds_df.show(10)

+-----+---------+-----+-------+-----+-----+-----+----+----+----+
|carat|      cut|color|clarity|depth|table|price|   x|   y|   z|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
| 0.23|    Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
| 0.21|  Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
| 0.23|     Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
| 0.29|  Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
| 0.31|     Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|
| 0.24|Very Good|    J|   VVS2| 62.8| 57.0|  336|3.94|3.96|2.48|
| 0.24|Very Good|    I|   VVS1| 62.3| 57.0|  336|3.95|3.98|2.47|
| 0.26|Very Good|    H|    SI1| 61.9| 55.0|  337|4.07|4.11|2.53|
| 0.22|     Fair|    E|    VS2| 65.1| 61.0|  337|3.87|3.78|2.49|
| 0.23|Very Good|    H|    VS1| 59.4| 61.0|  338| 4.0|4.05|2.39|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
only showing top 10 rows



In [53]:
diamonds_df = spark.read.csv("/content/diamonds.csv", header=True, inferSchema=True)
diamonds_df.show(10)

+-----+---------+-----+-------+-----+-----+-----+----+----+----+
|carat|      cut|color|clarity|depth|table|price|   x|   y|   z|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
| 0.23|    Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
| 0.21|  Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
| 0.23|     Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
| 0.29|  Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
| 0.31|     Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|
| 0.24|Very Good|    J|   VVS2| 62.8| 57.0|  336|3.94|3.96|2.48|
| 0.24|Very Good|    I|   VVS1| 62.3| 57.0|  336|3.95|3.98|2.47|
| 0.26|Very Good|    H|    SI1| 61.9| 55.0|  337|4.07|4.11|2.53|
| 0.22|     Fair|    E|    VS2| 65.1| 61.0|  337|3.87|3.78|2.49|
| 0.23|Very Good|    H|    VS1| 59.4| 61.0|  338| 4.0|4.05|2.39|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
only showing top 10 rows



#### Analysis the spark dataframe columns, datatypes and statistics

In [55]:
diamonds_df.columns

['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z']

In [56]:
diamonds_df.dtypes

[('carat', 'double'),
 ('cut', 'string'),
 ('color', 'string'),
 ('clarity', 'string'),
 ('depth', 'double'),
 ('table', 'double'),
 ('price', 'int'),
 ('x', 'double'),
 ('y', 'double'),
 ('z', 'double')]

In [57]:
diamonds_df.describe().show()

+-------+------------------+---------+-----+-------+------------------+------------------+-----------------+------------------+------------------+------------------+
|summary|             carat|      cut|color|clarity|             depth|             table|            price|                 x|                 y|                 z|
+-------+------------------+---------+-----+-------+------------------+------------------+-----------------+------------------+------------------+------------------+
|  count|             53940|    53940|53940|  53940|             53940|             53940|            53940|             53940|             53940|             53940|
|   mean|0.7979397478679852|     null| null|   null| 61.74940489432624| 57.45718390804603|3932.799721913237| 5.731157211716609| 5.734525954764462|3.5387337782723316|
| stddev|0.4740112444054196|     null| null|   null|1.4326213188336525|2.2344905628213247|3989.439738146397|1.1217607467924915|1.1421346741235616|0.7056988469499883|
|   

#### Handling null in spark dataframe

#### link a temp view with this spark dataframe for sql query



In [58]:
diamonds_df.createOrReplaceTempView("tbl_diamonds")
diamonds_df = spark.sql("select * from tbl_diamonds where price <= 350")
diamonds_df.show()
spark.catalog.dropTempView("tbl_diamonds")

+-----+---------+-----+-------+-----+-----+-----+----+----+----+
|carat|      cut|color|clarity|depth|table|price|   x|   y|   z|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+
| 0.23|    Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
| 0.21|  Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
| 0.23|     Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
| 0.29|  Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
| 0.31|     Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|
| 0.24|Very Good|    J|   VVS2| 62.8| 57.0|  336|3.94|3.96|2.48|
| 0.24|Very Good|    I|   VVS1| 62.3| 57.0|  336|3.95|3.98|2.47|
| 0.26|Very Good|    H|    SI1| 61.9| 55.0|  337|4.07|4.11|2.53|
| 0.22|     Fair|    E|    VS2| 65.1| 61.0|  337|3.87|3.78|2.49|
| 0.23|Very Good|    H|    VS1| 59.4| 61.0|  338| 4.0|4.05|2.39|
|  0.3|     Good|    J|    SI1| 64.0| 55.0|  339|4.25|4.28|2.73|
| 0.23|    Ideal|    J|    VS1| 62.8| 56.0|  340|3.93| 3.9|2.46|
| 0.22|  Premium|    F|  

#### Adding column into the existing spark dataframe

In [59]:
diamonds_df = diamonds_df.withColumn("price_in_rupees", diamonds_df["price"]*80)
diamonds_df.show()

+-----+---------+-----+-------+-----+-----+-----+----+----+----+---------------+
|carat|      cut|color|clarity|depth|table|price|   x|   y|   z|price_in_rupees|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+---------------+
| 0.23|    Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|          26080|
| 0.21|  Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|          26080|
| 0.23|     Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|          26160|
| 0.29|  Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|          26720|
| 0.31|     Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|          26800|
| 0.24|Very Good|    J|   VVS2| 62.8| 57.0|  336|3.94|3.96|2.48|          26880|
| 0.24|Very Good|    I|   VVS1| 62.3| 57.0|  336|3.95|3.98|2.47|          26880|
| 0.26|Very Good|    H|    SI1| 61.9| 55.0|  337|4.07|4.11|2.53|          26960|
| 0.22|     Fair|    E|    VS2| 65.1| 61.0|  337|3.87|3.78|2.49|          26960|
| 0.23|Very Good|    H|    V

#### Renaming the existing column in spark dataframe

In [60]:
diamonds_df = diamonds_df.withColumnRenamed("x", "length") \
                .withColumnRenamed("y", "width") \
                .withColumnRenamed("z", "depth")

diamonds_df.show()

+-----+---------+-----+-------+-----+-----+-----+------+-----+-----+---------------+
|carat|      cut|color|clarity|depth|table|price|length|width|depth|price_in_rupees|
+-----+---------+-----+-------+-----+-----+-----+------+-----+-----+---------------+
| 0.23|    Ideal|    E|    SI2| 61.5| 55.0|  326|  3.95| 3.98| 2.43|          26080|
| 0.21|  Premium|    E|    SI1| 59.8| 61.0|  326|  3.89| 3.84| 2.31|          26080|
| 0.23|     Good|    E|    VS1| 56.9| 65.0|  327|  4.05| 4.07| 2.31|          26160|
| 0.29|  Premium|    I|    VS2| 62.4| 58.0|  334|   4.2| 4.23| 2.63|          26720|
| 0.31|     Good|    J|    SI2| 63.3| 58.0|  335|  4.34| 4.35| 2.75|          26800|
| 0.24|Very Good|    J|   VVS2| 62.8| 57.0|  336|  3.94| 3.96| 2.48|          26880|
| 0.24|Very Good|    I|   VVS1| 62.3| 57.0|  336|  3.95| 3.98| 2.47|          26880|
| 0.26|Very Good|    H|    SI1| 61.9| 55.0|  337|  4.07| 4.11| 2.53|          26960|
| 0.22|     Fair|    E|    VS2| 65.1| 61.0|  337|  3.87| 3.78| 2.

#### get the data from the spark dataframe using groupBy

In [61]:
from pyspark.sql.functions import avg

results_df = diamonds_df.select("color", "price") \
              .groupBy("color") \
              .agg(avg("price")) \
              .sort("color")

results_df.show()

+-----+-----------------+
|color|       avg(price)|
+-----+-----------------+
|    E|334.3333333333333|
|    F|            342.0|
|    H|            337.5|
|    I|339.3333333333333|
|    J|            338.8|
+-----+-----------------+



In [62]:
display(results_df)  # not working like databricks notebook

DataFrame[color: string, avg(price): double]