In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [40]:
# Build the Spark session used by the whole notebook.
# maxPartitionBytes starts at 128 MB (134217728 bytes) so that the baseline
# partition counts recorded further down are reproduced on a fresh
# "Restart & Run All"; the tuning cell near the end raises it to 140 MB.
spark = (
    SparkSession.builder
    .appName("Week9_Assignment")
    .config("spark.dynamicAllocation.enabled", "false")
    .config("spark.sql.files.maxPartitionBytes", "134217728b")
    .config("spark.sql.warehouse.dir", "/user/itv012857/warehouse/")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [3]:
# Echo the session object so the notebook renders it and confirms it exists.
spark

In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, ArrayType, StringType

In [5]:
# Explicit schema for the SMS users JSON records.
# Fields use the StructField default (nullable): the schema printed after loading
# reports every column as nullable anyway, and the data contains rows with a
# null address state, so declaring nullable=False described the data incorrectly.
users_schema = StructType(
    [
        StructField("user_id", IntegerType()),
        StructField("user_first_name", StringType()),
        StructField("user_last_name", StringType()),
        StructField("user_email", StringType()),
        StructField("user_gender", StringType()),
        StructField("user_phone_numbers", ArrayType(StringType())),
        # Nested address struct; flattened into top-level columns later on.
        StructField(
            "user_address",
            StructType(
                [
                    StructField("street", StringType()),
                    StructField("city", StringType()),
                    StructField("state", StringType()),
                    StructField("postal_code", StringType()),
                ]
            ),
        ),
    ]
)

In [6]:
# Load the users JSON with the explicit schema rather than inferring one.
sms_users = spark.read.schema(users_schema).json("/public/sms/users/")

In [7]:
# Preview the loaded users (long values are truncated in the printed table).
sms_users.show()

+-------+---------------+--------------+--------------------+-----------+--------------------+--------------------+
|user_id|user_first_name|user_last_name|          user_email|user_gender|  user_phone_numbers|        user_address|
+-------+---------------+--------------+--------------------+-----------+--------------------+--------------------+
| 200001|         Eirena|     Cutsforth|ecutsforth0@wisc.edu|     Female|[4197404036, 9173...|{8 Warrior Drive,...|
| 200002|          Marja|      Shopcott|mshopcott1@hexun.com|     Female|[9542037028, 2128...|{66 Prairieview T...|
| 200003|           Dawn|       Tointon|  dtointon2@ucsd.edu|     Female|[9523035647, 2134...|{18 Ronald Regan ...|
| 200004|          Goldi|        Leaman|     gleaman3@360.cn|     Female|[2027069459, 7042...|{7696 Calypso Jun...|
| 200005|       Brewster|      Hallagan|bhallagan4@livejo...|       Male|[8134746319, 2152...|{942 Emmet Park, ...|
| 200006|       Florence|       Glashby|fglashby5@deviant...|     Female

In [8]:
# Partition count of the users DataFrame's underlying RDD.
sms_users.rdd.getNumPartitions()

3

In [9]:
# Print the schema the loaded DataFrame actually carries (including nullability).
sms_users.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- user_first_name: string (nullable = true)
 |-- user_last_name: string (nullable = true)
 |-- user_email: string (nullable = true)
 |-- user_gender: string (nullable = true)
 |-- user_phone_numbers: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- user_address: struct (nullable = true)
 |    |-- street: string (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- postal_code: string (nullable = true)



In [10]:
# Total number of user records.
sms_users.count()

1000000

In [11]:
# Flatten the nested address: promote the struct's fields (street, city, state,
# postal_code) to top-level columns, then drop the original struct column.
sms_users2 = (
    sms_users
    .select("*", "user_address.*")
    .drop("user_address")
)

In [12]:
# Preview the flattened users DataFrame.
sms_users2.show()

+-------+---------------+--------------+--------------------+-----------+--------------------+--------------------+-----------------+----------+-----------+
|user_id|user_first_name|user_last_name|          user_email|user_gender|  user_phone_numbers|              street|             city|     state|postal_code|
+-------+---------------+--------------+--------------------+-----------+--------------------+--------------------+-----------------+----------+-----------+
| 200001|         Eirena|     Cutsforth|ecutsforth0@wisc.edu|     Female|[4197404036, 9173...|     8 Warrior Drive|           Dallas|     Texas|      75358|
| 200002|          Marja|      Shopcott|mshopcott1@hexun.com|     Female|[9542037028, 2128...|66 Prairieview Te...|           Joliet|  Illinois|      60435|
| 200003|           Dawn|       Tointon|  dtointon2@ucsd.edu|     Female|[9523035647, 2134...|18 Ronald Regan Hill|  Shawnee Mission|    Kansas|      66225|
| 200004|          Goldi|        Leaman|     gleaman3@360.

In [13]:
# Number of users whose address state is New York.
sms_users2.where(col("state") == "New York").count()

49576

In [14]:
from pyspark.sql.functions import *

In [15]:
# State with the highest number of distinct postal codes.
(
    sms_users2
    .groupBy("state")
    .agg(countDistinct("postal_code").alias("postal_code_count"))
    .orderBy(desc("postal_code_count"))
    .show(1)
)

+----------+-----------------+
|     state|postal_code_count|
+----------+-----------------+
|California|              206|
+----------+-----------------+
only showing top 1 row



In [16]:
# City with the most distinct users; rows without a city are excluded first.
(
    sms_users2
    .where(col("city").isNotNull())
    .groupBy("city")
    .agg(countDistinct("user_id").alias("user_count"))
    .orderBy(desc("user_count"))
    .show(1)
)

+----------+----------+
|      city|user_count|
+----------+----------+
|Washington|     28504|
+----------+----------+
only showing top 1 row



In [17]:
# Number of users whose email address ends with bizjournals.com.
sms_users2.where(col("user_email").like("%bizjournals.com")).count()

2015

In [18]:
# Number of users that have exactly four phone numbers.
sms_users2.where(size(col("user_phone_numbers")) == 4).count()

179041

In [19]:
# Add a column holding the number of entries in each user's phone-number array.
sms_users2_w_phoneCount = sms_users2.select(
    "*",
    size(col("user_phone_numbers")).alias("phone_number_count"),
)

In [20]:
# Users with no phone numbers: the array is either missing (NULL) or empty.
# The NULL case is tested explicitly instead of relying on size() yielding -1
# for NULL input, which only holds under Spark's legacy sizeOfNull behaviour;
# otherwise size(NULL) is NULL and those users would silently be left out.
sms_users2_w_phoneCount.filter(
    "user_phone_numbers IS NULL OR phone_number_count = 0"
).count()

108981

In [21]:
# Partition count of the users DataFrame, checked again just before it is written out.
sms_users.rdd.getNumPartitions()

3

In [22]:
# Persist the (still nested) users DataFrame as Parquet, replacing any earlier output.
sms_users.write.mode("overwrite").parquet("/user/itv012857/data/sms_users/")

In [23]:
# Current per-file open cost setting, in bytes (the "file open overhead" used in the notes below).
spark.conf.get("spark.sql.files.openCostInBytes")

'4194304'

In [24]:
# Current maximum number of bytes packed into one partition when reading files.
spark.conf.get("spark.sql.files.maxPartitionBytes")

'134217728b'

In [25]:
# Preview the flattened users again before the pivot (same output as the earlier preview).
sms_users2.show()

+-------+---------------+--------------+--------------------+-----------+--------------------+--------------------+-----------------+----------+-----------+
|user_id|user_first_name|user_last_name|          user_email|user_gender|  user_phone_numbers|              street|             city|     state|postal_code|
+-------+---------------+--------------+--------------------+-----------+--------------------+--------------------+-----------------+----------+-----------+
| 200001|         Eirena|     Cutsforth|ecutsforth0@wisc.edu|     Female|[4197404036, 9173...|     8 Warrior Drive|           Dallas|     Texas|      75358|
| 200002|          Marja|      Shopcott|mshopcott1@hexun.com|     Female|[9542037028, 2128...|66 Prairieview Te...|           Joliet|  Illinois|      60435|
| 200003|           Dawn|       Tointon|  dtointon2@ucsd.edu|     Female|[9523035647, 2134...|18 Ronald Regan Hill|  Shawnee Mission|    Kansas|      66225|
| 200004|          Goldi|        Leaman|     gleaman3@360.

In [26]:
# Explicit pivot values for user_gender; they also fix the order of the pivoted columns.
gender = [
    "Male",
    "Female",
]

In [27]:
# User counts per state, pivoted into one column per gender, sorted by state.
(
    sms_users2
    .groupBy("state")
    .pivot("user_gender", gender)
    .agg(count("*"))
    .orderBy("state")
    .show()
)

+--------------------+-----+------+
|               state| Male|Female|
+--------------------+-----+------+
|                null|54440| 54541|
|             Alabama| 9307|  9178|
|              Alaska| 1882|  1938|
|             Arizona| 9406|  9543|
|            Arkansas| 2420|  2416|
|          California|49120| 48716|
|            Colorado|10128| 10125|
|         Connecticut| 5797|  5917|
|            Delaware| 1651|  1654|
|District of Columbia|14212| 14292|
|             Florida|36692| 36688|
|             Georgia|13008| 13028|
|              Hawaii| 2172|  2062|
|               Idaho| 2058|  2101|
|            Illinois|11178| 11267|
|             Indiana| 9604|  9676|
|                Iowa| 4706|  4726|
|              Kansas| 5962|  5776|
|            Kentucky| 6216|  6108|
|           Louisiana| 8706|  8631|
+--------------------+-----+------+
only showing top 20 rows



In [33]:
# Default parallelism reported by the SparkContext.
spark.sparkContext.defaultParallelism

2

In [34]:
# Read the airlines CSV files with the current maxPartitionBytes setting.
airlines_df = spark.read.format("csv").load("/public/airlines_all/airlines/")

In [35]:
# Partition count Spark planned for the airlines read.
airlines_df.rdd.getNumPartitions()

1919

Total number of files: 1,919 <br>
Average file size: 64 MB <br>
File open overhead (`spark.sql.files.openCostInBytes`): 4 MB <br>
Effective size of one file: 68 MB (64 MB + 4 MB) <br>

maxPartitionBytes = 128 MB: two files would need 2 × 68 MB = 136 MB, which exceeds 128 MB, so only one file can fit in one partition — hence 1,919 partitions.

In [37]:
# Re-check the maxPartitionBytes value in effect for the airlines read above.
spark.conf.get("spark.sql.files.maxPartitionBytes")

'134217728b'

In [38]:
# 140 MB expressed in bytes — the value used for the new maxPartitionBytes setting.
140 * 1024 * 1024

146800640

In [40]:
# Raise maxPartitionBytes to 140 MB (146800640 bytes) on the running session.
# The session already exists, so re-running the whole builder would just hand
# back the same session; only this runtime SQL option actually needs to change.
spark.conf.set("spark.sql.files.maxPartitionBytes", "146800640b")

In [42]:
# Read the same airlines CSV files again, now under the updated maxPartitionBytes.
airlines_df2 = spark.read.format("csv").load("/public/airlines_all/airlines/")

In [44]:
# Partition count for the second read of the airlines data.
airlines_df2.rdd.getNumPartitions()

960

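maxPartitionBytes = 140 MB: two files (2 × 68 MB = 136 MB) can fit in one partition, so the 1,919 files are packed two per partition, giving ⌈1919 / 2⌉ = 960 partitions.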