In [11]:
import os
import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Obtain the Spark session for this notebook: getOrCreate() reuses an active
# session when there is one and otherwise starts a new one under this app name.
spark = (SparkSession
    .builder
    .appName("PythonMnMCount")
    .getOrCreate())

# Location of the M&M CSV data. Set the MNM_DATA_PATH environment variable to
# point at another copy of the dataset; the fallback is the path this notebook
# was originally written against, so existing runs behave exactly as before.
# NOTE(review): the fallback ends in ".../data", which looks like a directory
# rather than a single file — Spark would then read every file under it; confirm.
mnm_file = os.environ.get(
    "MNM_DATA_PATH",
    "C:/Users/alice.marchi/Downloads/LearningSparkV2-master/chapter2/scala/data",
)

# Read the CSV input, taking column names from the header row and letting
# Spark infer each column's type from the data.
mnm_df = (spark.read.format("csv")
 .option("header", "true")
 .option("inferSchema", "true")
 .load(mnm_file))


In [3]:
# Tally the rows recorded for every (State, Color) pair, largest tally first.
# NOTE(review): count("Count") counts non-null entries; it does not add up the
# Count values, despite the "Total" alias — confirm this is the intended metric.
mnm_by_state_color = (
    mnm_df
    .select("State", "Color", "Count")
    .groupBy("State", "Color")
)
count_mnm_df = (
    mnm_by_state_color
    .agg(count("Count").alias("Total"))
    .orderBy("Total", ascending=False)
)

# Preview the ten largest groups without truncating cell contents.
count_mnm_df.show(n=10, truncate=False)

# Report how many (State, Color) groups the aggregation produced.
group_total = count_mnm_df.count()
print("Total Rows = %d" % group_total)

+-----+------+-----+
|State|Color |Total|
+-----+------+-----+
|CA   |Yellow|1807 |
|WA   |Green |1779 |
|OR   |Orange|1743 |
|TX   |Green |1737 |
|TX   |Red   |1725 |
|CA   |Green |1723 |
|CO   |Yellow|1721 |
|CA   |Brown |1718 |
|CO   |Green |1713 |
|NV   |Orange|1712 |
+-----+------+-----+
only showing top 10 rows

Total Rows = 60


In [4]:
# Keep only the California (State == "CA") records ...
ca_rows = (
    mnm_df
    .select("State", "Color", "Count")
    .where(mnm_df.State == "CA")
)

# ... then count the rows per colour and rank the colours by that count,
# highest first.
ca_count_mnm_df = (
    ca_rows
    .groupBy("State", "Color")
    .agg(count("Count").alias("Total"))
    .orderBy("Total", ascending=False)
)

# Show up to ten result rows at full column width.
ca_count_mnm_df.show(n=10, truncate=False)

+-----+------+-----+
|State|Color |Total|
+-----+------+-----+
|CA   |Yellow|1807 |
|CA   |Green |1723 |
|CA   |Brown |1718 |
|CA   |Orange|1657 |
|CA   |Red   |1656 |
|CA   |Blue  |1603 |
+-----+------+-----+



In [8]:
# Keep only the Texas (State == "TX") records ...
tx_rows = (
    mnm_df
    .select("State", "Color", "Count")
    .where(mnm_df.State == "TX")
)

# ... then count the rows per colour and rank the colours by that count,
# highest first.
tx_count_mnm_df = (
    tx_rows
    .groupBy("State", "Color")
    .agg(count("Count").alias("Total"))
    .orderBy("Total", ascending=False)
)

# Show up to ten result rows at full column width.
tx_count_mnm_df.show(n=10, truncate=False)

+-----+------+-----+
|State|Color |Total|
+-----+------+-----+
|TX   |Green |1737 |
|TX   |Red   |1725 |
|TX   |Yellow|1703 |
|TX   |Orange|1652 |
|TX   |Brown |1641 |
|TX   |Blue  |1614 |
+-----+------+-----+



In [27]:
# Keep only the Nevada (State == "NV") records ...
nv_rows = (
    mnm_df
    .select("State", "Color", "Count")
    .where(mnm_df.State == "NV")
)

# ... then compute the mean of the Count column per colour, exposed as the
# "avg" column, and rank the colours by that mean, highest first.
nv_avg_mnm_df = (
    nv_rows
    .groupBy("State", "Color")
    .agg(avg("Count").alias("avg"))
    .orderBy("avg", ascending=False)
)

# Show up to ten result rows at full column width.
nv_avg_mnm_df.show(n=10, truncate=False)

+-----+------+------------------+
|State|Color |avg               |
+-----+------+------------------+
|NV   |Brown |55.81050090525045 |
|NV   |Red   |55.4944099378882  |
|NV   |Orange|54.865070093457945|
|NV   |Yellow|54.561194029850746|
|NV   |Blue  |53.797369994022716|
|NV   |Green |53.78739693757362 |
+-----+------+------------------+



In [28]:
# Row count per (State, Color) pair across all states, largest first. This is
# the same aggregation as count_mnm_df built earlier in the notebook, under the
# name that the "states_count" temp view is registered from.
all_states_grouped = (
    mnm_df
    .select("State", "Color", "Count")
    .groupBy("State", "Color")
)
states_count_mnm_df = (
    all_states_grouped
    .agg(count("Count").alias("Total"))
    .orderBy("Total", ascending=False)
)

# Show the ten largest groups at full column width.
states_count_mnm_df.show(n=10, truncate=False)

+-----+------+-----+
|State|Color |Total|
+-----+------+-----+
|CA   |Yellow|1807 |
|WA   |Green |1779 |
|OR   |Orange|1743 |
|TX   |Green |1737 |
|TX   |Red   |1725 |
|CA   |Green |1723 |
|CO   |Yellow|1721 |
|CA   |Brown |1718 |
|CO   |Green |1713 |
|NV   |Orange|1712 |
+-----+------+-----+
only showing top 10 rows



In [29]:
# Register the per-(State, Color) counts as a temporary view named
# "states_count" so the following cells can query it through spark.sql();
# any existing temp view with that name is replaced.
states_count_mnm_df.createOrReplaceTempView("states_count")

In [30]:
# Fetch the first five rows of the "states_count" view as a list of Row objects.
select_all_query = "SELECT * from states_count"
spark.sql(select_all_query).take(5)

[Row(State='CA', Color='Yellow', Total=1807),
 Row(State='WA', Color='Green', Total=1779),
 Row(State='OR', Color='Orange', Total=1743),
 Row(State='TX', Color='Green', Total=1737),
 Row(State='TX', Color='Red', Total=1725)]

In [35]:
# Largest Total per (State, Color), highest first; keep the first five rows.
# NOTE(review): states_count is built with one row per (State, Color), so
# grouping by both columns makes MAX(Total) equal to Total itself. If the goal
# is the maximum per state, the grouping probably needs to change — confirm.
max_total_query = "SELECT State, Color, MAX(Total) AS MaxTotal FROM states_count GROUP BY State, Color ORDER BY MaxTotal DESC"
spark.sql(max_total_query).take(5)

[Row(State='CA', Color='Yellow', MaxTotal=1807),
 Row(State='WA', Color='Green', MaxTotal=1779),
 Row(State='OR', Color='Orange', MaxTotal=1743),
 Row(State='TX', Color='Green', MaxTotal=1737),
 Row(State='TX', Color='Red', MaxTotal=1725)]

In [36]:
# Smallest Total per (State, Color), sorted descending; keep the first five rows.
# NOTE(review): states_count is built with one row per (State, Color), so
# grouping by both columns makes MIN(Total) equal to Total itself, and the
# DESC ordering returns the largest values rather than the smallest. The
# grouping and/or sort direction probably need revisiting — confirm intent.
min_total_query = "SELECT State, Color, MIN(Total) AS MinTotal FROM states_count GROUP BY State, Color ORDER BY MinTotal DESC"
spark.sql(min_total_query).take(5)

[Row(State='CA', Color='Yellow', MinTotal=1807),
 Row(State='WA', Color='Green', MinTotal=1779),
 Row(State='OR', Color='Orange', MinTotal=1743),
 Row(State='TX', Color='Green', MinTotal=1737),
 Row(State='TX', Color='Red', MinTotal=1725)]