In [55]:
import pandas as pd
import numpy as np
import datetime as dt

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *

In [22]:
sc = pyspark.SparkContext()
ss = SparkSession(sc)

In [29]:
# ingest the file
fil = './LearningSparkV2-master/chapter2/py/src/data'
data = ss.read.load(fil, format='com.databricks.spark.csv', header='True', inferSchema='true')
print(type(data))
print('%d records'%data.count())
print('%s'%data.take(5))

<class 'pyspark.sql.dataframe.DataFrame'>
100006 records
[Row(State='TX', Color='Red', Count=20), Row(State='NV', Color='Blue', Count=66), Row(State='CO', Color='Blue', Count=79), Row(State='OR', Color='Blue', Count=71), Row(State='WA', Color='Yellow', Count=93)]


In [31]:
# agg by state & color
count_mnm_df = data.select("State", "Color", "Count").groupBy("State", "Color").agg(count('Count').alias('Total')).orderBy('Total', ascending=False)
count_mnm_df.show(n=60, truncate=False)
print("Total Rows = %d" % (count_mnm_df.count()))

+-----+------+-----+
|State|Color |Total|
+-----+------+-----+
|CA   |Yellow|1807 |
|WA   |Green |1779 |
|OR   |Orange|1743 |
|TX   |Green |1737 |
|TX   |Red   |1726 |
|CA   |Green |1723 |
|CO   |Yellow|1721 |
|CA   |Brown |1718 |
|CO   |Green |1713 |
|NV   |Orange|1712 |
|TX   |Yellow|1703 |
|NV   |Green |1698 |
|AZ   |Brown |1698 |
|CO   |Blue  |1697 |
|WY   |Green |1695 |
|NM   |Red   |1690 |
|AZ   |Orange|1689 |
|NM   |Yellow|1688 |
|NM   |Brown |1687 |
|UT   |Orange|1684 |
|NM   |Green |1682 |
|UT   |Red   |1680 |
|AZ   |Green |1676 |
|NV   |Yellow|1675 |
|NV   |Blue  |1675 |
|WA   |Red   |1671 |
|WY   |Red   |1670 |
|WA   |Brown |1669 |
|NM   |Orange|1665 |
|WY   |Blue  |1664 |
|WA   |Yellow|1664 |
|WA   |Orange|1658 |
|NV   |Brown |1657 |
|CA   |Orange|1657 |
|CA   |Red   |1656 |
|CO   |Brown |1656 |
|UT   |Blue  |1655 |
|AZ   |Yellow|1654 |
|TX   |Orange|1652 |
|AZ   |Red   |1648 |
|OR   |Blue  |1647 |
|OR   |Red   |1645 |
|UT   |Yellow|1645 |
|CO   |Orange|1642 |
|TX   |Brown 

In [32]:
ca_count_mnm_df = data.select("State", "Color", "Count").where(data.State == "CA").groupBy("State", "Color").agg(count('Count').alias('Total')).orderBy("Total", ascending=False)
ca_count_mnm_df.show(n=60, truncate=False)
print("Total Rows = %d" % (ca_count_mnm_df.count()))

+-----+------+-----+
|State|Color |Total|
+-----+------+-----+
|CA   |Yellow|1807 |
|CA   |Green |1723 |
|CA   |Brown |1718 |
|CA   |Orange|1657 |
|CA   |Red   |1656 |
|CA   |Blue  |1603 |
+-----+------+-----+

Total Rows = 6


In [28]:
data_df = ss.createDataFrame([("Brooke", 20), ("Denny", 31), ("Jules", 30), ("TD", 35), ("Brooke", 25)], ["name", "age"])
avg_df = data_df.groupBy("name").agg(avg("age"))
avg_df.show()

+------+--------+
|  name|avg(age)|
+------+--------+
|Brooke|    22.5|
| Jules|    30.0|
|    TD|    35.0|
| Denny|    31.0|
+------+--------+



In [41]:
# define a schema (2 ways)
schema = StructType([StructField('Id', IntegerType()), StructField('First', StringType()), StructField('Last', StringType()),
                     StructField('URL', StringType()), StructField('Published', DateType()), StructField('Hits', IntegerType()),
                     StructField('Campaigns', ArrayType(StringType()))])
# "`Id` INT, `First` STRING, `Last` STRING, `Url` STRING, `Published` STRING, `Hits` INT, `Campaigns` ARRAY<STRING>"

# create data
data = [[1, "Jules", "Damji", "https://tinyurl.1", dt.date(2016, 1, 4), 4535, ["twitter","LinkedIn"]],
        [2, "Brooke","Wenig", "https://tinyurl.2", dt.date(2018, 5, 5), 8908, ["twitter", "LinkedIn"]],
        [3, "Denny", "Lee", "https://tinyurl.3", dt.datetime(2019, 6, 7), 7659, ["web","twitter", "FB", "LinkedIn"]],
        [4, "Tathagata", "Das", "https://tinyurl.4", dt.date(2018, 5, 12), 10568,["twitter", "FB"]],
        [5, "Matei","Zaharia", "https://tinyurl.5", dt.date(2014, 5, 14), 40578, ["web","twitter", "FB", "LinkedIn"]],
        [6, "Reynold", "Xin", "https://tinyurl.6", dt.date(2015, 3, 2), 25568,["twitter", "LinkedIn"]]]

# put together as a spark dataframe
blogs_df = ss.createDataFrame(data, schema)
blogs_df.show()
blogs_df.printSchema()

+---+---------+-------+-----------------+----------+-----+--------------------+
| Id|    First|   Last|              URL| Published| Hits|           Campaigns|
+---+---------+-------+-----------------+----------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1|2016-01-04| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2|2018-05-05| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3|2019-06-07| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|2018-05-12|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|2014-05-14|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6|2015-03-02|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+----------+-----+--------------------+

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- URL: string (nullable = true)
 |-- Published: date (nullable = true)
 |-- Hits: i

In [45]:
# use same schema reading data from json - not working?
schema = StructType([StructField('Id', IntegerType()), StructField('First', StringType()), StructField('Last', StringType()),
                     StructField('URL', StringType()), StructField('Published', StringType()), StructField('Hits', IntegerType()),
                     StructField('Campaigns', ArrayType(StringType()))])

fil = './LearningSparkV2-master/chapter3/data'
fromjson_df = ss.read.schema(schema).json(fil)
fromjson_df.show()
fromjson_df.printSchema()

+----+-----+----+----+---------+----+---------+
|  Id|First|Last| URL|Published|Hits|Campaigns|
+----+-----+----+----+---------+----+---------+
|null| null|null|null|     null|null|     null|
|null| null|null|null|     null|null|     null|
|null| null|null|null|     null|null|     null|
|null| null|null|null|     null|null|     null|
|null| null|null|null|     null|null|     null|
|null| null|null|null|     null|null|     null|
|null| null|null|null|     null|null|     null|
|null| null|null|null|     null|null|     null|
|null| null|null|null|     null|null|     null|
|null| null|null|null|     null|null|     null|
|null| null|null|null|     null|null|     null|
|null| null|null|null|     null|null|     null|
|null| null|null|null|     null|null|     null|
|null| null|null|null|     null|null|     null|
|null| null|null|null|     null|null|     null|
|null| null|null|null|     null|null|     null|
|null| null|null|null|     null|null|     null|
|null| null|null|null|     null|null|   

In [75]:
print(blogs_df.columns)
print(blogs_df.select(col('Id')))
print(blogs_df.select('Id'))

['Id', 'First', 'Last', 'URL', 'Published', 'Hits', 'Campaigns']
DataFrame[Id: int]
DataFrame[Id: int]


In [56]:
blogs_df.select(expr("Hits*2")).show()

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
|     15318|
|     21136|
|     81156|
|     51136|
+----------+



In [64]:
blogs_df.withColumn('Influencer', expr('Hits > 10000')).orderBy('Influencer', ascending=False).show()

+---+---------+-------+-----------------+----------+-----+--------------------+----------+
| Id|    First|   Last|              URL| Published| Hits|           Campaigns|Influencer|
+---+---------+-------+-----------------+----------+-----+--------------------+----------+
|  5|    Matei|Zaharia|https://tinyurl.5|2014-05-14|40578|[web, twitter, FB...|      true|
|  4|Tathagata|    Das|https://tinyurl.4|2018-05-12|10568|       [twitter, FB]|      true|
|  6|  Reynold|    Xin|https://tinyurl.6|2015-03-02|25568| [twitter, LinkedIn]|      true|
|  1|    Jules|  Damji|https://tinyurl.1|2016-01-04| 4535| [twitter, LinkedIn]|     false|
|  2|   Brooke|  Wenig|https://tinyurl.2|2018-05-05| 8908| [twitter, LinkedIn]|     false|
|  3|    Denny|    Lee|https://tinyurl.3|2019-06-07| 7659|[web, twitter, FB...|     false|
+---+---------+-------+-----------------+----------+-----+--------------------+----------+



+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+--------------------+--------------------+-------------+---------+
|CallNumber|UnitID|IncidentNumber|        CallType|  CallDate| WatchDate|CallFinalDisposition|       AvailableDtTm|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|        Neighborhood|            Location|        RowID|    Delay|
+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+------------

In [None]:
sc.stop()