In [0]:
# DDL-style schema string: column names and Spark SQL types for the blog rows below
schema = "`Id` INT, `First` STRING, `Last` STRING, `Url` STRING, `Published` STRING, `Hits` INT, `Campaigns` ARRAY<STRING>"

# Create our static data (one list per row, in the same order as the schema columns)
data = [
    [1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
    [2, "Brooke", "Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
    [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web", "twitter", "FB", "LinkedIn"]],
    [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
    [5, "Matei", "Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
    [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]],
]

# Create a DataFrame using the schema defined above
blogs_df = spark.createDataFrame(data=data, schema=schema)
# Show the DataFrame; it should reflect our table above
blogs_df.show()
# Print the schema used by Spark to process the DataFrame.
# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly: wrapping it in print() would emit an extra "None" line.
blogs_df.printSchema()


In [1]:
# Spark's sum is imported under an alias so that Python's builtin sum() is not
# shadowed in every cell that runs after this one.
from pyspark.sql.functions import col, sum as spark_sum

# Total number of hits across all blogs, shown as a single-row result.
# The derived full_name column is added before the aggregation, but a global
# agg() keeps only the aggregated "Hits" column in its output.
(
    blogs_df
    .selectExpr("*", "concat(first, ' ', last) full_name")
    .agg(spark_sum(col("Hits")).alias("Hits"))
    .show()
)

In [2]:
# `f` is used below, so it is imported in this cell: on a fresh kernel nothing
# earlier in the notebook has bound it yet and f.count would raise NameError.
import pyspark.sql.functions as f

# Load the events table registered in the catalog as market.events
events_df = spark.table("market.events")

# Number of event rows per brand
grouped = events_df.groupBy("brand").agg(f.count("*"))

In [3]:
# Display the per-brand aggregate stored in `grouped`
grouped.show()

In [4]:
from pyspark.sql.functions import expr

# Display only the blog rows whose hit count exceeds 10,000
(
    blogs_df
    .filter("hits > 10000")
    .show()
)


In [5]:
# Evaluate the application id as the cell's last expression so it is displayed.
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably it is
# the SparkContext injected by the notebook runtime. Confirm before relying on it.
sc.applicationId

 

What were all the different types of fire calls in 2018?
What months within the year 2018 saw the highest number of fire calls?
Which neighborhood in San Francisco generated the most fire calls in 2018?
Which neighborhoods had the worst response times to fire calls in 2018?
Which week of the year 2018 had the most fire calls?
Is there a correlation between neighborhood, zip code, and number of fire calls?
How can we use Parquet files or SQL tables to store this data and read it back?


In [8]:
%sparkLocal.pyspark

spark.read.option("header", True).option("inferSchema", True).csv("data/sf-fire-calls.csv") \
.write.parquet("data/sf-fire-calls.parquet")

In [9]:
# Read the Parquet copy of the fire-calls data.
# The format is named explicitly instead of relying on the session's default
# data source for load(), and the CSV-only "header" option is dropped because
# Parquet files carry their own schema.
fireDF = spark.read.parquet("data/sf-fire-calls.parquet")
fireDF.printSchema()
# First 100 rows, without truncating long cell values
fireDF.show(100, False)


In [10]:
import pyspark.sql.functions as f

# Distinct call types recorded in 2018.
# CallDate is parsed month-first ("MM/dd/yyyy"): the SF fire-calls data stores
# dates as month/day/year, so a day-first pattern turns every date whose day is
# greater than 12 into null and swaps day and month for the rest.
# NOTE(review): confirm the pattern against a few raw CallDate values.
(
    fireDF
    .withColumn("CallDate", f.to_timestamp(f.col("CallDate"), "MM/dd/yyyy"))
    .withColumn("CallYear", f.year("CallDate"))
    # f.year() yields an integer, so compare with an integer literal
    .where(f.col("CallYear") == 2018)
    .select("CallType")
    .distinct()
    .show(1000, False)
)


In [11]:
# Number of distinct calls per month of 2018, busiest month first.
# CallDate is parsed month-first ("MM/dd/yyyy"): the SF fire-calls data stores
# dates as month/day/year, so a day-first pattern would put the day of the month
# into CallMonth and null out every date whose day is greater than 12.
# NOTE(review): confirm the pattern against a few raw CallDate values.
(
    fireDF
    .withColumn("CallDate", f.to_timestamp(f.col("CallDate"), "MM/dd/yyyy"))
    .withColumn("CallMonth", f.month("CallDate"))
    .withColumn("CallYear", f.year("CallDate"))
    # f.year() yields an integer, so compare with an integer literal
    .where(f.col("CallYear") == 2018)
    .groupBy("CallMonth")
    .agg(f.countDistinct("callNumber").alias("call_count"))
    .orderBy("call_count", ascending=False)
    .show()
)


In [13]:
# Number of distinct calls per neighborhood, busiest neighborhood first.
# countDistinct is referenced through the `f` module alias: the bare name is
# never imported in this notebook and would raise NameError.
# NOTE(review): no year filter is applied here, so the counts cover every year
# in the data; add one if this is meant to answer a 2018-only question.
(
    fireDF
    .groupBy("Neighborhood")
    .agg(f.countDistinct("callNumber").alias("call_count"))
    .orderBy("call_count", ascending=False)
    .show(1000, False)
)

In [14]:
# Calls whose type mentions "fire" (case-insensitive), listed with their
# neighborhood and delay, largest delay first
(
    fireDF
    .filter("lower(CallType) like '%fire%'")
    .select("Neighborhood", "delay", "callType")
    .orderBy("delay", ascending=False)
    .show()
)

In [15]:
# Number of distinct fire-related calls per week of 2018, busiest week first.
# CallDate is parsed month-first ("MM/dd/yyyy"): the SF fire-calls data stores
# dates as month/day/year, so a day-first pattern would compute the week from a
# date with day and month swapped and null out dates whose day is greater than 12.
# NOTE(review): confirm the pattern against a few raw CallDate values.
(
    fireDF
    .where("lower(CallType) like '%fire%'")
    .withColumn("CallDate", f.to_timestamp(f.col("CallDate"), "MM/dd/yyyy"))
    .withColumn("CallWeek", f.weekofyear("CallDate"))
    .withColumn("CallYear", f.year("CallDate"))
    # f.year() yields an integer, so compare with an integer literal
    .where(f.col("CallYear") == 2018)
    .groupBy("CallYear", "CallWeek")
    .agg(f.countDistinct("callNumber").alias("call_count"))
    .orderBy("call_count", ascending=False)
    .show()
)


In [18]:
# Three sample people; the columns are named id, name, age, country
df1 = (
    spark.createDataFrame(
        [
            (1, "andy", 20, "USA"),
            (2, "jeff", 23, "China"),
            (3, "james", 18, "USA"),
        ]
    )
    .toDF("id", "name", "age", "country")
)

# Build a UDF from a Python lambda and apply it to the name column
from pyspark.sql.functions import udf

udf1 = udf(lambda name: name.upper())
df2 = df1.select(udf1(df1["name"]))
df2.show()

# A UDF can also be used in filter(); in that case its return type must be Boolean.
# The @udf decorator is another way to create a UDF.
# NOTE(review): the wildcard import is kept because other cells may rely on the
# names it brings in; only BooleanType is needed by this block.
from pyspark.sql.types import *


@udf(returnType=BooleanType())
def udf2(e):
    """Return True when the age ``e`` is at least 20.

    A missing age (None) yields False instead of raising TypeError on the
    comparison, so rows with a null age are simply filtered out.
    """
    return e is not None and e >= 20


# Keep only the people for whom udf2 returns True
df3 = df1.filter(udf2(df1["age"]))
df3.show()

# A UDF may take several columns as arguments: here name and country are
# joined with an underscore into one name_country column
udf3 = udf(lambda left, right: left + "_" + right)
df4 = df1.select(
    udf3(df1["name"], df1["country"]).alias("name_country")
)
df4.show()


In [20]:
# People, each carrying a c_id key that refers to the country table below.
# The chained call is wrapped in parentheses: a line break before .toDF(...)
# without parentheses or a trailing backslash is a syntax error.
df1 = (
    spark.createDataFrame([
        (1, "andy", 20, 1),
        (2, "jeff", 23, 2),
        (3, "james", 18, 3),
    ])
    .toDF("id", "name", "age", "c_id")
)

df1.show()

# Country lookup table keyed by c_id (there is no row for c_id 3)
df2 = spark.createDataFrame([(1, "USA"), (2, "China")]).toDF("c_id", "c_name")
df2.show()

# When both sides share the key name, passing the name is enough
df3 = df1.join(df2, on=["c_id"], how="inner")
df3.show()

# The join condition can also be written explicitly, which is what you need
# when the key is named differently in the two tables
df4 = df1.join(df2, df1.c_id == df2.c_id)
df4.show()

# The join type follows the join condition; it defaults to an inner join
df5 = df1.join(df2, df1.c_id == df2.c_id, how="left_outer")
df5.show()

In [21]:
# People identified by a composite key (key_1, key_2)
df1 = (
    spark.createDataFrame(
        [
            ("andy", 20, 1, 1),
            ("jeff", 23, 1, 2),
            ("james", 12, 2, 2),
        ]
    )
    .toDF("name", "age", "key_1", "key_2")
)
df1.show()

# Countries identified by the same composite key
df2 = (
    spark.createDataFrame([(1, 1, "USA"), (2, 2, "China")])
    .toDF("key_1", "key_2", "country")
)
df2.show()

# Join on 2 fields: key_1, key_2

# Pass a list of field names when the join fields are named the same in both tables
df3 = df1.join(df2, ["key_1", "key_2"])
df3.show()

# Or spell out the join condition explicitly, which is required when the join
# fields are named differently in the two tables
same_key_1 = df1["key_1"] == df2["key_1"]
same_key_2 = df1["key_2"] == df2["key_2"]
df4 = df1.join(df2, same_key_1 & same_key_2)
df4.show()



In [22]:
# Three sample people; the columns are named id, name, age, country
df1 = (
    spark.createDataFrame([
        (1, "andy", 20, "USA"),
        (2, "jeff", 23, "China"),
        (3, "james", 18, "USA"),
    ])
    .toDF("id", "name", "age", "country")
)

# Register the DataFrame as a temporary view so it can be queried with SQL
df1.createOrReplaceTempView("people")

# The method name was missing after `spark.` (a syntax error); spark.sql()
# runs the query against the temp view and returns the result as a DataFrame.
df2 = spark.sql("select name, age from people")
df2.show()