In [0]:
schema = "`Id` INT, `First` STRING, `Last` STRING, `Url` STRING, `Published` STRING, `Hits` INT, `Campaigns` ARRAY<STRING>"

data = [[1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter",
"LinkedIn"]],
 [2, "Brooke","Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter",
"LinkedIn"]],
 [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web",
"twitter", "FB", "LinkedIn"]],
 [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568,
["twitter", "FB"]],
 [5, "Matei","Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web",
"twitter", "FB", "LinkedIn"]],
 [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568,
["twitter", "LinkedIn"]]
 ]

In [0]:
blogsDF = spark.createDataFrame(data, schema)

In [0]:
display(blogsDF)

Id,First,Last,Url,Published,Hits,Campaigns
1,Jules,Damji,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)"
2,Brooke,Wenig,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)"
3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)"
4,Tathagata,Das,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)"
5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)"
6,Reynold,Xin,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)"


In [0]:
blogsDF.first() # Returns the the first row

Out[102]: Row(Id=1, First='Jules', Last='Damji', Url='https://tinyurl.1', Published='1/4/2016', Hits=4535, Campaigns=['twitter', 'LinkedIn'])

In [0]:
blogsDF.head() # Returns the the first row

Out[103]: Row(Id=1, First='Jules', Last='Damji', Url='https://tinyurl.1', Published='1/4/2016', Hits=4535, Campaigns=['twitter', 'LinkedIn'])

In [0]:
blogsDF.describe() # Computes basic statistics for numeric and string columns

Out[104]: DataFrame[summary: string, Id: string, First: string, Last: string, Url: string, Published: string, Hits: string]

In [0]:
blogsDF.describe().show() # Computes basic statistics for numeric and string columns

+-------+------------------+---------+-------+-----------------+---------+------------------+
|summary|                Id|    First|   Last|              Url|Published|              Hits|
+-------+------------------+---------+-------+-----------------+---------+------------------+
|  count|                 6|        6|      6|                6|        6|                 6|
|   mean|               3.5|     null|   null|             null|     null|16302.666666666666|
| stddev|1.8708286933869707|     null|   null|             null|     null|13970.659679008242|
|    min|                 1|   Brooke|  Damji|https://tinyurl.1| 1/4/2016|              4535|
|    max|                 6|Tathagata|Zaharia|https://tinyurl.6| 6/7/2019|             40578|
+-------+------------------+---------+-------+-----------------+---------+------------------+



In [0]:
blogsDF.count() # Returns the number of rows in the DataFrame

Out[106]: 6

In [0]:
blogsDF.createOrReplaceTempView("blog") # Create a temporary view based on the DataFrame. The lifetime of the temporary view is tied to the SparkSession that was used to create the DataFrame.

In [0]:
spark.sql("""SELECT * FROM blog""").show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [0]:
blogsDF.printSchema() # Access a dataframe's schema using the "schema" attribute.

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (nullable = true)
 |-- Campaigns: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [0]:
blogsDF.schema # Access a dataframe's schema using the "schema" attribute.

Out[110]: StructType(List(StructField(Id,IntegerType,true),StructField(First,StringType,true),StructField(Last,StringType,true),StructField(Url,StringType,true),StructField(Published,StringType,true),StructField(Hits,IntegerType,true),StructField(Campaigns,ArrayType(StringType,true),true)))

In [0]:
from pyspark.sql.functions import col

blogsDF.orderBy(col("Hits").desc()).show()   # Returns a sort expression based on descending order of the column

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [0]:
blogsDF.sort("Hits").show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [0]:
blogsDF.filter(col("Hits") > 10000).show() # Filters rows using the given condition

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [0]:
blogsDF.filter("Hits > 10000").show() # Filters rows using the given condition

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [0]:
nameBlogsDF = blogsDF.withColumnRenamed("First", "FirstName").show() # Returns a new DataFrame with a column renamed

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|FirstName|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [0]:
urlBlogsDF = blogsDF.select("Id", "Url") # Returns a new DataFrame by computing given expression for each element

display(urlBlogsDF)

Id,Url
1,https://tinyurl.1
2,https://tinyurl.2
3,https://tinyurl.3
4,https://tinyurl.4
5,https://tinyurl.5
6,https://tinyurl.6


In [0]:
newUrlBlogsDF = blogsDF.select("Id", col("Url").alias("webUrl"))

display(newUrlBlogsDF)

Id,webUrl
1,https://tinyurl.1
2,https://tinyurl.2
3,https://tinyurl.3
4,https://tinyurl.4
5,https://tinyurl.5
6,https://tinyurl.6


In [0]:
bigBlogsDF = blogsDF.selectExpr("Id", "Hits > 10000 as BigBlog") # Selects a list of SQL expressions

display(bigBlogsDF)

Id,BigBlog
1,False
2,False
3,False
4,True
5,True
6,True


In [0]:
anonymousBlogsDF = blogsDF.drop("First", "Last") # Returns a new DataFrame after dropping the given column, specified as a string or Column object
                                                 # Use strings to specify multiple columns
display(anonymousBlogsDF)

Id,Url,Published,Hits,Campaigns
1,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)"
2,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)"
3,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)"
4,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)"
5,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)"
6,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)"


In [0]:
anonymous2BlogsDF = (blogsDF.drop(col("First"))
                           .drop(col("Last"))
                    )
display(anonymous2BlogsDF)

Id,Url,Published,Hits,Campaigns
1,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)"
2,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)"
3,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)"
4,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)"
5,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)"
6,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)"


In [0]:
stringBlogsDF = blogsDF.withColumn("stringHits", col("Hits").cast("string"))

stringBlogsDF.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (nullable = true)
 |-- Campaigns: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- stringHits: string (nullable = true)



In [0]:
notNullBlogsDF = blogsDF.filter(col("Id").isNotNull())

display(notNullBlogsDF)

Id,First,Last,Url,Published,Hits,Campaigns
1,Jules,Damji,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)"
2,Brooke,Wenig,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)"
3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)"
4,Tathagata,Das,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)"
5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)"
6,Reynold,Xin,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)"


In [0]:
filterIdBlogsDF = blogsDF.filter(col("Id") > 3)

display(filterIdBlogsDF)

Id,First,Last,Url,Published,Hits,Campaigns
4,Tathagata,Das,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)"
5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)"
6,Reynold,Xin,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)"


In [0]:
filterIdBlogsDF = blogsDF.filter("Id > 3")

display(filterIdBlogsDF)

Id,First,Last,Url,Published,Hits,Campaigns
4,Tathagata,Das,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)"
5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)"
6,Reynold,Xin,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)"


In [0]:
namesBlogsDF = blogsDF.filter((col("First") == "Jules") | (col("First") == "Matei"))

display(namesBlogsDF)


Id,First,Last,Url,Published,Hits,Campaigns
1,Jules,Damji,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)"
5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)"


In [0]:
IdBlogsDF = blogsDF.filter("Id = 5 or Id = 2")

display(IdBlogsDF)

Id,First,Last,Url,Published,Hits,Campaigns
2,Brooke,Wenig,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)"
5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)"


In [0]:
Names2BlogsDF = blogsDF.filter("First = 'Jules' or First = 'Matei'")

display(Names2BlogsDF)

Id,First,Last,Url,Published,Hits,Campaigns
1,Jules,Damji,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)"
5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)"


In [0]:
sampleBlogsDF = blogsDF.sample(False, 0.2, 7)

display(sampleBlogsDF)

Id,First,Last,Url,Published,Hits,Campaigns
3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)"
6,Reynold,Xin,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)"


In [0]:
avgBlogsDF = blogsDF.groupBy("Campaigns").avg("Hits")

display(avgBlogsDF)

Campaigns,avg(Hits)
"List(twitter, LinkedIn)",13003.666666666666
"List(web, twitter, FB, LinkedIn)",24118.5
"List(twitter, FB)",10568.0


In [0]:
sumBlogsDF = blogsDF.groupBy("Campaigns").sum("Hits")

display(sumBlogsDF)

Campaigns,sum(Hits)
"List(twitter, LinkedIn)",39011
"List(web, twitter, FB, LinkedIn)",48237
"List(twitter, FB)",10568


In [0]:
from pyspark.sql.functions import sum 

sumBlogs2DF = blogsDF.groupBy("Campaigns").agg(sum("Hits").alias("sumHits"))

display(sumBlogs2DF)



Campaigns,sumHits
"List(twitter, LinkedIn)",39011
"List(web, twitter, FB, LinkedIn)",48237
"List(twitter, FB)",10568


In [0]:
from pyspark.sql.functions import element_at  # Returns element of array at given index. Array elements are numbered starting with 1.

campaignsBlogsDF = (blogsDF.withColumn("firstCampaign", element_at(col("Campaigns"), 1))
                           .withColumn("secondCampaign", element_at(col("Campaigns"), 2))
                   )
display(campaignsBlogsDF)

Id,First,Last,Url,Published,Hits,Campaigns,firstCampaign,secondCampaign
1,Jules,Damji,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)",twitter,LinkedIn
2,Brooke,Wenig,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)",twitter,LinkedIn
3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)",web,twitter
4,Tathagata,Das,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)",twitter,FB
5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)",web,twitter
6,Reynold,Xin,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)",twitter,LinkedIn


In [0]:
from pyspark.sql.functions import lit

newCampaignDF = campaignsBlogsDF.withColumn("newCampaign", lit("Instagram"))

display(newCampaignDF)

Id,First,Last,Url,Published,Hits,Campaigns,firstCampaign,secondCampaign,newCampaign
1,Jules,Damji,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)",twitter,LinkedIn,Instagram
2,Brooke,Wenig,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)",twitter,LinkedIn,Instagram
3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)",web,twitter,Instagram
4,Tathagata,Das,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)",twitter,FB,Instagram
5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)",web,twitter,Instagram
6,Reynold,Xin,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)",twitter,LinkedIn,Instagram


In [0]:
def initialsFunction(name, lastName): #define the function
    return name[0] + lastName[0]

initialsFunction("Jules", "Damji")

Out[132]: 'JD'

In [0]:
initialsUDF = udf(initialsFunction) # Create a UDF

In [0]:
udfBlogs = blogsDF.withColumn("Initials", initialsUDF("First", "Last")) # Apply the UDF

display(udfBlogs)

Id,First,Last,Url,Published,Hits,Campaigns,Initials
1,Jules,Damji,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)",JD
2,Brooke,Wenig,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)",BW
3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)",DL
4,Tathagata,Das,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)",TD
5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)",MZ
6,Reynold,Xin,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)",RX


In [0]:
initialsUDF = spark.udf.register("sql_udf", initialsFunction) # We can also register this UDF as a Spark SQL function.

In [0]:
%sql

SELECT sql_udf(First, Last) AS initials FROM blog

initials
JD
BW
DL
TD
MZ
RX


In [0]:
@udf("string")  # Define and register a UDF using Python decorator syntax. The @udf decorator parameter is the Column datatype the function returns.
def newInitialsUDF(name: str, lastName: str) -> str:
    return name[0] + lastName[0]

In [0]:
decoratorUdfBlogs = blogsDF.withColumn("Initials", newInitialsUDF("First", "Last")) 

display(decoratorUdfBlogs)

Id,First,Last,Url,Published,Hits,Campaigns,Initials
1,Jules,Damji,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)",JD
2,Brooke,Wenig,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)",BW
3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)",DL
4,Tathagata,Das,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)",TD
5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)",MZ
6,Reynold,Xin,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)",RX


In [0]:
assessPerformanceUDF = udf(assessPerformance, IntegerType())

storesDF.withColumn("result", assessPerformanceUDF(col("customerSatisfaction")))

In [0]:
from pyspark.sql.functions import upper

upperBlogsDF = blogsDF.withColumn("upperFirst", upper(col("First")))

display(upperBlogsDF)

Id,First,Last,Url,Published,Hits,Campaigns,upperFirst
1,Jules,Damji,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)",JULES
2,Brooke,Wenig,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)",BROOKE
3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)",DENNY
4,Tathagata,Das,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)",TATHAGATA
5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)",MATEI
6,Reynold,Xin,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)",REYNOLD


In [0]:
#Question 27 

In [0]:
from pyspark.sql.functions import lower

lowerBlogsDF = blogsDF.withColumn("First", lower("First"))

display(lowerBlogsDF)

Id,First,Last,Url,Published,Hits,Campaigns
1,jules,Damji,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)"
2,brooke,Wenig,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)"
3,denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)"
4,tathagata,Das,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)"
5,matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)"
6,reynold,Xin,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)"


In [0]:
from pyspark.sql.functions import lower

lowerBlogsDF = blogsDF.withColumn("First", lower(col("First")))

display(lowerBlogsDF)

Id,First,Last,Url,Published,Hits,Campaigns
1,jules,Damji,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)"
2,brooke,Wenig,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)"
3,denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)"
4,tathagata,Das,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)"
5,matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)"
6,reynold,Xin,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)"


In [0]:
from pyspark.sql.functions import split

splitBlogsDf = blogsDF.withColumn("splitPublished", (split(col("Published"), "/")))
                                  
display(splitBlogsDf)  

Id,First,Last,Url,Published,Hits,Campaigns,splitPublished
1,Jules,Damji,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)","List(1, 4, 2016)"
2,Brooke,Wenig,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)","List(5, 5, 2018)"
3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)","List(6, 7, 2019)"
4,Tathagata,Das,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)","List(5, 12, 2018)"
5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)","List(5, 14, 2014)"
6,Reynold,Xin,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)","List(3, 2, 2015)"


In [0]:
from pyspark.sql.functions import split

splitBlogsDF = blogsDF.withColumn("yearPublished", (split(col("Published"), "/")[2]))
                                  
display(splitBlogsDF)  

Id,First,Last,Url,Published,Hits,Campaigns,yearPublished
1,Jules,Damji,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)",2016
2,Brooke,Wenig,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)",2018
3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)",2019
4,Tathagata,Das,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)",2018
5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)",2014
6,Reynold,Xin,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)",2015


In [0]:
monthBlogsDf = blogsDF.withColumn("monthPublished", (split(col("Published"), "/")[1]))
                                  
display(monthBlogsDf)  

Id,First,Last,Url,Published,Hits,Campaigns,monthPublished
1,Jules,Damji,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)",4
2,Brooke,Wenig,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)",5
3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)",7
4,Tathagata,Das,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)",12
5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)",14
6,Reynold,Xin,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)",2


In [0]:
month2BlogsDf = splitBlogsDf.withColumn("monthPublished", element_at("splitPublished", 2))
                                  
display(month2BlogsDf) 

Id,First,Last,Url,Published,Hits,Campaigns,splitPublished,monthPublished
1,Jules,Damji,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)","List(1, 4, 2016)",4
2,Brooke,Wenig,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)","List(5, 5, 2018)",5
3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)","List(6, 7, 2019)",7
4,Tathagata,Das,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)","List(5, 12, 2018)",12
5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"List(web, twitter, FB, LinkedIn)","List(5, 14, 2014)",14
6,Reynold,Xin,https://tinyurl.6,3/2/2015,25568,"List(twitter, LinkedIn)","List(3, 2, 2015)",2


In [0]:
from pyspark.sql.functions import explode # Explode can be used to split an array column into an individual DataFrame row for each element in the array

explodeBlogsDF = splitBlogsDf.withColumn("exploded", explode("Campaigns"))

display(explodeBlogsDF)

Id,First,Last,Url,Published,Hits,Campaigns,splitPublished,exploded
1,Jules,Damji,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)","List(1, 4, 2016)",twitter
1,Jules,Damji,https://tinyurl.1,1/4/2016,4535,"List(twitter, LinkedIn)","List(1, 4, 2016)",LinkedIn
2,Brooke,Wenig,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)","List(5, 5, 2018)",twitter
2,Brooke,Wenig,https://tinyurl.2,5/5/2018,8908,"List(twitter, LinkedIn)","List(5, 5, 2018)",LinkedIn
3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)","List(6, 7, 2019)",web
3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)","List(6, 7, 2019)",twitter
3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)","List(6, 7, 2019)",FB
3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,"List(web, twitter, FB, LinkedIn)","List(6, 7, 2019)",LinkedIn
4,Tathagata,Das,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)","List(5, 12, 2018)",twitter
4,Tathagata,Das,https://tinyurl.4,5/12/2018,10568,"List(twitter, FB)","List(5, 12, 2018)",FB


In [0]:
explodeBlogsDF.take(1) # This operation can be used to return the top n rows from a DataFrame

Out[184]: [Row(Id=1, First='Jules', Last='Damji', Url='https://tinyurl.1', Published='1/4/2016', Hits=4535, Campaigns=['twitter', 'LinkedIn'], splitPublished=['1', '4', '2016'], exploded='twitter')]

In [0]:
explodeBlogsDF.first().Id  # Extract the value for column "Id" from the first row of DataFrame explodeBlogsDF

Out[186]: 1