In [7]:
import findspark
from pyspark.sql import SparkSession
import pyspark

In [8]:
# Initialise findspark with its default arguments.
# NOTE(review): `pyspark` and `SparkSession` are already imported in the cell
# above, i.e. before this call runs — confirm whether this call should come
# before those imports.
findspark.init()

In [9]:
# Build (or fetch) the SparkSession for this notebook: master "local[*]",
# application name 'Chapter3'.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('Chapter3')
    .getOrCreate()
)

# Handle to the underlying SparkContext, used by the RDD examples.
sc = spark.sparkContext

In [10]:
# Create an RDD of (name, age) tuples
dataRDD = sc.parallelize([('Brooke', 20), ('Denny', 31), ("Jules", 30),
                          ('TD', 35), ('Brooke', 25)])

# Compute the average age per name with map / reduceByKey:
#   1. map         -> (name, (age, 1))            pair each age with a count of 1
#   2. reduceByKey -> (name, (age_sum, count))    sum ages and counts per name
#   3. mapValues   -> (name, age_sum / count)     divide, keeping the name as key
# The name must be kept in the final step; otherwise the result is a bare
# list of averages that cannot be matched back to the people they belong to.
agesRRDD = (dataRDD
            .map(lambda x: (x[0], (x[1], 1)))
            .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
            .mapValues(lambda sum_count: sum_count[0] / sum_count[1]))

In [13]:
from pyspark.sql.functions import avg

# Build a two-column DataFrame (name, age) from in-memory tuples
data_df = spark.createDataFrame(
    [('Brooke', 20), ('Denny', 31), ("Jules", 30), ('TD', 35), ('Brooke', 25)],
    schema=['name', 'age'],
)

# Average the ages of rows that share a name
avg_df = (
    data_df
    .groupBy('name')
    .agg(avg('age'))
)
# Trigger execution and print the aggregated result
avg_df.show()

+------+--------+
|  name|avg(age)|
+------+--------+
|Brooke|    22.5|
| Denny|    31.0|
| Jules|    30.0|
|    TD|    35.0|
+------+--------+



In [14]:
from pyspark.sql.types import *
# Programmatic schema: three fields, each declared non-nullable (third
# StructField argument is False).
# `pages` is an integer field so that this definition agrees with the DDL
# form of the same schema ('... pages INT').
schema = StructType([StructField('author', StringType(), False),
                     StructField('title', StringType(), False),
                     StructField('pages', IntegerType(), False)])

In [15]:
# DDL schema: the author/title/pages columns written as a single
# comma-separated "name TYPE" string. This rebinds `schema`.
schema = 'author STRING, title STRING, pages INT'

In [18]:
# Define the schema for the blog data as a DDL string: backtick-quoted column
# names, each followed by its type. This rebinds `schema`.
# NOTE(review): `Campaings` looks like a misspelling of "Campaigns"; the name
# is left unchanged because it is the column name shown in the recorded output.
schema = "`Id` INT,`First` STRING, `Last` STRING, `Url` STRING, `Published` STRING, `Hits` INT, `Campaings` ARRAY<STRING>"

# Static sample rows, one list per blog record. Values are positional:
# id, first name, last name, url, published date, hits, list of campaigns.
data = [
    [1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535,
     ["twitter", "LinkedIn"]],
    [2, "Brooke", "Wenig", "https://tinyurl.2", "5/5/2018", 8908,
     ["twitter", "LinkedIn"]],
    [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659,
     ["web", "twitter", "FB", "LinkedIn"]],
    [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568,
     ["twitter", "FB"]],
    [5, "Matei", "Zaharia", "https://tinyurl.5", "5/14/2014", 40578,
     ["web", "twitter", "FB", "LinkedIn"]],
    [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568,
     ["twitter", "LinkedIn"]],
]

# Main program
if __name__ == '__main__':
    # Get the active SparkSession, or create one named 'Example-3.6'
    spark = (SparkSession
             .builder
             .appName('Example-3.6')
             .getOrCreate())
    # Create a DataFrame from the rows in `data` using the DDL `schema` above
    blogs_df = spark.createDataFrame(data, schema)
    # Show the DataFrame contents as a table
    blogs_df.show()
    # printSchema() writes the schema tree to stdout itself and returns None,
    # so it is called directly; wrapping it in print() would emit a stray "None".
    blogs_df.printSchema()
    

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaings|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (nullable = true)
 |-- Campaings: array (nullable = true)
 |    |-- element: string (containsNull = true)

In [19]:
# Display the DataFrame's schema object (the StructType repr shown below)
blogs_df.schema

StructType(List(StructField(Id,IntegerType,true),StructField(First,StringType,true),StructField(Last,StringType,true),StructField(Url,StringType,true),StructField(Published,StringType,true),StructField(Hits,IntegerType,true),StructField(Campaings,ArrayType(StringType,true),true)))

In [21]:
from pyspark.sql import Row
# Build a Row for blog record 6. Values are positional and follow the blog
# schema order (Id, First, Last, Url, Published, Hits, Campaings), so the
# published date comes before the hit count, and the values match the
# corresponding entry in `data`.
blog_row = Row(6, 'Reynold', 'Xin', 'https://tinyurl.6', '3/2/2015', 25568,
               ['twitter', 'LinkedIn'])
# Access using index for individual items (index 1 is the first name)
blog_row[1]

'Reynold'

In [22]:
# Turn each (author, state) pair into a Row, then build a DataFrame whose
# columns are named 'Authors' and 'State'.
# NOTE(review): 'Matel Zaharta' is probably a misspelling of 'Matei Zaharia'
# (the spelling used in the blog data); kept verbatim to match the recorded output.
rows = [Row(*fields) for fields in (('Matel Zaharta', 'CA'),
                                    ('Reynold Xin', 'CA'))]
authors_df = spark.createDataFrame(rows, schema=['Authors', 'State'])

In [23]:
# Print the authors DataFrame as a text table
authors_df.show()

+-------------+-----+
|      Authors|State|
+-------------+-----+
|Matel Zaharta|   CA|
|  Reynold Xin|   CA|
+-------------+-----+

