In [3]:
from os.path import abspath
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Absolute path of the Spark SQL warehouse directory, resolved once against
# the current working directory and reused in the session config below.
warehouse_location = abspath('spark-warehouse')


def read_multiline_json(session, path):
    """Read a JSON file whose records span several lines.

    Only the ``multiline`` option is set. The ``header`` and ``inferSchema``
    options belong to the CSV reader and are not acted on by the JSON
    source, and ``.json()`` already selects the format, so an explicit
    ``.format('json')`` is unnecessary.

    Args:
        session: Object exposing a ``read`` attribute with the
            DataFrameReader interface (here, the SparkSession).
        path: Location of the JSON file to load.

    Returns:
        The result of ``session.read.option(...).json(path)``.
    """
    return session.read.option("multiline", "true").json(path)


if __name__ == '__main__':
    # Reuse an existing session when one is active; otherwise build one with
    # Hive support and the warehouse directory computed above.
    spark = (
        SparkSession.builder
        .appName("etl-posts-py")
        .config("spark.sql.warehouse.dir", warehouse_location)
        .enableHiveSupport()
        .getOrCreate()
    )

    spark.sparkContext.setLogLevel("INFO")

    # Input locations, relative to the current working directory.
    get_users = "./data/user.json"
    get_posts = "./data/posts.json"
    get_comments = "./data/comments.json"

    # One DataFrame per source file, all loaded with the same reader settings.
    dataframe_users = read_multiline_json(spark, get_users)
    dataframe_posts = read_multiline_json(spark, get_posts)
    dataframe_comments = read_multiline_json(spark, get_comments)


In [10]:
# describe() returns a new DataFrame rather than printing statistics, so this
# cell displays only its schema repr (see the output below). Chain .show() to
# render the summary rows themselves.
dataframe_posts.describe()

DataFrame[summary: string, body: string, id: string, title: string, userId: string]

In [15]:
# Expose each DataFrame to Spark SQL under a short view name; re-running the
# cell simply replaces any view that already exists with the same name.
for view_name, frame in (
    ('users', dataframe_users),
    ('posts', dataframe_posts),
    ('comments', dataframe_comments),
):
    frame.createOrReplaceTempView(view_name)

In [20]:
# Join users -> posts -> comments through the temp views: one output row per
# comment, carrying the post body and the name of the post's author.
posts_with_comments_query = """
        SELECT 
            u.name as author,
            p.body as postagem,
            c.body as comentarios,
            c.email
        FROM users as u
        INNER JOIN posts as p
        ON u.id = p.userId
        INNER JOIN comments as c
        ON p.id == c.postId
    """
data_join = spark.sql(posts_with_comments_query)

In [21]:
# Print a preview table of the joined rows. Long string values are cut short
# (note the trailing "..." in the output below).
data_join.show()

+-------------+--------------------+--------------------+--------------------+
|       author|            postagem|         comentarios|               email|
+-------------+--------------------+--------------------+--------------------+
|Leanne Graham|quia et suscipit
...|harum non quasi e...|   Hayden@althea.biz|
|Leanne Graham|quia et suscipit
...|non et atque
occa...|       Lew@alysha.tv|
|Leanne Graham|quia et suscipit
...|quia molestiae re...| Nikita@garfield.biz|
|Leanne Graham|quia et suscipit
...|est natus enim ni...|Jayne_Kuhic@sydne...|
|Leanne Graham|quia et suscipit
...|laudantium enim q...|  Eliseo@gardner.biz|
|Leanne Graham|est rerum tempore...|voluptate iusto q...|Carmen_Keeling@ca...|
|Leanne Graham|est rerum tempore...|sapiente assumend...|Meghan_Littel@ren...|
|Leanne Graham|est rerum tempore...|ut voluptatem cor...|Mallory_Kunze@mar...|
|Leanne Graham|est rerum tempore...|maiores sed dolor...|       Dallas@ole.me|
|Leanne Graham|est rerum tempore...|doloribus at sed

In [22]:
# Number of rows produced by the users/posts/comments join.
data_join.count()

500