In [2]:
from hdfs import InsecureClient
import os

# Client for the cluster's NameNode HTTP endpoint, acting as the `hdfs` user.
client = InsecureClient(url='http://namenode:50070', user='hdfs')

# Destination directory in HDFS and source directory on the notebook host.
hdfs_path = '/user/sebas'
local_path = '/home/jovyan/work/yelp_dataset'

# Upload every JSON file found directly under `local_path` to `hdfs_path`.
# sorted() makes the upload order deterministic across runs.
for filename in sorted(os.listdir(local_path)):
    if not filename.endswith('.json'):
        continue
    local_file = os.path.join(local_path, filename)
    # HDFS paths always use '/', independent of the local OS separator, so the
    # remote path is built explicitly instead of with os.path.join.
    hdfs_file = f'{hdfs_path}/{filename}'
    # overwrite=True keeps this cell re-runnable: without it, a second run
    # fails because the remote file already exists.
    client.upload(hdfs_file, local_file, overwrite=True)

In [14]:
from pyspark.sql import SparkSession

# Initialize a Spark session with Hive support; the warehouse directory is
# placed on HDFS so created tables are stored on the cluster.
spark = (
    SparkSession.builder
    .appName("Hive Table Creation")
    .config("spark.sql.warehouse.dir", "hdfs://namenode:8020/user/hive/warehouse")
    .enableHiveSupport()
    .getOrCreate()
)


# Read the JSON files (Spark infers the schema while reading).
# The URIs use a single "/" after the authority; a doubled "//" only resolves
# because Hadoop collapses duplicate path separators.
user_json = "hdfs://namenode:8020/user/sebas/yelp_academic_dataset_user.json"
df_user = spark.read.json(user_json)

review_json = "hdfs://namenode:8020/user/sebas/yelp_academic_dataset_review.json"
df_review = spark.read.json(review_json)

business_json = "hdfs://namenode:8020/user/sebas/yelp_academic_dataset_business.json"
df_business = spark.read.json(business_json)


# Print the inferred schemas
print("User Schema")
df_user.printSchema()

print("Review Schema")
df_review.printSchema()

print("Business Schema")
df_business.printSchema()


# Create temporary views so the DataFrames can be referenced from SQL
df_user.createOrReplaceTempView("temp_user_table")
df_review.createOrReplaceTempView("temp_review_table")
df_business.createOrReplaceTempView("temp_business_table")


# Create a Hive table from each temporary view.
# IF NOT EXISTS keeps the cell re-runnable: a plain CREATE TABLE raises once
# the table is already present. Note that an existing table is left untouched,
# so drop it first if the source JSON has changed.
for table_name, view_name in (
    ("user", "temp_user_table"),
    ("review", "temp_review_table"),
    ("business", "temp_business_table"),
):
    spark.sql(f"CREATE TABLE IF NOT EXISTS {table_name} AS SELECT * FROM {view_name}")


# Stop the Spark session
spark.stop()

User Schema
root
 |-- average_stars: double (nullable = true)
 |-- compliment_cool: long (nullable = true)
 |-- compliment_cute: long (nullable = true)
 |-- compliment_funny: long (nullable = true)
 |-- compliment_hot: long (nullable = true)
 |-- compliment_list: long (nullable = true)
 |-- compliment_more: long (nullable = true)
 |-- compliment_note: long (nullable = true)
 |-- compliment_photos: long (nullable = true)
 |-- compliment_plain: long (nullable = true)
 |-- compliment_profile: long (nullable = true)
 |-- compliment_writer: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- elite: string (nullable = true)
 |-- fans: long (nullable = true)
 |-- friends: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)
 |-- yelping_since: string (nullable = true)

Review Schema
root
 |-- business_id: string (nullable = t

In [15]:
# Open (or reuse) a Hive-enabled Spark session pointing at the HDFS warehouse.
spark = (
    SparkSession.builder
    .appName("Hive Integration SQL")
    .config("spark.sql.warehouse.dir", "hdfs://namenode:8020/user/hive/warehouse")
    .enableHiveSupport()
    .getOrCreate()
)

# Load each Hive table into a DataFrame
df_user = spark.sql("SELECT * FROM user")
df_review = spark.sql("SELECT * FROM review")
df_business = spark.sql("SELECT * FROM business")

# For every table: preview the first two rows, then report (rows, columns).
for caption, frame in (
    ("User shape:", df_user),
    ("Review shape:", df_review),
    ("Business shape:", df_business),
):
    frame.show(2)
    print(caption, (frame.count(), len(frame.columns)))

+-------------+---------------+---------------+----------------+--------------+---------------+---------------+---------------+-----------------+----------------+------------------+-----------------+-----+--------------------+----+--------------------+-----+------+------------+------+--------------------+-------------------+
|average_stars|compliment_cool|compliment_cute|compliment_funny|compliment_hot|compliment_list|compliment_more|compliment_note|compliment_photos|compliment_plain|compliment_profile|compliment_writer| cool|               elite|fans|             friends|funny|  name|review_count|useful|             user_id|      yelping_since|
+-------------+---------------+---------------+----------------+--------------+---------------+---------------+---------------+-----------------+----------------+------------------+-----------------+-----+--------------------+----+--------------------+-----+------+------------+------+--------------------+-------------------+
|         3.91|    

In [17]:
# List the tables visible to the current session and print them.
tables_df = spark.sql("SHOW TABLES")
tables_df.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default| business|      false|
|  default|   review|      false|
|  default|     user|      false|
+---------+---------+-----------+



In [18]:
# Stop the Spark session now that the table checks are done.
spark.stop()