In [1]:
pip install ydata-profiling[pyspark]

Note: you may need to restart the kernel to use updated packages.


In [2]:
from pyspark.sql import SparkSession
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
from ydata_profiling import ProfileReport

In [3]:
# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

spark = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark profiling example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

In [4]:
hdfs_path = "hdfs://hdfs-nn:9000/demo/bronze/critic_reviews.csv"

In [7]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField("Show", StringType(), True),
    StructField("Sentiment", IntegerType(), True),
    StructField("Review", StringType(), True),
])

critic_reviews_df = spark.read.csv(
    hdfs_path,
    header=True,
    schema=schema
)

In [8]:
critic_reviews_df.printSchema()
critic_reviews_df.show()

root
 |-- Show: string (nullable = true)
 |-- Sentiment: integer (nullable = true)
 |-- Review: string (nullable = true)

+----------------+---------+--------------------+
|            Show|Sentiment|              Review|
+----------------+---------+--------------------+
|Mare of Easttown|        1|I’m not sure I ha...|
|Mare of Easttown|        1|Kate is marvelous...|
|Mare of Easttown|        1|That’s the thing ...|
|Mare of Easttown|        1|Boasting Kate Win...|
|Mare of Easttown|        1|Performances in w...|
|Mare of Easttown|        1|Kate Winslet deli...|
|Mare of Easttown|        1|Exploring the dar...|
|Mare of Easttown|        1|Full of drama tra...|
|Mare of Easttown|        1|Anyone with a tas...|
|Mare of Easttown|        1|Mare of Easttown ...|
|Mare of Easttown|        1|Ingelsby's unflin...|
|Mare of Easttown|        1|All of this rich ...|
|Mare of Easttown|        1|"While the pacing...|
|Mare of Easttown|        1|What makes it wor...|
|Mare of Easttown|        1|

In [9]:
critic_reviews_df.describe(['Show','Sentiment','Review']).toPandas()

Unnamed: 0,summary,Show,Sentiment,Review
0,count,14791,14791.0,14790
1,mean,588.7594936708861,0.8454465553377054,
2,stddev,860.4101171070998,0.3614906806604114,
3,min,11.22.63,0.0,"""""""1883"""" hits all the right notes as an absor..."
4,max,iCarly,1.0,…the appeal may be limited to the many who kno...


In [10]:
#Select the columns to profile. 
df_to_profile=critic_reviews_df.select("Show","Sentiment","Review")

In [11]:
pdf = df_to_profile.toPandas()
report = ProfileReport(pdf)


In [12]:
#save profiling report in a notebook
report.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/3 [00:00<?, ?it/s][A
100%|██████████| 3/3 [00:00<00:00,  5.20it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
#save profiling report in a file
report.to_file('critic_reviews-data_profiling.html')

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
#close spark session
spark.stop()