In [1]:
pip install ydata-profiling[pyspark]

Note: you may need to restart the kernel to use updated packages.


In [2]:
from pyspark.sql import SparkSession
from ydata_profiling import ProfileReport

In [3]:
# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

spark = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark profiling example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

In [4]:
hdfs_path = "hdfs://hdfs-nn:9000/demo/bronze/Rotten_Tomatoes_Movies.csv"

In [5]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, DoubleType

schema= StructType([
    StructField("movie_title", StringType(),True),
    StructField("movie_info", StringType(),True),
    StructField("critics_consensus", StringType(),True),
    StructField("rating", StringType(),True),
    StructField("genre", StringType(),True),
    StructField("directors", StringType(),True),
    StructField("writers", StringType(),True),
    StructField("cast", StringType(),True),
    StructField("in_theaters_date",DateType(),True),
    StructField("on_streaming_date", DateType(),True),
    StructField("runtime_in_minutes", DoubleType(),True),
    StructField("studio_name", StringType(),True),
    StructField("tomatometer_status", StringType(),True),
    StructField("tomatometer_rating", DoubleType(),True),
    StructField("tomatometer_count", IntegerType(),True),
    StructField("audience_rating", DoubleType(),True),
    StructField("audience_count", IntegerType(),True),
    ])

# Read Sillicon valley episodes data to a dataframe
Rotten_Tomatoes_Movies_df = spark.read.csv(hdfs_path, header=True, schema=schema)

In [6]:
# Note that some profiling operations can resulte in errors due to bad loading options. 
# It is a good praticce start by inspect the schema and a data sample. 
Rotten_Tomatoes_Movies_df.printSchema()
Rotten_Tomatoes_Movies_df.show()

root
 |-- movie_title: string (nullable = true)
 |-- movie_info: string (nullable = true)
 |-- critics_consensus: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- directors: string (nullable = true)
 |-- writers: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- in_theaters_date: date (nullable = true)
 |-- on_streaming_date: date (nullable = true)
 |-- runtime_in_minutes: double (nullable = true)
 |-- studio_name: string (nullable = true)
 |-- tomatometer_status: string (nullable = true)
 |-- tomatometer_rating: double (nullable = true)
 |-- tomatometer_count: integer (nullable = true)
 |-- audience_rating: double (nullable = true)
 |-- audience_count: integer (nullable = true)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+-----------------+------------------+----------------

In [7]:
# In case of error select a subset of columns until you find the column that causes that.
#For start we can use describe as starting point for data profiling
#For this example the column summary was removed due to a conflit with the first describe column "summary"
Rotten_Tomatoes_Movies_df.describe(['movie_title','movie_info','critics_consensus','rating','genre','directors','writers','cast','in_theaters_date','on_streaming_date','runtime_in_minutes','studio_name','tomatometer_status','tomatometer_rating','tomatometer_count','audience_rating','audience_count']).toPandas()

Unnamed: 0,summary,movie_title,movie_info,critics_consensus,rating,genre,directors,writers,cast,runtime_in_minutes,studio_name,tomatometer_status,tomatometer_rating,tomatometer_count,audience_rating,audience_count
0,count,16638,16614,9944,16434,16433,16363,15452,16236,12416.0,16188,16515,12918.0,13277.0,13394.0,13765.0
1,mean,Infinity,,2011.0,1604.75,1492.25,1976.5,2001.0,1994.0,101.90608891752578,108.72519083969466,101.60266666666666,60.71450688961139,56.93936883332078,60.05696580558459,123085.34958227388
2,stddev,,,,738.035850529408,995.0749971735798,47.37615433949868,,28.284271247461902,26.11133339663312,105.90473097257838,19.846455881383285,29.3037329320665,64.8540451106354,24.265412407598976,1635168.5232283394
3,min,!Women Art Revolution,"""""""""""George Carlin was once asked how cocaine ...",""""" """"The Killing Fields"""" and """"Chariots of F...","""""""""The Wedding of the Painted Doll""""""""-- lat...","""""Big G"""" has been the object of ridicule sin...","""""1985"""" takes a unique look at a pivotal mom...","""""Baghead"""" was born","""""Do you ever just want to get out of this co...",1.0,"""""Concerto in F"""" and the 1935 folk opera Por...","""""A Puzzlement""""",0.0,0.0,0.0,0.0
4,max,√îritsu uch√ªgun Oneamisu no tsubasa (Royal Spac...,,üö´,both for Mankiewicz. Based on a novel by John ...,X-Men: Days of Future Past combines the best e...,√òtis Mass,√âva G√°rdos,"√âmile Genest, John Drainie, Tommy Tweed, Sandr...",2000.0,"√Åkos Ambrus, Imre Mad√°csi, Attila Pacsay",Zik Zak Filmworks,222.0,497.0,448.0,35797635.0


In [8]:
#Select the columns to profile. 
df_to_profile=Rotten_Tomatoes_Movies_df.select("movie_title","movie_info","critics_consensus","rating","genre","directors","writers","cast","in_theaters_date","on_streaming_date","runtime_in_minutes","studio_name","tomatometer_status","tomatometer_rating","tomatometer_count","audience_rating","audience_count")

In [9]:
import pandas as pd

df_pandas = Rotten_Tomatoes_Movies_df.toPandas()
df_pandas['in_theaters_date'] = pd.to_datetime(df_pandas['in_theaters_date'], errors='coerce')
df_pandas['on_streaming_date'] = pd.to_datetime(df_pandas['on_streaming_date'], errors='coerce')

report = ProfileReport(df_pandas, title="Rotten Tomatoes Movies Profiling", explorative=True)
report.to_file('profile.html')

report = ProfileReport(df_to_profile)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/17 [00:00<?, ?it/s][A
  6%|‚ñå         | 1/17 [00:04<01:13,  4.61s/it][A
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 17/17 [00:08<00:00,  1.95it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
#save profiling report in a file
report.to_file('profile.html')

In [None]:
#close spark session
spark.stop()