# pandas_profiling EDA Report Generator (v2023)

### Phase 1 - Load a single parquet file into a pandas profile

In [None]:
# Local Working Directory
import os
	
def current_path(label="before"):
    """Print the current working directory under a labelled heading.

    Args:
        label: Word appended to the heading so that output printed before
            and after a directory change can be told apart. Defaults to
            "before", which reproduces the original heading text.
    """
    print(f"Current working directory {label}")
    print(os.getcwd())
    print()


# Working directory for this notebook (e.g. CH MacBook Pro (Work)).
# TODO: fill in the project path. While it is empty the directory is left
# unchanged, because os.chdir('') raises FileNotFoundError.
WORK_DIR = ''
if WORK_DIR:
    os.chdir(WORK_DIR)

# Printing CWD after
current_path("after")

In [None]:
# imports & notebook features.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
import pyarrow as pa
from pandas_profiling import ProfileReport

# IPython magics: load the autoreload extension and set it to mode 2, so
# imported modules are reloaded before each cell execution.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Set pandas-profiling configuration YAML
### located @ ~/anaconda3/envs/data-env/lib/python3.9/site-packages/pandas_profiling

# Alternative configurations:
# --config_file default.yaml
# --config_file minimal.yaml

# Selected configuration for this notebook. The original cell held the bare
# CLI flag `--config_file config_spruce_profile.yaml`, which is not valid
# Python and stops the cell with a SyntaxError. Keeping the file name in a
# variable makes the cell runnable; pass it as `--config_file` on the CLI or
# as `config_file=CONFIG_FILE` to ProfileReport.
CONFIG_FILE = "config_spruce_profile.yaml"

In [None]:
## run pandas profiling on standard CSV via CLI
# Shell escape: invokes the pandas_profiling CLI on data.csv with
# default.yaml, naming EDAreport.html as the output file.
! pandas_profiling --title "Name of Report" --config_file default.yaml data.csv EDAreport.html

# TODO: fill in the CSV path; it is left empty in this template.
df = pd.read_csv('')
# Preview the first rows of the loaded frame.
df.head()

# Prepare Dataframe

In [None]:
## prepare dataframe from Parquet Files
### pyarrow
#table = pq.read_table('example.parquet')

## pandas
# TODO: fill in the parquet file path; it is left empty in this template.
sprucedata = pd.read_parquet('')

# Preview the first rows of the loaded frame.
sprucedata.head()

# Create pandas_profile report

In [None]:
# Build the profile report for the `sprucedata` DataFrame; the `html` option
# sets the report style to full width.
spruceProfile = ProfileReport(sprucedata, title="EDA Profile Report", html={'style': {'full_width': True}})

# Show EDA profile in iframe or widget
spruceProfile.to_notebook_iframe() #HTML
# Alternative: spruceProfile.to_widgets()  # to widgets (collapsed in NB)

### export profile to file

In [None]:
# Save/output the report to a file.
# The report object created earlier in this notebook is `spruceProfile`; the
# original call used the undefined name `dfProfile`, which raises NameError.
# TODO: set output_file to the destination path (e.g. "EDAreport.html").
spruceProfile.to_file(output_file="")


## Phase 2 - Stack all parquet files into single table, run pandas profiling

# Interpret Results

### Alerts Table Definitions - https://pandas-profiling.ydata.ai/docs/master/pages/getting_started/concepts.html

<table class="colwidths-given docutils align-default">
<colgroup>
<col style="width: 13%">
<col style="width: 88%">
</colgroup>
<thead>
<tr class="row-odd"><th class="head"><p><strong>Alert</strong></p></th>
<th class="head"><p><strong>Description</strong></p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">Constant</span></code></p></td>
<td><p>Column only contains one value</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">Zeros</span></code></p></td>
<td><p>Column only contains zeros</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">High</span> <span class="pre">Correlation</span></code></p></td>
<td><p>Correlations (either Spearman, Cramer, Pearson, Kendall, 𝜙k) are above the warning threshold (configurable).</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">High</span> <span class="pre">Cardinality</span></code></p></td>
<td><p>Whether the column has more than 50 distinct values. Threshold is configurable.</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">Skewness</span></code></p></td>
<td><p>Column’s univariate distribution presents skewness. Threshold value is configurable.</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">Missing</span> <span class="pre">Values</span></code></p></td>
<td><p>Column has missing values</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">Infinite</span> <span class="pre">Values</span></code></p></td>
<td><p>Column has infinite values (either <code class="docutils literal notranslate"><span class="pre">np.inf</span></code> or <code class="docutils literal notranslate"><span class="pre">-np.inf</span></code>)</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">Unique</span> <span class="pre">Values</span></code></p></td>
<td><p>All values of the column are unique (count of unique values equals column’s length)</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">Date</span></code></p></td>
<td><p>Column (likely/mostly) contains Date or Datetime records</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">Uniform</span></code></p></td>
<td><p>Column follows a uniform distribution (Chi-squared test score &gt; 0.999, threshold score is configurable)</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">Constant</span> <span class="pre">length</span></code></p></td>
<td><p>For strings/date/datetimes columns whose entries all have the same length</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">Rejected</span></code></p></td>
<td><p>Variable has mixed types or is constant (thus not suitable for meaningful analysis)</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">Unsupported</span></code></p></td>
<td><p>Column can’t be analysed (type is not supported, has mixed types, has <code class="docutils literal notranslate"><span class="pre">lists</span></code>/<code class="docutils literal notranslate"><span class="pre">dicts</span></code>/<code class="docutils literal notranslate"><span class="pre">tuples</span></code>, is 
empty, wrongly formatted)</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">Duplicates</span></code></p></td>
<td><p>Dataset-level warning signaling the presence of more than 10 duplicated records.</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">Empty</span></code></p></td>
<td><p>Dataset-level warning signaling there’s no data to be analysed.</p></td>
</tr>
</tbody>
</table>