# EDA pandas_profile - TEMPLATE - DATE (USER)

docs @ https://pandas-profiling.ydata.ai/docs/master/index.html

### set working directory

In [None]:
# Google Colab only: mount Google Drive at /content/drive.
# NOTE(review): google.colab is presumably unavailable outside Colab — skip this cell when running locally.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Local Working Directory
import os
	
def current_path(label: str = "") -> None:
    """Print the current working directory, followed by a blank line.

    The heading used to be hard-coded as "... before", which was misleading
    whenever the helper ran after ``os.chdir``. The heading is now neutral by
    default, and the caller may tag it explicitly.

    Args:
        label: Optional word appended to the heading (for example
            ``"before"`` or ``"after"``) to mark when the snapshot is taken.
    """
    heading = "Current working directory"
    if label:
        heading = f"{heading} {label}"
    print(heading)
    print(os.getcwd())
    print()
	
# Switch to the project's data directory (CH MacBook Pro, work machine).
# The path is absolute and machine-specific; edit it before running elsewhere.
os.chdir('/Users/jonathan.cachat/Documents/CH Projects/2 - Influenza Lab Kits/data')

# Print the working directory that is in effect after the change above.
current_path()

#### Import Libraries & Authenticate

In [None]:
# Authenticate via gcloud CLI
! gcloud auth application-default login

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import google.auth
from google.cloud import bigquery
from pandas_profiling import ProfileReport

%load_ext autoreload
%autoreload 2
%matplotlib inline

#Enable BigQuery Magic
%load_ext google.cloud.bigquery

#### Run pandas_profiling from the command line on a standard CSV

In [None]:
#! pandas_profiling --title "Example Profiling Report" --config_file default.yaml sourceData.csv EDAreport.html
! pandas_profiling --title "Name of Report" --config_file default.yaml data.csv EDAreport.html

#### Import CSV Datasets

In [None]:
# Load the CSV into a DataFrame; the absolute path is machine-specific, so point it at your own file.
df = pd.read_csv('/Users/jonathan.cachat/Library/CloudStorage/OneDrive-CardinalHealth/JupyterNB-JC/data/btc.csv')
# Preview the first rows to confirm the load looks right.
df.head()

#### Import from BigQuery

In [None]:
%%bigquery dataFrame

#insert SQL statement here

## Generate pandas_profile EDA report

### Create pandas_profile report

In [None]:
# Build the profiling report for df; the html option requests the full-width page style.
dfProfile = ProfileReport(df, title="df EDA pandas_profiling Report", html={'style': {'full_width': True}})

In [None]:
# Display the EDA profile inline: either as an embedded HTML iframe or as widgets.

dfProfile.to_notebook_iframe() # render the HTML report in an iframe

#dfProfile.to_widgets() # alternative: widget view (collapsed in the notebook)

### Export EDA Profile to File

In [None]:
# Save the report to a standalone HTML file (absolute, machine-specific path).
# NOTE(review): the file name says FAN_BY_MARKET, but dfProfile was built from df and titled "df ..." — confirm the intended output name.
dfProfile.to_file(output_file="/Users/jonathan.cachat/Documents/CH Projects/2 - Influenza Lab Kits/EDA Profiles - Live Notebooks/FAN_BY_MARKET EDAprofile.html")

In [None]:
%%bigquery FAN_BY_STATE

SELECT * FROM `edna-datastg-pr-cah.D2_ML_CORP_DNA_FRANCIS_SALK_NP_EXPM.FAN_BY_STATE`

### Create pandas_profile report

In [None]:
# Build the profiling report for FAN_BY_STATE (the variable named in the %%bigquery cell); the html option requests the full-width page style.
FAN_BY_STATEprofile = ProfileReport(FAN_BY_STATE, title="FAN_BY_STATE EDA pandas_profiling Report", html={'style': {'full_width': True}})

In [None]:
# Display the EDA profile inline: either as an embedded HTML iframe or as widgets.

FAN_BY_STATEprofile.to_notebook_iframe() # render the HTML report in an iframe

#FAN_BY_STATEprofile.to_widgets() # alternative: widget view (collapsed in the notebook)

### Export EDA Profile to File

In [None]:
# Save the FAN_BY_STATE report to a standalone HTML file (absolute, machine-specific path).
FAN_BY_STATEprofile.to_file(output_file="/Users/jonathan.cachat/Documents/CH Projects/2 - Influenza Lab Kits/EDA Profiles - Live Notebooks/FAN_BY_STATE EDAprofile.html")

#### Messy Data Alerts

### Alerts Table Definitions - https://pandas-profiling.ydata.ai/docs/master/pages/getting_started/concepts.html

<table class="colwidths-given docutils align-default">
<colgroup>
<col style="width: 13%">
<col style="width: 88%">
</colgroup>
<thead>
<tr class="row-odd"><th class="head"><p><strong>Alert</strong></p></th>
<th class="head"><p><strong>Description</strong></p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">Constant</span></code></p></td>
<td><p>Column only contains one value</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">Zeros</span></code></p></td>
<td><p>Column only contains zeros</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">High</span> <span class="pre">Correlation</span></code></p></td>
<td><p>Correlations (either Spearman, Cramer, Pearson, Kendall, 𝜙k) are above the warning threshold (configurable).</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">High</span> <span class="pre">Cardinality</span></code></p></td>
<td><p>Whether the column has more than 50 distinct values. Threshold is configurable.</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">Skewness</span></code></p></td>
<td><p>Column’s univariate distribution presents skewness. Threshold value is configurable.</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">Missing</span> <span class="pre">Values</span></code></p></td>
<td><p>Column has missing values</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">Infinite</span> <span class="pre">Values</span></code></p></td>
<td><p>Column has infinite values (either <code class="docutils literal notranslate"><span class="pre">np.inf</span></code> or <code class="docutils literal notranslate"><span class="pre">-np.inf</span></code>)</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">Unique</span> <span class="pre">Values</span></code></p></td>
<td><p>All values of the column are unique (count of unique values equals column’s length)</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">Date</span></code></p></td>
<td><p>Column (likely/mostly) contains Date or Datetime records</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">Uniform</span></code></p></td>
<td><p>Column follows a uniform distribution (Chi-squared test score &gt; 0.999, threshold score is configurable)</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">Constant</span> <span class="pre">length</span></code></p></td>
<td><p>For strings/date/datetimes columns whose entries all have the same length</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">Rejected</span></code></p></td>
<td><p>Variable has mixed types or is constant (thus not suitable for meaningful analysis)</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">Unsupported</span></code></p></td>
<td><p>Column can’t be analysed (type is not supported, has mixed types, has <code class="docutils literal notranslate"><span class="pre">lists</span></code>/<code class="docutils literal notranslate"><span class="pre">dicts</span></code>/<code class="docutils literal notranslate"><span class="pre">tuples</span></code>, is 
empty, wrongly formatted)</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">Duplicates</span></code></p></td>
<td><p>Dataset-level warning signaling the presence of more than 10 duplicated records.</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">Empty</span></code></p></td>
<td><p>Dataset-level warning signaling there’s no data to be analysed.</p></td>
</tr>
</tbody>
</table>