# Step 00: EDA

In [None]:
import pyspark
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import sklearn
# `display` and `HTML` are imported from the public IPython.display module.
from IPython.display import display, HTML
import scipy.stats

# pyspark
from pyspark.sql import SparkSession, DataFrame as SparkDataFrame
# `F` is the namespaced handle used throughout the notebook; the bare names
# below are also imported for convenience.
import pyspark.sql.functions as F
from pyspark.sql.functions import isnan, when, count, col, to_date
from pyspark.sql.types import *
from pyspark.context import SparkContext
from pyspark.sql.window import Window
from pyspark.ml.feature import QuantileDiscretizer, VectorAssembler
import spark_df_profiling

# scipy and statsmodels
from scipy import stats
from scipy.stats import friedmanchisquare, kruskal, wilcoxon, ks_2samp, chi2_contingency, chi2, norm
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from statsmodels.iolib.smpickle import load_pickle

# scikit-learn
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
# sklearn.metrics has no `roc_auc_curve`; the two real ROC helpers are imported instead.
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import learning_curve
from sklearn.feature_selection import SelectFromModel

# light gbm
import lightgbm
from lightgbm import LGBMClassifier

# xgboost
import xgboost
from xgboost import XGBClassifier

# H2O
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# print versions
print('sklearn:{}'.format(sklearn.__version__))
print('pandas:{}'.format(pd.__version__))

## Helper functions

In [None]:
def get_missing_rate(path, sdf_input, label, sampling):
    """Write per-column missing-value statistics for a sampled Spark DataFrame.

    A fraction of ``sdf_input`` is sampled without replacement (fixed seed
    432), converted to pandas, and summarised column by column. The summary is
    sorted by missing rate, highest first, and saved as CSV.

    Args:
        path: Output location prefix. It is concatenated directly with the
            file name, so it must already end with a path separator if it
            names a directory.
        sdf_input: Spark DataFrame to profile.
        label: Prefix of the output file name
            (``<label>_stats_missing.csv``).
        sampling: Fraction of rows to sample, passed to ``sample``.

    Returns:
        None. The summary is written to disk and a confirmation is printed.
    """
    sampled = sdf_input.sample(withReplacement=False, fraction=sampling, seed=432).toPandas()
    n_rows = len(sampled)

    rows = []
    for name in sdf_input.columns:
        series = sampled[name]
        # Count nulls once and reuse the value for the rate and the flag.
        n_missing = series.isnull().sum()
        # An empty sample has no defined rate; NaN avoids a division by zero.
        missing_rate = n_missing / n_rows if n_rows else np.nan

        rows.append({
            'var': name,
            'tot_count': series.count(),  # non-null values only
            # NOTE(review): this stores the array of distinct values, not
            # their count - confirm that is what the report should contain.
            'unique_val': series.unique(),
            'num_missing_rows': n_missing,
            'missing_rate': missing_rate,
            # 1 when more than 95% of the sampled rows are missing (a NaN
            # rate is also flagged, as the comparison is then False).
            'high_missing_rate': 0 if missing_rate <= 0.95 else 1,
        })

    data_qa = pd.DataFrame(rows)
    data_qa.sort_values(by=['missing_rate'], ascending=False, inplace=True)
    data_qa.to_csv(path + label + '_stats_missing.csv')
    print('CSV file saved')

## Read data

In [None]:
# Read the Hive table, excluding rows where week_n equals 17.
# NOTE(review): `spark` is not created in the visible cells - presumably the
# notebook environment provides the session; confirm.
phys = spark.sql(""" select * from table """)
phys = phys.filter(F.col('week_n') != 17)
# The view is registered before the fill/drop steps below, so queries against
# 'phys' presumably still see nulls and `call_within30` - confirm if intended.
phys.createOrReplaceTempView('phys')

# print stats
print("Number of obs:", phys.count())
print("Number of cols:", len(phys.columns))

# Per-week positives, row count and target rate for `call_within30`.
# Parentheses replace the backslash continuation, which breaks on trailing
# whitespace.
stats = (
    phys.groupBy(['week_n'])
    .agg(
        F.sum(F.col('call_within30')).alias('tot positives'),
        F.count('*').alias('tot rows'),
        (F.sum(F.col('call_within30')) / F.count('*')).alias('target rate'),
    )
    .orderBy('week_n')
)

df = stats.toPandas()

# fill missing values with 0
phys = phys.na.fill(value=0)

# drop original target (the weekly summary above was computed before this)
phys = phys.drop(*['call_within30'])
df

## Weekly series

In [None]:
# Columns whose weekly averages are shown in the next cell.
feat = [
    'clm_num',
    'clm_count',
]

In [None]:
# Show the per-week average of every column listed in `feat`.
for feature_name in feat:
    weekly_mean = F.avg(F.col(feature_name))
    stats = phys.groupBy(['week_n']).agg(weekly_mean).orderBy('week_n')
    stats.show(20)

## find missing rate

In [None]:
# Missing-value report computed on a 10% sample of `phys`.
# NOTE(review): `perspath` is not defined in the visible cells, while the
# other cells write to `path` - confirm which variable is intended.
get_missing_rate(path=perspath, sdf_input=phys, label='data', sampling=0.10)

## numeric and categorical variables

In [None]:
# Pull a 5% sample of `phys` (fixed seed) into pandas for local EDA.
sampled_phys = phys.sample(withReplacement=False, fraction=0.05, seed=432)
df = sampled_phys.toPandas()
print(df.shape)

In [None]:
# Cast the indicator to object dtype so the object-dtype selection further
# down picks it up as categorical.
# NOTE(review): `hoosp_ind` may be a typo for `hosp_ind` - verify against the
# table schema.
hoosp_values = df['hoosp_ind']
df['hoosp_ind'] = hoosp_values.astype(object)
df.head()

In [None]:
# ---------------------------
# numeric: 32/64-bit int and float columns, minus `call_ind`
# ---------------------------
numeric_dtypes = ['int32', 'int64', 'float32', 'float64']
numeric_frame = df.select_dtypes(include=numeric_dtypes)
num_vars = list(numeric_frame.columns)
# NOTE(review): `call_ind` is presumably the target - confirm. `remove`
# raises ValueError if the column is not among the numeric ones.
num_vars.remove('call_ind')
print('There are', len(num_vars), 'numeric features in the list')
print(num_vars)
print()

# -------------------
# categorical: object-dtype columns
# -------------------
object_frame = df.select_dtypes(include=['object'])
cat_vars = list(object_frame.columns)

# -----------------
# full list: every column of the sample
# ------------------
full_lst = list(df.columns)
print("there are", len(full_lst), "total feat in the list")

# ---------------------
# all cols kept for analysis: numeric first, then categorical
# ----------------------
final_vars = [*num_vars, *cat_vars]
print("there are", len(final_vars))

## A. Numeric EDA

In [None]:
# Descriptive statistics of every numeric feature per week, saved to CSV.
weekly_groups = df.groupby(['week_n'])
stats = weekly_groups[num_vars].describe().reset_index()
stats.to_csv(path + 'num_eda.csv')
stats.head()

In [None]:
# One distribution plot per numeric feature.
# The loop variable is not named `col`: at notebook top level that would
# rebind the `col` function imported from pyspark.sql.functions to a string.
# NOTE(review): `distplot` is deprecated in recent seaborn releases - consider
# `histplot`/`displot` when the seaborn version allows it.
for num_col in num_vars:
    plt.figure(figsize=(8, 6))
    sns.distplot(df[num_col])
    plt.show()

## B. Categorical EDA

In [None]:
# Cross-tabulate each categorical feature against the `target` column,
# with row/column totals.
# The loop variable is not named `col`: at notebook top level that would
# rebind the `col` function imported from pyspark.sql.functions to a string.
# NOTE(review): no `target` column is created in the visible cells (an earlier
# cell refers to `call_ind`) - confirm the column name.
for cat_col in cat_vars:
    print("variable", cat_col)
    print(pd.crosstab(df[cat_col], df['target'], margins=True))
    print()

In [None]:
# Row counts per category on the full Spark DataFrame, most frequent first.
# The loop variable is not named `col`: at notebook top level that would
# rebind the `col` function imported from pyspark.sql.functions to a string.
for cat_col in cat_vars:
    stats = (
        phys.groupBy(cat_col)
        .agg(F.count('*').alias('tot rows'))
        .orderBy(F.col('tot rows').desc())
    )
    stats.show(20)

## C. Spark profiling

In [None]:
# Build a profiling report for the full Spark DataFrame.
report = spark_df_profiling.ProfileReport(phys)

# Export the report as an HTML file under `path`.
profile_html = path + 'data_sdf_profiling.html'
report.to_file(outputfile=profile_html)