
# Exploratory Data Analysis


## import libraries and data

In [0]:
import pyspark
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [0]:
# Location of the competition tables.
CATALOG = "workspace"
SCHEMA = "safe_driver_prediction"

# Both splits are read as Spark DataFrames from the same schema.
train_df = spark.table(f"{CATALOG}.{SCHEMA}.train")
test_df = spark.table(f"{CATALOG}.{SCHEMA}.test")

# Report the size of each split; count() is evaluated by Spark.
print(
    f"train set has {train_df.count()} rows and {len(train_df.columns)} columns."
)
print(
    f"test set has {test_df.count()} rows and {len(test_df.columns)} columns."
)

In [0]:
# Preview at most 10 rows of the training table
# (`display()` is presumably the Databricks notebook renderer — confirm).
train_df.limit(10).display()



## check different feature types

In [0]:
# Convert both Spark DataFrames to pandas DataFrames for the pandas/seaborn
# code further down. toPandas() loads the full table into driver memory.
#
# The same names are rebound, so each conversion is guarded: once a name
# already holds a pandas DataFrame, calling toPandas() again would raise
# AttributeError. The guard makes this cell safe to re-run.
if not isinstance(train_df, pd.DataFrame):
    train_df = train_df.toPandas()
if not isinstance(test_df, pd.DataFrame):
    test_df = test_df.toPandas()

In [0]:
# Drop the leading "ps_" from every column name in both splits.
# The pattern is anchored with ^ so only a prefix is removed; an unanchored
# literal replace would also delete "ps_" found in the middle of a name
# (e.g. "ps_caps_x" would become "cax" instead of "caps_x").
train_df.columns = train_df.columns.str.replace(r'^ps_', '', regex=True)
test_df.columns = test_df.columns.str.replace(r'^ps_', '', regex=True)

In [0]:

columns = train_df.columns


def _report_matching(label, token):
    """Print ``label`` with the columns whose name contains ``token``, then a divider."""
    matching = [name for name in columns if token in name]
    print(label, matching)
    print("--------------------------------------")


# Groups identified by the "_cat" / "_bin" tags in the column name.
_report_matching("categorical features: \n", "_cat")
_report_matching("binary features: \n", "_bin")

# Extra divider separating the two kinds of grouping.
print("--------------------------------------")

# Groups identified by the "car_" / "ind_" / "reg_" / "calc_" tags.
_report_matching("vehicle features: \n", "car_")
_report_matching("indicator features: \n", "ind_")
_report_matching("regular features: \n", "reg_")
_report_matching("calculated features: \n", "calc_")


## plot distributions

In [0]:
def plot_counts_per_feature(dataset: pd.DataFrame, feature_name: str):
    """
    Show a bar chart of how often each distinct value of a column occurs.

    Args:
        dataset: table that contains the column to summarise.
        feature_name: name of the column in ``dataset`` whose values are counted.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    _, ax = plt.subplots(figsize=(6, 4))
    # hue is set to the counted column itself so the palette is applied per
    # bar; the legend is switched off.
    sns.countplot(
        data=dataset,
        x=feature_name,
        hue=feature_name,
        palette='pastel',
        legend=False,
        ax=ax,
    )
    ax.set_title(f'counts of unique values in "{feature_name}" column')
    ax.set_xlabel(feature_name)
    ax.set_ylabel('counts')
    plt.show()

In [0]:
# Value counts of the `target` column in the training data.
plot_counts_per_feature(train_df, "target")

In [0]:
# One example column per feature family; each is drawn in its own figure,
# in the order listed.
# NOTE(review): if reg_03 holds many distinct values, a count plot draws one
# bar per value — confirm whether a histogram would be more appropriate.
_example_features = (
    "ind_02_cat",   # a categorical indicator feature
    "car_01_cat",   # a categorical vehicle feature
    "ind_06_bin",   # a binary indicator feature
    "calc_15_bin",  # a binary calculated feature
    "reg_03",       # a regular feature
)

for _feature in _example_features:
    plot_counts_per_feature(dataset=train_df, feature_name=_feature)