# Diagnosis and DIA (Data Inspection Assistant)

If you cannot import this module, please try: pip install "polars_ds[plot]"

The dataset used for dependency detection can be found on GitHub.

The Plotly plots cannot be rendered on GitHub.

In [2]:
import polars as pl
import polars_ds as pds
from polars_ds.diagnosis import DIA

In [None]:
# Synthetic 1,000,000-row frame; each column is named after the pds random
# generator call that fills it.
random_columns = [
    pds.random(0.0, 12.0).alias("uniform_1"),
    pds.random(0.0, 1.0).alias("uniform_2"),
    pds.random_exp(0.5).alias("exp"),
    pds.random_normal(0.0, 1.0).alias("normal"),
    pds.random_normal(0.0, 1000.0).alias("fat_normal"),
]
# List column pairing uniform_2 with its complement (1 - uniform_2).
complement_pair = pl.concat_list("uniform_2", 1 - pl.col("uniform_2"))

df = (
    pds.frame(size=1_000_000)
    .select(*random_columns)
    .with_columns(complement_pair.alias("list_prob"))
)
df.head()

In [3]:
dia = DIA(df)

In [None]:
dia.special_values_report()

In [None]:
# Only shows for numerical columns
dia.numeric_profile(histogram=True)

In [None]:
# Don't compute histogram. Use Polars as output format instead of GT
dia.numeric_profile(histogram=False, gt=False) 

In [None]:
dia.infer_corr()

In [None]:
dia.infer_corr(method="kendall")

In [None]:
dia.meta()

In [None]:
# uniform_2 could be a probability score column (e.g. output of predict_proba, keeping only the values for class 1)
# list_prob could be a 2-class probability column (e.g. output of predict_proba)
dia.infer_prob()

# Dependency Detection, Null Distributions, Distribution Comparisons

Does knowing values in column A tell us values in column B?

In [None]:
df = pl.read_parquet("dependency.parquet")
df.head()

In [None]:
df.shape

In [13]:
dia = DIA(df)

In [None]:
dia.infer_dependency()

In [None]:
dia.plot_dependency()

In [None]:
# ID implies everything, of course, because ID is unique.
# So let's not plot it
dia.plot_dependency(subset=pl.all().exclude("ID"))

In [None]:
dia.str_stats()

In [None]:
dia.numeric_profile(iqr_multiplier=2)

In [None]:
import polars.selectors as cs

dia.corr(subset=["Monthly_Income", "Existing_EMI"])

In [None]:
dia.corr(subset=cs.all(), method="spearman")

In [None]:
dia.plot_corr(subset=["Monthly_Income", "Existing_EMI"], method="spearman")

In [None]:
dia.plot_corr(subset=["Monthly_Income", "Existing_EMI"], method="xi")

In [None]:
# Checks whether nulls in one feature occur at the same time as nulls in other features
dia.plot_null_distribution(cs.numeric())

In [None]:
# Checks whether nulls in one numeric feature occur together with nulls in the
# others, looking only at rows where Source_Category is 'B'
source_is_b = pl.col("Source_Category") == 'B'
dia.plot_null_distribution(cs.numeric(), condition=source_is_b)

In [None]:
# Bounds used to keep only rows whose EMI lies between its 0.01 and 0.99 quantiles
emi_low = pl.col("EMI").quantile(0.01)
emi_high = pl.col("EMI").quantile(0.99)

dia.plot_distribution(
    "EMI",
    by="Primary_Bank_Type",
    n_bins=100,
    density=False,
    condition=pl.col("EMI").is_between(emi_low, emi_high),
    # Additional Plotly kwargs can also be passed
    opacity=0.7,
)

In [None]:
# The feature to plot can be an expression: here the square root of EMI
emi_sqrt = pl.col("EMI").sqrt().alias("EMI_SQRT")
# Keep only rows whose (untransformed) EMI lies between its 0.01 and 0.99 quantiles
emi_in_range = pl.col("EMI").is_between(
    pl.col("EMI").quantile(0.01),
    pl.col("EMI").quantile(0.99),
)

dia.plot_distribution(
    emi_sqrt,
    by="Primary_Bank_Type",
    n_bins=100,
    density=False,
    condition=emi_in_range,
    # Additional Plotly kwargs can also be passed
    opacity=0.7,
)

In [None]:
# `by` can also be an expression: split on whether Loan_Amount exceeds 10,000
large_loan = pl.col("Loan_Amount") > 10_000
# Keep only rows whose EMI lies between its 0.01 and 0.99 quantiles
emi_in_range = pl.col("EMI").is_between(
    pl.col("EMI").quantile(0.01),
    pl.col("EMI").quantile(0.99),
)

dia.plot_distribution(
    pl.col("EMI"),
    by=large_loan,
    n_bins=100,
    density=False,
    condition=emi_in_range,
    # Additional Plotly kwargs can also be passed
    opacity=0.7,
)

# Classic Iris Dataset

In [1]:
import polars as pl
import polars_ds as pds
from polars_ds.diagnosis import DIA
# Only used to get dataset.
from sklearn import datasets


dataset = datasets.load_iris()

# Replace the integer class codes 0 / 1 / 2 with the species names.
species_name = (
    pl.when(pl.col("species") == 0).then(pl.lit('setosa'))
    .when(pl.col("species") == 1).then(pl.lit('versicolor'))
    .when(pl.col("species") == 2).then(pl.lit('virginica'))
)

# Feature matrix first, then the target codes, then the code -> name mapping.
features = pl.from_numpy(dataset.data, schema=dataset.feature_names)
df = features.with_columns(
    pl.Series(values=dataset.target).alias("species")
).with_columns(
    species_name.alias("species")
)
df.head()

sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
f64,f64,f64,f64,str
5.1,3.5,1.4,0.2,"""setosa"""
4.9,3.0,1.4,0.2,"""setosa"""
4.7,3.2,1.3,0.2,"""setosa"""
4.6,3.1,1.5,0.2,"""setosa"""
5.0,3.6,1.4,0.2,"""setosa"""


In [2]:
dia = DIA(df)
dia.plot_pca(pl.all().exclude("species"), by = "species")

In [4]:
# Just for fun, let's see how well sepal length can approximate petal length

petal_length = pl.col("petal length (cm)")
plot = dia.plot_lstsq(
    x="sepal length (cm)",
    target=petal_length,
    add_bias=True,
)
plot


In [None]:
# The plot is an Altair plot, so you can do a lot of cool things from here.
# For more details, visit Altair's official docs!
plot.interactive()

In [5]:
# Least-squares plot of petal length (target) against sepal length (x), passing
# `by = "species"` and a `filter_by` expression (petal length > 2).
# NOTE(review): the other plotting cells in this notebook pass `condition=`
# rather than `filter_by=`, and the output recorded for this cell is a
# TypeError. Confirm which keyword the installed polars_ds version expects.
dia.plot_lstsq(
    x = "sepal length (cm)", 
    target = pl.col("petal length (cm)"), 
    add_bias=True,
    filter_by = pl.col("petal length (cm)") > 2,
    by = "species"
)

TypeError: '>' not supported between instances of 'NoneType' and 'int'

In [None]:
# Run the same fit on each species' subset of the data.
# maintain_order=True yields the species in order of first appearance, so the
# charts come out in the same order on every run; a plain unique() makes no
# ordering guarantee.
for v in df["species"].unique(maintain_order=True):
    fig = dia.plot_lstsq(
        x="sepal length (cm)",
        target="petal length (cm)",
        add_bias=True,
        condition=pl.col("species") == v,
    )
    fig.show()