In [None]:
import csv

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
from scipy.stats import gaussian_kde

Start with the example given in the SciPy docs for [scipy.stats.rv_histogram](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.rv_histogram.html).
Small tweak: build the histogram with 10 bins instead of 100 (the reference histogram drawn in the plot still uses 100 bins).

In [None]:
# Draw a reproducible sample of normal random variates ("rvs" in scipy's
# vocabulary): 100000 values with mean 0 and standard deviation 1.5.
normal_dist = scipy.stats.norm(loc=0, scale=1.5)
data = normal_dist.rvs(size=100000, random_state=123)
print(f"No of data items: {len(data)}")
data

In [None]:
# Bin the sample into 10 equal-width bins. np.histogram returns a
# (counts, bin_edges) pair; it is kept together as `hist` because the next
# cell passes the pair straight to rv_histogram.
counts, bin_edges = np.histogram(data, bins=10)
hist = (counts, bin_edges)
hist

In [None]:
# Turn the binned sample into a distribution object. density=False because
# `hist` was built by np.histogram without density=True, i.e. it holds counts.
hist_dist = scipy.stats.rv_histogram(hist, density=False)

# Spot-check the distribution at one point each for the PDF and the CDF.
pdf_at_1 = hist_dist.pdf(1.0)
cdf_at_2 = hist_dist.cdf(2.0)
print(f"pdf: {pdf_at_1}")
print(f"cdf: {cdf_at_2}")

In [None]:
# Compare the 10-bin rv_histogram distribution with the raw sample.
x = np.linspace(-5.0, 5.0, 100)
fig, ax = plt.subplots()
ax.set_title("PDF from Template")
# Reference histogram of the raw sample. It uses 100 bins, so it is finer
# than the 10-bin histogram that hist_dist was built from.
ax.hist(data, density=True, bins=100, label="Sample (100 bins)")
ax.plot(x, hist_dist.pdf(x), label="PDF")
ax.plot(x, hist_dist.cdf(x), label="CDF")
ax.set_xlabel("x")
ax.set_ylabel("Density / cumulative probability")
ax.legend()
# Use plt.show instead of fig.show
plt.show()

Now try our own data, and include smoothing.

In [None]:
# Input CSVs, given relative to the notebook's working directory.
model1_no_fraud_path = "data/CountModel1ScoresNoFraud.csv"
model1_fraud_path = "data/CountModel1ScoresFraud.csv"

# Load the no-fraud file as little-endian 32-bit integers, skipping one header
# line (skip_header is a line count, so pass an int rather than a bool).
data = np.genfromtxt(
    fname=model1_no_fraud_path, delimiter=",", skip_header=1, dtype="<i4"
)
# Later cells index data[:, 0] (x positions) and data[:, 1] (KDE weights),
# so fail here with a clear message if the file does not have that shape.
assert data.ndim == 2 and data.shape[1] >= 2, "expected a 2-D table with at least two columns"

In [None]:
# Try smoothing
# https://stackoverflow.com/questions/60583088/generate-probability-distribution-or-smoothing-plot-from-points-containing-proba/60586523#60586523
x = np.linspace(0, 1000, 500)
fig, ax = plt.subplots()
ax.set_title("Raw values vs. weighted Gaussian KDE")

# Left-hand axis: the raw (column 0, column 1) pairs as a dotted step line.
(raw_line,) = ax.step(
    data[:, 0],
    data[:, 1],
    color="dodgerblue",
    lw=1,
    ls=":",
    where="pre",
    label="Raw data",
)
ax.set_xlabel("Column 0 value")
ax.set_ylabel("Column 1 value")

# Right-hand axis: the KDE gets its own y-scale, independent of the raw values.
ax2 = ax.twinx()
# Column 1 is used as the per-point weight.
# NOTE(review): the -50 shift presumably moves each x value from a bin edge to
# a bin midpoint -- confirm against how the CSV was produced.
kde = gaussian_kde(data[:, 0] - 50, bw_method=0.25, weights=data[:, 1])
(kde_line,) = ax2.plot(x, kde(x), color="crimson", label="Gaussian KDE")
ax2.set_ylabel("Estimated density")

# One legend covering the lines drawn on both axes.
ax.legend(handles=[raw_line, kde_line])
plt.show()

In [None]:
# Rebuild the KDE in this cell so that it depends only on `data`, not on the
# previous plotting cell having been run. Column 1 supplies the weights.
# NOTE(review): the -50 shift presumably moves each x value from a bin edge to
# a bin midpoint -- confirm against how the CSV was produced.
kde = gaussian_kde(data[:, 0] - 50, bw_method=0.25, weights=data[:, 1])

x = np.linspace(-50, 1000, 500)
fig, ax = plt.subplots()

ax.set_title("Probability Density")
ax.set_xlabel("Column 0 value - 50")
ax.set_ylabel("Density")

ax.plot(x, kde.pdf(x), label="PDF")
ax.legend()
# Use plt.show instead of fig.show
plt.show()

Do the same with the fraud dataset.

In [None]:
# Load the fraud file with the same settings as the no-fraud file: one header
# line skipped (skip_header is a line count, so pass an int rather than a bool).
# NOTE: this rebinds `data`; to reproduce the no-fraud plots, re-run the
# no-fraud loading cell first.
data = np.genfromtxt(
    fname=model1_fraud_path, delimiter=",", skip_header=1, dtype="<i4"
)
# Later cells index data[:, 0] (x positions) and data[:, 1] (KDE weights),
# so fail here with a clear message if the file does not have that shape.
assert data.ndim == 2 and data.shape[1] >= 2, "expected a 2-D table with at least two columns"

In [None]:
# Play with bw_method value for closer fit
x = np.linspace(0, 1000, 500)
fig, ax = plt.subplots()
ax.set_title("Raw values vs. weighted Gaussian KDE")

# Left-hand axis: the raw (column 0, column 1) pairs as a dotted step line.
(raw_line,) = ax.step(
    data[:, 0],
    data[:, 1],
    color="dodgerblue",
    lw=1,
    ls=":",
    where="pre",
    label="Raw data",
)
ax.set_xlabel("Column 0 value")
ax.set_ylabel("Column 1 value")

# Right-hand axis: the KDE gets its own y-scale, independent of the raw values.
ax2 = ax.twinx()
# Column 1 is used as the per-point weight.
# NOTE(review): the -50 shift presumably moves each x value from a bin edge to
# a bin midpoint -- confirm against how the CSV was produced.
kde = gaussian_kde(data[:, 0] - 50, bw_method=0.18, weights=data[:, 1])
(kde_line,) = ax2.plot(x, kde(x), color="crimson", label="Gaussian KDE")
ax2.set_ylabel("Estimated density")

# One legend covering the lines drawn on both axes.
ax.legend(handles=[raw_line, kde_line])
plt.show()

In [None]:
# Rebuild the KDE in this cell so that it depends only on `data`, not on the
# previous plotting cell having been run. Column 1 supplies the weights.
# NOTE(review): the -50 shift presumably moves each x value from a bin edge to
# a bin midpoint -- confirm against how the CSV was produced.
kde = gaussian_kde(data[:, 0] - 50, bw_method=0.18, weights=data[:, 1])

x = np.linspace(-50, 1000, 500)
fig, ax = plt.subplots()

ax.set_title("Probability Density")
ax.set_xlabel("Column 0 value - 50")
ax.set_ylabel("Density")

ax.plot(x, kde.pdf(x), label="PDF")
ax.legend()
# Use plt.show instead of fig.show
plt.show()