In [1]:
!python -V

Python 3.12.1


In [2]:
!pip install -r requirements.txt



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from scipy.stats import entropy, ks_2samp

def js_divergence(p, q):
    m = 0.5 * (p + q)
    return 0.5 * entropy(p, m) + 0.5 * entropy(q, m)

np.random.seed(42)
dist1 = np.random.normal(0, 1, 1000)  # Mean=0, Std=1
dist2 = np.random.normal(1, 1.5, 1000)  # Mean=1, Std=1.5

# dummy dataframe
data = pd.DataFrame({"dist1": dist1, "dist2": dist2})

histogram_1, edges1 = np.histogram(data["dist1"], bins=50, density=True)
histogram_2, edges2 = np.histogram(data["dist2"], bins=50, density=True)

# Normalize histograms
bin_centers1 = 0.5 * (edges1[:-1] + edges1[1:])
bin_centers2 = 0.5 * (edges2[:-1] + edges2[1:])

kl_divergence = entropy(histogram_1, histogram_2)


ks_divergence, _ = ks_2samp(data["dist1"], data["dist2"])

js_divergence = js_divergence(histogram_1, histogram_2)

# Print divergence values
print(f"KL Divergence: {kl_divergence}")
print(f"KS Divergence: {ks_divergence}")
print(f"JS Divergence: {js_divergence}")

# Create a PDF and save plots
with PdfPages("divergence_plots.pdf") as pdf:
    plt.figure(figsize=(12, 8))

    plt.subplot(2, 2, 1)
    plt.hist(data["dist1"], bins=50, density=True, alpha=0.7, label='Dist1', color='blue')
    plt.hist(data["dist2"], bins=50, density=True, alpha=0.7, label='Dist2', color='orange')
    plt.title('Histogram of Distributions')
    plt.legend()

    cdf1 = np.cumsum(histogram_1) / np.sum(histogram_1)
    cdf2 = np.cumsum(histogram_2) / np.sum(histogram_2)
    plt.subplot(2, 2, 2)
    plt.plot(bin_centers1, cdf1, label='CDF Dist1', color='blue')
    plt.plot(bin_centers2, cdf2, label='CDF Dist2', color='orange')
    plt.title('Cumulative Distribution Functions (CDF)')
    plt.legend()

    plt.subplot(2, 2, 3)
    kl_contributions = histogram_1 * np.log(histogram_1 / histogram_2)
    plt.bar(bin_centers1, kl_contributions, width=0.1, color='green', alpha=0.7)
    plt.title('KL Divergence Contribution')
    plt.axhline(0, color='black', linestyle='--', linewidth=0.5)

    plt.subplot(2, 2, 4)
    js_contributions_p = histogram_1 * np.log(histogram_1 / (0.5 * (histogram_1 + histogram_2)))
    js_contributions_q = histogram_2 * np.log(histogram_2 / (0.5 * (histogram_1 + histogram_2)))
    plt.bar(bin_centers1, js_contributions_p, width=0.1, alpha=0.7, label='P log(P/M)', color='blue')
    plt.bar(bin_centers2, js_contributions_q, width=0.1, alpha=0.7, label='Q log(Q/M)', color='orange')
    plt.title('JS Divergence Contribution')
    plt.axhline(0, color='black', linestyle='--', linewidth=0.5)
    plt.legend()

    plt.tight_layout()
    pdf.savefig()
    plt.close()


KL Divergence: inf
KS Divergence: 0.387
JS Divergence: 0.021436856521886302


  kl_contributions = histogram_1 * np.log(histogram_1 / histogram_2)
  kl_contributions = histogram_1 * np.log(histogram_1 / histogram_2)
  kl_contributions = histogram_1 * np.log(histogram_1 / histogram_2)
  kl_contributions = histogram_1 * np.log(histogram_1 / histogram_2)
  js_contributions_p = histogram_1 * np.log(histogram_1 / (0.5 * (histogram_1 + histogram_2)))
  js_contributions_p = histogram_1 * np.log(histogram_1 / (0.5 * (histogram_1 + histogram_2)))
  js_contributions_p = histogram_1 * np.log(histogram_1 / (0.5 * (histogram_1 + histogram_2)))
  js_contributions_q = histogram_2 * np.log(histogram_2 / (0.5 * (histogram_1 + histogram_2)))
  js_contributions_q = histogram_2 * np.log(histogram_2 / (0.5 * (histogram_1 + histogram_2)))
  js_contributions_q = histogram_2 * np.log(histogram_2 / (0.5 * (histogram_1 + histogram_2)))


# fin.