In [1]:
import pandas as pd
import numpy as np
from bokeh.io import output_notebook, export_png

In [2]:
# Configure Bokeh to render figures inline in the notebook; must run before any show() call.
output_notebook()

In [3]:
from sklearn.neighbors import KernelDensity
from bokeh.plotting import figure, show

In [4]:
# Read the Titanic passenger table and discard the columns this analysis never touches.
file = "../data/csv_files/titanic_all.csv"
titanic = pd.read_csv(file).drop(columns=["name", "class", "survived"])

# Partition the passengers by the value stored in the "sex" column.
female = titanic.loc[titanic["sex"] == "female"]
male = titanic.loc[titanic["sex"] == "male"]

In [5]:
# Age columns with missing entries removed: per sex first, then all passengers.
female_age = female["age"].dropna()
male_age = male["age"].dropna()
total_age = titanic["age"].dropna()

In [6]:
# Plain NumPy arrays of the ages, the form the KDE code in later cells indexes into.
t_values = total_age.to_numpy()
m_values = male_age.to_numpy()
f_values = female_age.to_numpy()

# Evaluation grid: 50 evenly spaced ages from -10 to 80 (50 is np.linspace's default count).
positions = np.linspace(-10, 80, num=50)

In [28]:
from bokeh.plotting import figure
from sklearn.neighbors import KernelDensity
import numpy as np


def plot_kde(
    data_colors,
    legend_labels,
    line_color=None,
    kernel="gaussian",
    bandwidth=2,
    title="Figure 7.8",
    n_points=1000,
):
    """Draw overlaid, count-scaled kernel density estimates as filled patches.

    Each density is multiplied by the number of observations in its series, so
    the filled areas of the series can be compared directly with one another.

    Parameters
    ----------
    data_colors : iterable of (array-like, str)
        One ``(values, fill_color)`` pair per series. ``values`` is a
        one-dimensional collection of numbers; it is coerced to a float array.
    legend_labels : iterable of str
        One legend entry per series, in the same order as ``data_colors``.
    line_color : str or None, optional
        Outline color of every patch; ``None`` draws no outline.
    kernel : str, optional
        Kernel name forwarded to ``KernelDensity``.
    bandwidth : float, optional
        Kernel bandwidth forwarded to ``KernelDensity``.
    title : str, optional
        Figure title. Defaults to ``"Figure 7.8"``.
    n_points : int, optional
        Number of grid points, between -10 and 80, at which each density is
        evaluated. Higher values give smoother outlines.

    Returns
    -------
    The Bokeh figure holding one patch per series. It is not shown here;
    pass it to ``show`` or embed it in a layout.

    Raises
    ------
    ValueError
        If ``data_colors`` and ``legend_labels`` differ in length.
    """
    data_colors = list(data_colors)
    legend_labels = list(legend_labels)
    if len(data_colors) != len(legend_labels):
        # zip() would otherwise silently drop the unmatched series or labels.
        raise ValueError(
            "data_colors and legend_labels must have the same length "
            f"(got {len(data_colors)} and {len(legend_labels)})"
        )

    # The grid starts below zero so the left tail of each density is evaluated
    # before the visible x-range (which starts at 0) begins.
    positions = np.linspace(-10, 80, n_points)
    grid = positions[:, np.newaxis]

    p = figure(
        title=title,
        height=300,
        width=500,
        toolbar_location=None,
        x_axis_label="age (years)",
        y_axis_label="scaled density",
    )

    for (data, color), legend_label in zip(data_colors, legend_labels):
        # KernelDensity expects a 2-D (n_samples, n_features) array.
        samples = np.asarray(data, dtype=float).reshape(-1, 1)
        kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(samples)
        # score_samples returns log-densities; exponentiate to get densities.
        density = np.exp(kde.score_samples(grid))
        scaled_density = density * len(samples)

        p.patch(
            positions,
            scaled_density,
            fill_alpha=0.9,
            fill_color=color,
            line_color=line_color,
            legend_label=legend_label,
        )

    # x-axis: start at age 0, fixed ticks, no grid lines or axis line.
    p.x_range.start = 0
    p.xaxis.ticker = [0, 20, 40, 60]
    p.xgrid.grid_line_color = None
    p.xaxis.axis_line_color = None
    p.xaxis.major_tick_line_color = "gray"
    p.xaxis.major_tick_out = 2

    # y-axis: anchored at 0 with the axis line and all tick marks hidden.
    p.y_range.start = 0
    p.yaxis.minor_tick_out = 0
    p.yaxis.axis_line_color = None
    p.yaxis.major_tick_line_color = "gray"
    p.yaxis.major_tick_out = 0
    p.yaxis.major_tick_in = 0

    p.legend.title = "gender"
    p.legend.location = "top_right"

    return p

In [29]:
# Overlay the male and female age distributions in one count-scaled KDE chart.
legend_labels = ["male", "female"]
data_colors = list(zip([m_values, f_values], ["#5BA4DB", "#D0771E"]))

a = plot_kde(data_colors, legend_labels)
show(a)

In [9]:
from bokeh.layouts import gridplot

# Two side-by-side panels: each sex drawn on top of the all-passenger distribution.
# The gray all-passenger series is listed first so it is drawn underneath.
b_data = [(t_values, "#D5D4D3"), (m_values, "#5BA4DB")]
b_legend = ["all passengers", "males"]

c_data = [(t_values, "#D5D4D3"), (f_values, "#D0771E")]
c_legend = ["all passengers", "females"]

# plot_kde needs the legend labels as its second argument; build one panel per sex.
b = plot_kde(b_data, b_legend)
c = plot_kde(c_data, c_legend)

plots = [b, c]
layout = gridplot(plots, ncols=2)
show(layout)

In [10]:
# Relative path to the cows dataset.
file = "../data/csv_files/cows.csv"

# Load the CSV into a DataFrame; later cells read its "breed" and "butterfat" columns.
df = pd.read_csv(file)

In [11]:
# One sub-frame per cattle breed, selected on the exact label in the "breed" column.
ayrshire = df.loc[df["breed"] == "Ayrshire"]
guernsey = df.loc[df["breed"] == "Guernsey"]
holstein = df.loc[df["breed"] == "Holstein-Friesian"]
jersey = df.loc[df["breed"] == "Jersey"]

In [12]:
# Butterfat measurements per breed as plain NumPy arrays for the KDE fits.
a_values = ayrshire["butterfat"].to_numpy()
g_values = guernsey["butterfat"].to_numpy()
h_values = holstein["butterfat"].to_numpy()
j_values = jersey["butterfat"].to_numpy()

# Evaluation grid: 50 evenly spaced points from 2 to 8 (50 is np.linspace's default count).
positions = np.linspace(2, 8, num=50)

In [13]:
def _fit_log_density(values, bandwidth, grid):
    """Fit a Gaussian KDE to ``values`` and evaluate its log-density on ``grid``.

    Both ``values`` and ``grid`` are 1-D arrays; a trailing axis is added because
    ``KernelDensity`` works on 2-D (n_samples, n_features) input.

    Returns the fitted estimator and the array of log-densities, one per grid point.
    """
    kde = KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(
        values[:, np.newaxis]
    )
    return kde, kde.score_samples(grid[:, np.newaxis])


# Each breed keeps its own bandwidth.
a_kde, a_log_dens = _fit_log_density(a_values, 0.15, positions)
g_kde, g_log_dens = _fit_log_density(g_values, 0.25, positions)
h_kde, h_log_dens = _fit_log_density(h_values, 0.15, positions)
j_kde, j_log_dens = _fit_log_density(j_values, 0.3, positions)

p = figure(
    title="figure 7.11",
    height=300,
    width=600,
    x_axis_label="butterfat contents",
    y_axis_label="density",
)

# (legend label, log-density, color) in draw order. Holstein-Friesian gets a
# color clearly distinct from Guernsey's orange so the two can be told apart.
breed_layers = [
    ("Ayrshire", a_log_dens, "#4AA0F6"),
    ("Jersey", j_log_dens, "green"),
    ("Guernsey", g_log_dens, "#E29548"),
    ("Holstein-Friesian", h_log_dens, "#CC79A7"),
]

for breed_name, log_dens, color in breed_layers:
    p.patch(
        positions,
        np.exp(log_dens),  # score_samples yields log-density; plot the density
        fill_alpha=0.4,  # translucent so overlapping breeds stay visible
        fill_color=color,
        line_color=color,
        legend_label=breed_name,
    )

p.legend.title = "breed"

p.yaxis.ticker = [0, 0.5, 1, 1.5]
p.y_range.start = 0
# The grid starts at 2, but the visible x-range begins at 3.
p.x_range.start = 3

show(p)