In [1]:
import pandas as pd
import numpy as np
from bokeh.io import output_notebook, export_png

In [2]:
output_notebook()

In [3]:
from sklearn.neighbors import KernelDensity
from bokeh.plotting import figure, show

In [4]:
file = "../data/csv_files/titanic_all.csv"

# Read the passenger table and discard the name, class and survived columns
# in a single chained expression.
titanic = pd.read_csv(file).drop(columns=["name", "class", "survived"])

# Row subsets selected by the value of the "sex" column.
female = titanic.loc[titanic["sex"] == "female"]
male = titanic.loc[titanic["sex"] == "male"]

In [5]:
# Ages with missing entries removed, as raw arrays: females, males, everyone.
f_values, m_values, t_values = (
    frame["age"].dropna().values for frame in (female, male, titanic)
)

In [8]:
from bokeh.plotting import figure
from sklearn.neighbors import KernelDensity
import numpy as np


def plot_kde(
    data_colors,
    legend_labels,
    title,
    line_color=None,
    kernel="gaussian",
    bandwidth=2,
    legend_title="gender",
):
    """Draw one filled, count-scaled kernel density curve per dataset.

    Every density estimate is evaluated on a fixed grid of 1000 points
    spanning -10 to 80 and then multiplied by the number of observations
    in its dataset.

    Parameters
    ----------
    data_colors : iterable of (array-like, color)
        One pair per curve: the 1-D observations and the fill color.
    legend_labels : iterable of str
        Legend entry for each pair, in the same order as ``data_colors``.
    title : str
        Plot title.
    line_color : color or None, default None
        Outline color passed to every patch.
    kernel : str, default "gaussian"
        Kernel name forwarded to ``KernelDensity``.
    bandwidth : float, default 2
        Bandwidth forwarded to ``KernelDensity``.
    legend_title : str, default "gender"
        Heading displayed above the legend entries.

    Returns
    -------
    The configured Bokeh figure; it is not shown by this function.

    Raises
    ------
    ValueError
        If the number of datasets differs from the number of legend labels.
    """
    # Materialise both inputs so their lengths can be compared; a bare zip()
    # would silently drop the unmatched curves or labels.
    series = list(data_colors)
    labels = list(legend_labels)
    if len(series) != len(labels):
        raise ValueError(
            f"got {len(series)} datasets but {len(labels)} legend labels"
        )

    positions = np.linspace(-10, 80, 1000)
    # Column-vector view of the grid; identical for every curve, so built once.
    grid = positions[:, np.newaxis]

    p = figure(
        title=title,
        height=300,
        width=500,
        toolbar_location=None,
        x_axis_label="age (years)",
        y_axis_label="scaled density",
    )

    for (data, color), legend_label in zip(series, labels):
        # Accept lists and pandas Series as well as ndarrays.
        samples = np.asarray(data).reshape(-1, 1)
        kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(samples)
        # score_samples returns log-densities; exponentiate to get densities.
        density = np.exp(kde.score_samples(grid))
        scaled_density = density * samples.shape[0]

        p.patch(
            positions,
            scaled_density,
            fill_alpha=0.9,
            fill_color=color,
            line_color=line_color,
            legend_label=legend_label,
        )

    # The grid starts at -10, but the visible x-range is clipped to start at 0.
    p.x_range.start = 0
    p.xaxis.ticker = [0, 20, 40, 60]
    p.xgrid.grid_line_color = None
    p.xaxis.axis_line_color = None
    p.xaxis.major_tick_line_color = "gray"
    p.xaxis.major_tick_out = 2

    p.y_range.start = 0
    p.yaxis.minor_tick_out = 0
    p.yaxis.axis_line_color = None
    p.yaxis.major_tick_line_color = "gray"
    p.yaxis.major_tick_out = 0
    p.yaxis.major_tick_in = 0

    p.legend.title = legend_title
    p.legend.location = "top_right"

    return p

In [9]:
legend_labels = ["male", "female"]
title = "Figure 7.8"

# Pair each age array with its fill color, in the same order as the labels.
single_plot_data = list(zip((m_values, f_values), ("#5BA4DB", "#D0771E")))

single = plot_kde(
    single_plot_data,
    legend_labels,
    title,
    line_color="black",
)
show(single)

In [10]:
from bokeh.layouts import gridplot

male_data = [(t_values, "#D5D4D3"), (m_values, "#5BA4DB")]
male_legend = ["all passengers", "males"]
male_title = "Figure 7.9"

female_data = [(t_values, "#D5D4D3"), (f_values, "#D0771E")]
female_legend = ["all passengers", "females"]
female_title = ""

# Bind the figures to their own names: assigning them to `male` / `female`
# would replace the DataFrames of the same name, so the cell that extracts
# the age arrays from those frames could no longer be re-run.
male_plot = plot_kde(male_data, male_legend, male_title)
female_plot = plot_kde(female_data, female_legend, female_title)

# Place the two panels side by side.
layout = gridplot([male_plot, female_plot], ncols=2)
show(layout)

In [11]:
# Path of the cows CSV, then the table parsed from it.
file = "../data/csv_files/cows.csv"
df = pd.read_csv(filepath_or_buffer=file)

In [12]:
# One row subset per breed, matched on the exact "breed" string.
jersey, holstein, guernsey, ayrshire = (
    df.loc[df["breed"] == breed]
    for breed in ("Jersey", "Holstein-Friesian", "Guernsey", "Ayrshire")
)

In [13]:
# Butterfat column of each breed subset as a raw array.
j_values, h_values, g_values, a_values = (
    herd["butterfat"].values for herd in (jersey, holstein, guernsey, ayrshire)
)

# Evaluation grid for the density estimates: 1000 evenly spaced points on [2, 8].
positions = np.linspace(start=2, stop=8, num=1000)

In [14]:
# Per-breed inputs; the four lists are index-aligned.
legend_labels = ["Ayrshire", "Guernsey", "Holstein-Friesian", "Jersey"]
values = [a_values, g_values, h_values, j_values]
bandwidths = [0.125, 0.25, 0.1, 0.3]
# NOTE(review): the 2nd and 3rd colors are both very similar oranges, so the
# Guernsey and Holstein-Friesian curves may be hard to tell apart -- confirm
# this is intended.
colors = ["#4AA0F6", "#DF8E3D", "#E29548", "green"]

# Empty canvas for the overlaid density curves
p = figure(
    x_axis_label="butterfat contents",
    y_axis_label="density",
    width=600,
    height=300,
    title="figure 7.11",
)

# Fit one Gaussian KDE per breed and draw it as a translucent filled patch
for data, bandwidth, color, label in zip(values, bandwidths, colors, legend_labels):
    kde = KernelDensity(bandwidth=bandwidth, kernel="gaussian")
    kde.fit(data.reshape(-1, 1))
    # score_samples yields log-densities on the shared grid
    log_dens = kde.score_samples(positions.reshape(-1, 1))

    p.patch(
        x=positions,
        y=np.exp(log_dens),
        legend_label=label,
        line_color=color,
        fill_color=color,
        fill_alpha=0.4,
    )

# Axis cosmetics: clip both ranges at their lower end and fix the y ticks
p.x_range.start = 3
p.y_range.start = 0
p.yaxis.ticker = [0, 0.5, 1, 1.5]

show(p)