In [249]:
# Meta code to wrap into slides
# `panel serve app.ipynb --autoreload`

import os

import panel as pn
import pandas as pd
import hvplot.pandas


pn.extension(
    template="slides",
    css_files=["https://fonts.googleapis.com/css?family=Inter"],
    raw_css=[
        'html, body { font-family: "Inter",sans-serif; }',
        ":root {--r-background-color: rgb(16, 39, 47) }",
        "h1 { text-align: left; font-size: 3em; }",
        "h2 { text-align: left; font-size: 3em; }",
        "ul { text-align: left; font-size: 2em; }",
        "p { text-align: left; font-size: 2em; }",
        "pre { text-align: left; }",
    ],
    theme="dark",
)

pn.state.template.param.update(
    design=pn.theme.Material,
    header_background="white",
    logo="anaconda.png",
    title="Easy exploration and ML-model tuning with hvPlot",
)

pn.state.template.config.param.update(
    raw_css=[
        "#header { height: 0; padding: 20px; }",
        "li { text-align: left; }",
        "p { margin-block-start: 0.5em; margin-block-end: 0.2em}",
    ],
    css_files=[pn.io.resources.CSS_URLS["font-awesome"]],
)

slide = lambda *objs: pn.Column(
    pn.Column(*objs),
    sizing_mode="stretch_both",
    min_height=600,
    styles={"display": "flex", "height": "100%", "align-items": "center"},
)


def header(text, size="4em", **kwargs):
    return HTML(
        f"<span>{text}</span>",
        styles={"font-size": size, "font-weight": "bold"},
        **kwargs,
    )


def text_fragment(text, size="0.5em", **kwargs):
    return Markdown(
        text,
        styles={"font-size": size, "font-weight": "bold"},
        tags=["fragment"],
        **kwargs,
    )


def ends(df):
    try:
        return pn.Column(df.iloc[:5, :5], "...", df.iloc[-5:, -5:])
    except:
        return pn.Column(df.iloc[:5], "...", df.iloc[-5:])

In [250]:
slide(
    pn.Column(
        """
        ## Goals

        - Demo hvplot's ease-of-use and features
        - Show how to use hvplot for ML
        - Share applicable **PRO TIPS** for data workflows
        """,
        pn.Column(
            pn.Row(
                pn.panel("katrina.png", width=300),
                pn.panel("tracks_climatology.png", width=300),
            ),
            pn.Row(
                pn.panel("heatmap.png", width=300),
                pn.panel("scores.png", width=300)
            ),
        ),
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'04b24e27-9306-4f86-acad-2452675162fa': {'version…

In [251]:
os.system(
    "wget -nc https://www.ncei.noaa.gov/data/international-best-track-archive-for-climate-stewardship-ibtracs/v04r00/access/csv/ibtracs.since1980.list.v04r00.csv"
)

slide(
    pn.Column(
        """
        ## Download the data

        - Contains records of tropical cyclone (TC) since 1980
        - Includes coordinates, intensity, wind speed, central pressure, etc
        ```python
        wget -nc https://www.ncei.noaa.gov/data/international-best-track-archive-for-
        climate-stewardship-ibtracs/v04r00/access/csv/ibtracs.since1980.list.v04r00.csv
        ```
        """
    ).servable()
)

File ‘ibtracs.since1980.list.v04r00.csv’ already there; not retrieving.



BokehModel(combine_events=True, render_bundle={'docs_json': {'f871559b-28e2-4077-843b-e046542a10cc': {'version…

In [252]:
df = pd.read_csv(
    "ibtracs.since1980.list.v04r00.csv", parse_dates=["ISO_TIME"], skiprows=[1]
)

icols = [19, 20, 161, 162]
for icol in icols:
    col = df.iloc[:, icol].name
    df[col] = pd.to_numeric(df[col], errors="coerce")

slide(
    pn.Column(
        """
        ## Read and clean the data

        - Open CSV with pandas
        - Remove second row (index 1) since it's another header row with units
        - Fix data types (object to numeric)

        ```python
        import pandas as pd

        df = pd.read_csv("ibtracs.since1980.list.v04r00.csv", parse_dates=["ISO_TIME"], skiprows=[1])
    
        icols = [19, 20, 161, 162]
        for icol in icols:
            col = df.iloc[:, icol].name
            df[col] = pd.to_numeric(df[col], errors="coerce")
    
        display(df.dtypes[:8], df.head())
        ```
        """,
        pn.Row(ends(df.dtypes), ends(df)),
    ).servable()
)

  df = pd.read_csv(


BokehModel(combine_events=True, render_bundle={'docs_json': {'e6d06332-286e-4b6a-ac16-0a6eb9a0d4b4': {'version…

In [None]:
slide(
    pn.Column(
        """
        ## Export to parquet

        - Minimal effort for improved read/write efficiency
        - Accessible by many tools (pandas, dask, duckdb, etc)

        ```python
        df.to_parquet("ibtracs.parquet")
        ```
        """,
    ).servable()
)


In [None]:
katrina_df = df.query(
    "NAME == 'KATRINA' and SEASON == 2005"
)

slide(
    pn.Row(
        '''
        ## Query for Hurricane Katrina

        - Use `query` instead of `loc` as it's more concise
        - OR use SQL with duckdb and convert to pandas with `fetchdf`
        ```python
        df = pd.read_parquet("ibtracs.parquet")
        katrina_df = df.query(
            "NAME == 'KATRINA' and SEASON == 2005"
        )

        # equivalent to the above
        import duckdb

        katrina_df = duckdb.execute(
            """
            SELECT * FROM 'ibtracs.parquet'
            WHERE NAME == 'KATRINA' AND SEASON == 2005
            """
        ).fetchdf()
        ```
        ''',
    ).servable()
)

In [None]:
katrina_points = katrina_df.hvplot(
    kind="paths",
    x="LON",
    y="LAT",
    color="USA_SSHS",
    hover_cols=["USA_WIND", "USA_PRES", "ISO_TIME"],
    colorbar=True,
    clabel="Saffir-Simpson Hurricane Scale",
    title="Hurricane Katrina Track",
    tiles=True,
).opts(responsive=True, shared_axes=False)

slide(
    pn.Column(
        '''
        ## Plot Hurricane Katrina's tracks

        - Get an interactive plot with `hvplot`
        - Simply choose a chart type (`kind=paths`) select `x` and `y` coordinates
        - Map `color` to the Saffir-Simpson Hurricane Scale (SSHS)
        - Add data columns to show upon hovering with `hover_cols`
        - **PRO TIP**: Set `tiles=True` to get a base map for spatial context
        - Add annotations (`title`, `colorbar`, `tiles`, etc)

        ```python
        katrina_df.hvplot(
            kind="paths",
            x="LON",
            y="LAT",
            color="USA_SSHS",
            hover_cols=["USA_WIND", "USA_PRES", "ISO_TIME"],
            colorbar=True,
            clabel="Saffir-Simpson Hurricane Scale",
            title="Hurricane Katrina Track",
            tiles=True,
        )
        ```
        ''',
    ).servable()
)

In [None]:
slide(
    pn.pane.HoloViews(katrina_points).servable()
)

In [110]:
year_df = df.query("SEASON == 2005")

year_points = year_df.hvplot(
    kind="points",
    x="LON",
    y="LAT",
    color="USA_SSHS",
    hover_cols=["USA_WIND", "USA_PRES", "ISO_TIME"],
    groupby="NAME",
    colorbar=True,
    clabel="Saffir-Simpson Hurricane Scale",
    title="2005 Tropical Cyclone Track",
    tiles=True,
).opts(responsive=True, aspect=1)

slide(
    pn.Column(
        '''
        ## Select and plot each track in 2005

        - Can easily be points instead by setting `kind="points"`
        - **PRO TIP**: Use `groupby` to get a line for each storm

        ```python
        year_df = df.query("SEASON == 2005")

        year_df.hvplot(
            kind="points",
            x="LON",
            y="LAT",
            color="USA_SSHS",
            hover_cols=["USA_WIND", "USA_PRES", "ISO_TIME"],
            groupby="NAME",
            colorbar=True,
            clabel="Saffir-Simpson Hurricane Scale",
            title="Hurricane Track",
            tiles=True,
        )
        ```
        ''',
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'5855a012-690a-49b7-8022-b2793dce5d55': {'version…

In [111]:
slide(
    pn.panel(year_points).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'aa7b5f9a-27cf-426e-8d5d-9329aa207f94': {'version…

In [117]:
all_points = df.hvplot(
    kind="points",
    x="LON",
    y="LAT",
    color="USA_SSHS",
    colorbar=True,
    clabel="Saffir-Simpson Hurricane Scale",
    title="Tropical Cyclone Tracks",
    tiles=True,
).opts(responsive=True, aspect=1)

slide(
    pn.Column(
        '''
        ## Plot ALL tracks since 1980

        - Overplotted; not too meaningful and hard to interpet...

        ```python
        df.hvplot(
            kind="points",
            x="LON",
            y="LAT",
            color="USA_SSHS",
            tiles=True,
        )
        ```
        ''',
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'44aa6a5e-428d-4210-8022-422807272193': {'version…

In [118]:
slide(
    pn.panel(all_points).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'6b9b3953-16fa-423d-a728-87d6da11c6b2': {'version…

In [119]:
max_points = df.hvplot(
    kind="points",
    x="LON",
    y="LAT",
    color="USA_SSHS",
    colorbar=True,
    clabel="Saffir-Simpson Hurricane Scale",
    title="Tropical Cyclone Tracks (Most Intense)",
    tiles=True,
    rasterize=True,
    aggregator="max",
    clim=(0, 5),
    cmap="RdYlBu_r",
).opts(responsive=True, aspect=1)

slide(
    pn.Column(
        '''
        ## Plot ALL tracks since 1980 as rasterized max

        - **PRO TIP**: Use `rasterize=True` to plot billions of points efficiently
        - Uses datashader under the hood to aggregate points into pixels
        - Set `aggregator="max"` to get the max value for each pixel
        - Adjust colorbar limits easily with `clim=(0, 5)`
        - Update the colormap with `cmap="RdYlBu_r"`

        ```python
        df.hvplot(
            kind="points",
            x="LON",
            y="LAT",
            color="USA_SSHS",
            colorbar=True,
            clabel="Saffir-Simpson Hurricane Scale",
            title="Tropical Cyclone Tracks",
            tiles=True,
            rasterize=True,
            aggregator="max",
            clim=(0, 5),
            cmap="RdYlBu_r",
        )
        ```
        ''',
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'9b487a29-40e9-44fe-a116-b97866fbc49b': {'version…

In [120]:
slide(
    pn.panel(max_points).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'f4411c25-1543-42c6-805d-0e87f3029334': {'version…

Task exception was never retrieved
future: <Task finished name='Task-359' coro=<Callback.process_on_change() done, defined at /Users/ahuang/miniconda3/lib/python3.10/site-packages/holoviews/plotting/bokeh/callbacks.py:322> exception=UnsetValueError("figure(id='d70162f0-93ff-4b4e-bcc3-09aaaa64e6f8', ...).inner_height doesn't have a value set")>
Traceback (most recent call last):
  File "/Users/ahuang/miniconda3/lib/python3.10/site-packages/holoviews/plotting/bokeh/callbacks.py", line 340, in process_on_change
    msg[attr] = self.resolve_attr_spec(path, cb_obj)
  File "/Users/ahuang/miniconda3/lib/python3.10/site-packages/holoviews/plotting/bokeh/callbacks.py", line 248, in resolve_attr_spec
    resolved = getattr(resolved, p, None)
  File "/Users/ahuang/miniconda3/lib/python3.10/site-packages/bokeh/core/property/descriptors.py", line 283, in __get__
    raise UnsetValueError(f"{obj}.{self.name} doesn't have a value set")
bokeh.core.property.descriptors.UnsetValueError: figure(id='d7016

In [121]:
count_points = df.hvplot(
    kind="points",
    x="LON",
    y="LAT",
    colorbar=True,
    clabel="Count",
    title="Tropical Cyclone Tracks (Count)",
    tiles=True,
    rasterize=True,
    aggregator="count",
    x_sampling=0.25,
    y_sampling=0.25,
    cmap="viridis",
).opts(responsive=True, aspect=1)

slide(
    pn.Column(
        '''
        ## Plot ALL tracks since 1980 as rasterized count

        - Drop `color="USA_SSHS"` & set `aggregator="count"` to get the 2D histogram
        - Other `aggregator`s include `any', 'count', 'first', 'last', 'max', 'mean', 'min', etc
        - Increase `x_sampling` and `y_sampling` to get a smoother plot

        ```python
        df.hvplot(
            kind="points",
            x="LON",
            y="LAT",
            colorbar=True,
            clabel="Count",
            title="Tropical Cyclone Tracks (Count)",
            tiles=True,
            rasterize=True,
            aggregator="count",
            x_sampling=0.25,
            y_sampling=0.25,
            cmap="viridis",
        )
        ```
        ''',
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'5d32750b-df50-4259-b80b-23a4f8053bfe': {'version…

In [122]:
slide(
    pn.panel(count_points).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'427ec7ea-6e36-4a97-ac63-78985dd79a86': {'version…

Task exception was never retrieved
future: <Task finished name='Task-364' coro=<Callback.process_on_change() done, defined at /Users/ahuang/miniconda3/lib/python3.10/site-packages/holoviews/plotting/bokeh/callbacks.py:322> exception=UnsetValueError("figure(id='8e5d2871-a3f3-4fef-a4ab-104f0ed19277', ...).inner_height doesn't have a value set")>
Traceback (most recent call last):
  File "/Users/ahuang/miniconda3/lib/python3.10/site-packages/holoviews/plotting/bokeh/callbacks.py", line 340, in process_on_change
    msg[attr] = self.resolve_attr_spec(path, cb_obj)
  File "/Users/ahuang/miniconda3/lib/python3.10/site-packages/holoviews/plotting/bokeh/callbacks.py", line 248, in resolve_attr_spec
    resolved = getattr(resolved, p, None)
  File "/Users/ahuang/miniconda3/lib/python3.10/site-packages/bokeh/core/property/descriptors.py", line 283, in __get__
    raise UnsetValueError(f"{obj}.{self.name} doesn't have a value set")
bokeh.core.property.descriptors.UnsetValueError: figure(id='8e5d2

In [124]:
slide(
    pn.Column(
        '''
        ## Mid-point Summary

        - Use hvplot to quickly, and easily, visualize your data
        - Outputs interactive plots with hover, zoom, pan capabilities
        - Set keyword arguments to access advanced features
        - Get a base map overlay with `tiles`
        - Adds widgets automatically for easy exploration using `groupby`
        - Plots billions of data points efficiently thru `rasterize`
        - Integrates with `panel` for dashboards (or presentations like this)
        - Check out https://hvplot.holoviz.org/user_guide/Customization.html for all the options
        ''',
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'51f874f8-cd21-4132-a4a3-c1154f119331': {'version…

In [128]:
slide(
    pn.Row(
        """
        ## Using hvplot for ML

        Motivation:
        "El Niño events generally suppress Atlantic<br>hurricane activity"
        [[Source](https://www.weather.gov/jan/el_nino_and_la_nina)]

        Let's test this hypothesis using hvplot and scikit-learn!
        """,
        pn.panel(
            "https://www.weather.gov/images/jan/ElNino_LaNina/elninowxpatterns.gif"
        ),
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'3569ed15-1f52-4507-95df-f7d3c4e6dc85': {'version…

In [149]:
atlantic_df = df.query(
    "NAME != 'NOT_NAMED' and "
    "USA_ATCF_ID.str.startswith('AL') and "
    "SEASON < 2023 and "
    "ISO_TIME.dt.month.between(6, 11) "
)

atlantic_points = atlantic_df.hvplot(
    kind="points",
    x="LON",
    y="LAT",
    colorbar=True,
    clabel="Count",
    title="Tropical Cyclone Tracks (Count)",
    tiles=True,
    rasterize=True,
    aggregator="count",
    x_sampling=0.25,
    y_sampling=0.25,
    cmap="viridis",
).opts(xlim=(-179, 179), aspect=1, responsive=True)


slide(
    pn.Column(
        '''
        ## Subset peak hurricane season in the Atlantic basin

        - The six-month season runs from June 1 to November 30 [[Source](https://www.noaa.gov/news-release/2023-atlantic-hurricane-season-outlook)].
        - Verify query works as expected by visualizing

        ```python
        atlantic_df = df.query(
            "NAME != 'NOT_NAMED' and "
            "USA_ATCF_ID.str.startswith('AL') and "
            "SEASON < 2023 and "
            "ISO_TIME.dt.month.between(6, 11) "
        )

        atlantic_df.hvplot.points(
            kind="points",
            x="LON",
            y="LAT",
            colorbar=True,
            clabel="Count",
            title="Tropical Cyclone Tracks (Count)",
            tiles=True,
            rasterize=True,
            aggregator="count",
            x_sampling=0.25,
            y_sampling=0.25,
            cmap="viridis",
        ).opts(xlim=(-179, 179), responsive=True)
        ```
        ''',
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'56524628-b4ff-4515-bfe6-f198bf4a0873': {'version…

In [150]:
slide(pn.panel(atlantic_points).servable())

BokehModel(combine_events=True, render_bundle={'docs_json': {'f269cafb-cd3d-4843-bb6b-bb140d630a1c': {'version…

Task exception was never retrieved
future: <Task finished name='Task-446' coro=<Callback.process_on_change() done, defined at /Users/ahuang/miniconda3/lib/python3.10/site-packages/holoviews/plotting/bokeh/callbacks.py:322> exception=UnsetValueError("figure(id='9e1a44b7-ff8f-4a92-9fb8-a580881bb399', ...).inner_height doesn't have a value set")>
Traceback (most recent call last):
  File "/Users/ahuang/miniconda3/lib/python3.10/site-packages/holoviews/plotting/bokeh/callbacks.py", line 340, in process_on_change
    msg[attr] = self.resolve_attr_spec(path, cb_obj)
  File "/Users/ahuang/miniconda3/lib/python3.10/site-packages/holoviews/plotting/bokeh/callbacks.py", line 248, in resolve_attr_spec
    resolved = getattr(resolved, p, None)
  File "/Users/ahuang/miniconda3/lib/python3.10/site-packages/bokeh/core/property/descriptors.py", line 283, in __get__
    raise UnsetValueError(f"{obj}.{self.name} doesn't have a value set")
bokeh.core.property.descriptors.UnsetValueError: figure(id='9e1a4

In [157]:
# %%
atlantic_count_df = (
    df.query(
        "NAME != 'NOT_NAMED' and "
        "USA_ATCF_ID.str.startswith('AL') and "
        "SEASON < 2023 and "
        "ISO_TIME.dt.month.between(6, 11) "
    )
    .groupby("SEASON")["NAME"]
    .nunique()
    .reset_index()
    .rename(columns={"NAME": "unique_names"})
)

atlantic_count_curve = atlantic_count_df.hvplot(
    x="SEASON",
    y="unique_names",
    title="Unique Names per Season",
)

slide(
    pn.Column(
        '''
        ## Compute the number of unique names per season

        - Each name is used once per season
        - **PRO TIP**: Use `between` to filter between two values

        ```python
        atlantic_count_df = (
            df.query(
                "NAME != 'NOT_NAMED' and "
                "USA_ATCF_ID.str.startswith('AL') and "
                "SEASON < 2023 and "
                "ISO_TIME.dt.month.between(6, 11) "
            )
            .groupby("SEASON")["NAME"]
            .nunique()
            .reset_index()
            .rename(columns={"NAME": "unique_names"})
        )
        """).fetchdf()

        atlantic_count_df.hvplot(
            x="SEASON",
            y="unique_names",
            title="Unique Names per Season",
        )
        ''',
        pn.panel(atlantic_count_curve),
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'d4425605-3c78-4b4d-8137-738d72e2f3e7': {'version…

In [160]:
nino_df = pd.read_csv(
    "https://raw.githubusercontent.com/ahuang11/oni/master/nino_ml.csv",
    index_col=0,
    parse_dates=True,
)
nino_df = nino_df.rename_axis("date").dropna()

slide(
    pn.Column(
        """
        ## Get Nino predictors loaded

        - **PRO TIP**: Read directly from URL
        - t300 - depth averaged temps up from 0 to 300m
        - wwv - warm water volume
        - u850 - 850 mb trade wind index

        ```python
        nino_df = pd.read_csv(
            'https://raw.githubusercontent.com/ahuang11/oni/master/nino_ml.csv',
            index_col=0,
            parse_dates=True
        )
        nino_df = nino_df.rename_axis("date").dropna()
        ```
        """,
        ends(nino_df),
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'ca1a0a4f-88a3-4a6d-9e52-392704369de5': {'version…

In [182]:
nino_spring_df = nino_df.assign(**{
    "year": lambda df: df.index.year,
    "month": lambda df: df.index.month,
}).query(
    "month.between(1, 5)"
)

slide(
    pn.Row(
        '''
        ## Subset predictors

        - Select months prior to the peak season: January to May
        - **PRO TIP**: Use `lambda` for one assignments using latest df

        ```python
        nino_spring_df = nino_df.assign(**{
            "year": lambda df: df.index.year,
            "month": lambda df: df.index.month,
        }).query(
            "month.between(1, 5)"
        )

        display(nino_spring_df.head())
        ```
        ''',
        pn.Column(
            pn.panel("diagram.png", width=500),
            ends(nino_spring_df),
        ),
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'c6e86058-ffc8-4a19-bb90-b3732bb32a7f': {'version…

In [184]:
ml_df = (
    atlantic_count_df.set_index("SEASON")
    .join(nino_spring_df.set_index("year"))
).reset_index(names=["year"])

slide(
    pn.Column(
        """
        ## Join the two dataframes

        - Use year / SEASON as the primary key

        ```python
        ml_df = (
            atlantic_count_df.set_index("SEASON")
            .join(nino_spring_df.set_index("year"))
        ).reset_index(names=["year"])
        
        display(ml_df.head())
        ```
        """,
        ends(ml_df),
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'5492453a-5a62-4e72-bf9b-3ac64979fda8': {'version…

In [206]:
corr_df = (
    ml_df.groupby("month")
    .corr(numeric_only=True)["unique_names"]
    .sort_values()
    .rename_axis(["month", "parameter"])
)

corr_heatmap = corr_df.hvplot(
    kind="heatmap",
    x="month",
    y="parameter",
    c="unique_names",
    cmap="RdBu_r",
    colorbar=True,
    symmetric=True,
    title="Correlation Matrix",
    clabel="Correlation",
    height=800,
).opts(color_levels=12)

slide(
    pn.Row(
        """
        # Explore the best features for model

        - Sort by highest values and create a heatmap
        - **PRO TIP**: Set `color_levels` for discernible colors at a glance
        - Select features using highest correlation
        - Zonal winds at 850mb are quite correlated
        - Nino indices are not extremely correlated
        - April and May are the most correlated months

        ```python
        corr_df = (
            ml_df.groupby("month")
            .corr(numeric_only=True)["unique_names"]
            .sort_values()
            .rename_axis(["month", "parameter"])
        )
        corr_heatmap = corr_df.hvplot(
            kind="heatmap",
            x="month",
            y="parameter",
            c="unique_names",
            cmap="RdBu_r",
            colorbar=True,
            symmetric=True,
            title="Correlation Matrix",
            clabel="Correlation",
            height=800,
        ).opts(color_levels=12)
        ```
        """,
        pn.panel(corr_heatmap),
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'8368116d-e098-431e-a12a-20954dd0234b': {'version…

In [234]:
ml_month_dfs = []
for month in range(4, 6):
    ml_month_df = ml_df.loc[ml_df["month"] == month].drop(columns=["month"])
    ml_month_df = ml_month_df.set_index(["year", "unique_names"])
    ml_month_df.columns = ml_month_df.columns + f"_m{month}"
    ml_month_dfs.append(ml_month_df)

ml_month_df = pd.concat(ml_month_dfs, axis=1).reset_index()
ml_feature_df = ml_month_df.filter(
    regex="t300|u850_c|olr|wwv_e|year|month|unique_names"
)

slide(
    pn.Column(
        """
        ## Subset by highest correlated months and features

        - Convert each month's row as individual predictor columns
        - **PRO TIP**: Filter column names by regex instead of typing them out

        ```python
        ml_month_dfs = []
        for month in range(4, 6):
            ml_month_df = ml_df.loc[ml_df["month"] == month].drop(columns=["month"])
            ml_month_df = ml_month_df.set_index(["year", "unique_names"])
            ml_month_df.columns = ml_month_df.columns + f"_m{month}"
            ml_month_dfs.append(ml_month_df)

        ml_month_df = pd.concat(ml_month_dfs, axis=1).reset_index()
        ml_feature_df = ml_month_df.filter(
            regex="t300|u850_c|olr|wwv_e|year|month|unique_names"
        )
        ```
        """,
        ends(ml_month_df),
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'b178bfaf-4de9-4e10-a541-a43c2261b6f7': {'version…

In [235]:

import statsmodels.api as sm


def run_model(model, X, y, train_index, val_index, **model_kwargs):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    X_train = sm.add_constant(X_train)
    model = model(y_train, X_train, **model_kwargs)
    fitted_model = model.fit()

    X_val = sm.add_constant(X_val)
    y_pred_val = fitted_model.predict(X_val)

    val_df = pd.DataFrame(
        {"year": X_val["year"], "actual": y_val, "prediction": y_pred_val}
    ).sort_values("year")
    return val_df


def score_output(val_df):
    corr = val_df["actual"].corr(val_df["prediction"])
    rmse = ((val_df["actual"] - val_df["prediction"]) ** 2).mean() ** 0.5
    return pd.Series([corr, rmse], index=["corr", "rmse"])


slide(
    pn.Row(
        """
        ## Create ML helper functions

        - Splits into training and validation, fits model, and makes prediction
        - Scores the output using correlation and RMSE

        ```python
        import statsmodels.api as sm

        def run_model(model, X, y, train_index, val_index, **model_kwargs):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            X_train = sm.add_constant(X_train)
            model = model(y_train, X_train, **model_kwargs)
            fitted_model = model.fit()

            X_val = sm.add_constant(X_val)
            y_pred_val = fitted_model.predict(X_val)

            val_df = pd.DataFrame(
                {"year": X_val["year"], "actual": y_val, "prediction": y_pred_val}
            ).sort_values("year")
            return val_df


        def score_output(val_df):
            corr = val_df["actual"].corr(val_df["prediction"])
            rmse = ((val_df["actual"] - val_df["prediction"]) ** 2).mean() ** 0.5
            return pd.Series([corr, rmse], index=["corr", "rmse"])
        ```
        """,
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'889d9675-bf27-48fe-b12b-58b841e08970': {'version…

In [236]:
from sklearn.model_selection import KFold

X = ml_feature_df.drop(columns=["unique_names"])
y = ml_feature_df["unique_names"]

num_folds = 4  # Adjust the number of folds as needed
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
val_dfs = []
for i, (train_index, val_index) in enumerate(kf.split(X)):
    val_df = run_model(sm.OLS, X, y, train_index, val_index)
    val_df["fold"] = i
    val_dfs.append(val_df)

val_df = pd.concat(val_dfs).sort_values("year")

score_df = val_df.groupby("fold").apply(score_output)
val_plot = val_df.hvplot(
    "year", ["actual", "prediction"], title="With Feature Selection"
)


slide(
    pn.Column(
        """
        # Run linear regression and score

        - Split dataset into X (predictors) and Y (goal)
        - Run K-Fold for reliability and preventing overfitting

        ```python
        from sklearn.model_selection import KFold

        X = ml_feature_df.drop(columns=["unique_names"])
        y = ml_feature_df["unique_names"]

        num_folds = 4  # Adjust the number of folds as needed
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
        val_dfs = []
        for i, (train_index, val_index) in enumerate(kf.split(X)):
            val_df = run_model(sm.OLS, X, y, train_index, val_index)
            val_df["fold"] = i
            val_dfs.append(val_df)

        val_df = pd.concat(val_dfs).sort_values("year")

        score_df = (val_df.groupby("fold").apply(score_output))
        val_plot = val_df.hvplot("year", ["actual", "prediction"])
        display(val_plot, score_df)
        ```
        """,
        pn.Row(
            val_plot,
            score_df
        )
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'4caa743d-bfb5-40d9-8f54-faf471148fa8': {'version…

In [237]:
X = ml_month_df.drop(columns=["unique_names"]).select_dtypes("number")
y = ml_month_df["unique_names"]

num_folds = 4  # Adjust the number of folds as needed
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
val_dfs = []
for i, (train_index, val_index) in enumerate(kf.split(X)):
    val_df = run_model(sm.OLS, X, y, train_index, val_index)
    val_df["fold"] = i
    val_dfs.append(val_df)

val_df = pd.concat(val_dfs).sort_values("year")

new_score_df = val_df.groupby("fold").apply(score_output)
new_val_plot = val_df.hvplot(
    "year", ["actual", "prediction"], title="No Feature Selection"
)


slide(
    pn.Column(
        """
        ## Re-run and score w/o feature selection

        - Other features result in noise
        - This lowers the skill score

        ```python
        X = ml_month_df.drop(columns=["unique_names"]).select_dtypes("number")
        y = ml_month_df["unique_names"]

        num_folds = 4  # Adjust the number of folds as needed
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
        val_dfs = []
        for i, (train_index, val_index) in enumerate(kf.split(X)):
            val_df = run_model(sm.OLS, X, y, train_index, val_index)
            val_df["fold"] = i
            val_dfs.append(val_df)

        val_df = pd.concat(val_dfs).sort_values("year")

        score_df = (val_df.groupby("fold").apply(score_output))
        val_plot = val_df.hvplot("year", ["actual", "prediction"])
        display(val_plot, score_df)
        ```
        """,
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'c2d1696f-c3cf-4d67-b763-653849b4111f': {'version…

In [241]:

slide(
    pn.Column(
        """
        Compare the two models

        - Feature selected model has better scores across the board
        - Higher correlation maxes and lower RMSE mins
        """,
        pn.Row(
            val_plot,
            score_df,
        ),
        pn.Row(
            new_val_plot,
            new_score_df,
        ),
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'a8c10b16-e037-472f-afa2-52122960d60d': {'version…

In [248]:
slide(
    pn.Column(
        """
        ## Ideas for you to try!

        - Different models (deep learning?)
        - Using other predictors (other oscillations)
        - Feature engineering (previous year's count)
        - Share your results in community: https://discourse.holoviz.org/
        """,
    ).servable()
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'1f34d246-e906-4dab-8ed3-d25b2f6c5e41': {'version…

In [253]:
slide(
    pn.Column(
        """
        Scan QR code for presentation materials
        - Click 'Files' to see source files
        - Click 'App' to see the slides
        - Keep in touch: ahuang@anaconda.com
        """,
        pn.pane.Image("qrcode.png", sizing_mode="scale_both")
    ).servable()
)


BokehModel(combine_events=True, render_bundle={'docs_json': {'e5d0035e-ba13-4518-9278-4bc34493423d': {'version…