In [1]:
SAMPLE_SIZE = 100_000
DTYPES = {
    'date_time': 'string',
    'site_name': 'uint8',
    'posa_continent': 'uint8',
    'user_location_country': 'uint8',
    'user_location_region': 'uint16',
    'user_location_city': 'uint16',
    'orig_destination_distance': 'float32',
    'user_id': 'uint32',
    'is_mobile': 'bool',
    'is_package': 'bool',
    'channel': 'uint8',
    'srch_ci': 'string',
    'srch_co': 'string',
    'srch_adults_cnt': 'uint8',
    'srch_children_cnt': 'uint8',
    'srch_rm_cnt': 'uint8',
    'srch_destination_id': 'uint16',
    'srch_destination_type_id': 'uint8',
    'is_booking': 'bool',
    'cnt': 'uint16',
    'hotel_continent': 'uint8',
    'hotel_country': 'uint8',
    'hotel_market': 'uint16',
    'hotel_cluster': 'uint8',
}
DATETIME_COLUMNS = ['date_time', 'srch_ci', 'srch_co']
BOOLEAN_COLUMNS = ['is_booking', 'is_mobile', 'is_package']

In [14]:
import numpy as np
import polars as pl


def map_to_polars(dtype: str):
    conversion = {
        'string': pl.String,
        'uint8': pl.UInt8,
        'uint16': pl.UInt16,
        'uint32': pl.UInt32,
        'float32': pl.Float32,
        'bool': pl.UInt8 
    }
    return conversion[dtype]

dtypes = {k: map_to_polars(v) for k, v in DTYPES.items()}
df = pl.read_csv('../../data/raw/train.csv', dtypes=dtypes)
df = df.with_columns(
    *[pl.col(col).str.to_datetime() for col in DATETIME_COLUMNS]
)
for col in BOOLEAN_COLUMNS:
    df = df.replace(col, df[col] == 1)

In [15]:
def datetime_to_numerical(dt: pl.Series) -> pl.Series:
    '''
    Converts input datetime series into numerical representation, using this formula:
    result = hours + 24 * (day + 31 * (month + 12 * year))  
    '''

    return dt.dt.hour() + 24 * (dt.dt.day() + 31 * (dt.dt.month() + 12 * dt.dt.year()))

In [16]:
df.head(10)

date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,channel,srch_ci,srch_co,srch_adults_cnt,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
datetime[μs],u8,u8,u8,u16,u16,f32,u32,bool,bool,u8,datetime[μs],datetime[μs],u8,u8,u8,u16,u8,bool,u16,u8,u8,u16,u8
2014-08-11 07:46:59,2,3,66,348,48862,2234.26416,12,False,True,9,2014-08-27 00:00:00,2014-08-31 00:00:00,2,0,1,8250,1,False,3,2,50,628,1
2014-08-11 08:22:12,2,3,66,348,48862,2234.26416,12,False,True,9,2014-08-29 00:00:00,2014-09-02 00:00:00,2,0,1,8250,1,True,1,2,50,628,1
2014-08-11 08:24:33,2,3,66,348,48862,2234.26416,12,False,False,9,2014-08-29 00:00:00,2014-09-02 00:00:00,2,0,1,8250,1,False,1,2,50,628,1
2014-08-09 18:05:16,2,3,66,442,35390,913.193176,93,False,False,3,2014-11-23 00:00:00,2014-11-28 00:00:00,2,0,1,14984,1,False,1,2,50,1457,80
2014-08-09 18:08:18,2,3,66,442,35390,913.625916,93,False,False,3,2014-11-23 00:00:00,2014-11-28 00:00:00,2,0,1,14984,1,False,1,2,50,1457,21
2014-08-09 18:13:12,2,3,66,442,35390,911.514221,93,False,False,3,2014-11-23 00:00:00,2014-11-28 00:00:00,2,0,1,14984,1,False,1,2,50,1457,92
2014-07-16 09:42:23,2,3,66,189,10067,,501,False,False,2,2014-08-01 00:00:00,2014-08-02 00:00:00,2,0,1,8267,1,False,2,2,50,675,41
2014-07-16 09:45:48,2,3,66,189,10067,,501,False,True,2,2014-08-01 00:00:00,2014-08-02 00:00:00,2,0,1,8267,1,False,1,2,50,675,41
2014-07-16 09:52:11,2,3,66,189,10067,,501,False,False,2,2014-08-01 00:00:00,2014-08-02 00:00:00,2,0,1,8267,1,False,1,2,50,675,69
2014-07-16 09:55:24,2,3,66,189,10067,,501,False,False,2,2014-08-01 00:00:00,2014-08-02 00:00:00,2,0,1,8267,1,False,1,2,50,675,70


In [26]:
df_sample = df.sample(SAMPLE_SIZE)

In [None]:
from ydata_profiling import ProfileReport

profile_report = ProfileReport(df_sample, title='Data profile report', explorative=True)

In [None]:
profile_report.to_file('../reports/report.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
from pandas import Series
from plotly.express import bar


nulls = Series(df.null_count().row(0), index=df.columns)
bar((nulls.sort_values(ascending=False) * 100) / len(df), title='Missing values (in %, to 100)', orientation='h', labels={
    'x': 'Missing values (to 100%)',
    'y': 'Column'
})

In [17]:
numerical_columns = [column for column in df.columns if df[column].dtype not in [pl.Boolean, pl.String, pl.Datetime]]
categorical_columns = [column for column in df.columns if df[column].dtype in [pl.Boolean, pl.String]]
datetime_columns = [column for column in df.columns if column not in numerical_columns and column not in categorical_columns]

In [159]:
df = df.with_columns(
    co_ci_diff=(pl.col('srch_co') - pl.col('srch_ci')).dt.days().cast(pl.Int16)
)

In [160]:
from typing import TypeVar, Generic


T = TypeVar('T')
V = TypeVar('V')

class PairedDict(Generic[T, V]):
    def __init__(self, initial: dict[tuple[T, ...], V] | None = None) -> None:
        if initial is None:
            self._dict = dict()
        else:
            self._dict = {frozenset(k): v for k, v in initial.items()}

    def __contains__(self, item: tuple[T, ...]) -> bool:
        return frozenset(item) in self._dict
    
    def __getitem__(self, key: tuple[T, ...]) -> V:
        return self._dict[frozenset(key)]

In [161]:
from enum import Enum
from typing import Any

import datashader as ds
import datashader.transfer_functions as tf
import plotly.express as px
import plotly.graph_objects as go



from dash import Dash, dcc, html, Input, Output
from pandas import DataFrame
from plotly.graph_objects import Figure
from plotly_resampler import FigureResampler, FigureWidgetResampler


DASH_PORT = 8050
FIGURE_HEIGHT = 800


class FeatureType(Enum):
    NUMERICAL = 'numerical'
    CATEGORICAL = 'categorical'
    DATETIME = 'datetime'


def get_feature_type(feature: str) -> FeatureType:
    if feature in numerical_columns:
        return FeatureType.NUMERICAL
    elif feature in datetime_columns:
        return FeatureType.DATETIME
    return FeatureType.CATEGORICAL


def get_boxplot_data(df: pl.DataFrame, col: str) -> dict[str, Any]:
    s = df[col]
    
    res = {
        'q1': np.array([s.quantile(0.25)]),
        'q3': np.array([s.quantile(0.75)]),
        'median': np.array([s.median()]),
        'mean': np.array([s.mean()]),
        'sd': np.array([s.std()]),
        'notched': False
    }
    iqr = res['q3'] - res['q1']
    res['upperfence'] = res['q3'] + 1.5 * iqr
    res['lowerfence'] = res['q1'] - 1.5 * iqr
    return res


def boxplot(df: pl.DataFrame, x: str, y: str) -> Figure:
    if y in categorical_columns:
        x, y = y, x
    
    unique_values = list(df[x].unique())
    fig = Figure()
    for idx, value in enumerate(unique_values):
        fig.add_trace(
            go.Box(**get_boxplot_data(df.filter(df[x] == value), y), 
                   x0=idx, name=value)
        )

    fig.update_layout(
        xaxis={
            'tickmode': 'array',
            'tickvals': list(range(len(unique_values))),
            'ticktext': unique_values
        }
    )
    return fig


def scatterplot(df: pl.DataFrame, x: str, y: str) -> Figure:
    canvas = ds.Canvas()
    agg = canvas.points(df.select(x, y).to_pandas(), x, y)
    agg.values = agg.values.astype(np.float32)
    zeroes_idx = np.where(agg.values == 0)
    agg.values = np.log(agg.values)
    agg.values[zeroes_idx] = np.nan
    agg = tf.spread(agg, name='spread 1px')
    fig = px.imshow(agg, origin='lower', labels={'color':'Log10(count)'})
    fig.update_traces(hoverongaps=False)
    return fig


def lineplot(df: pl.DataFrame, x: str, y: str) -> Figure:
    canvas = ds.Canvas(x_range=(df[x].min(), df[x].max()))
    agg = canvas.line(df.select(x, y).to_pandas(), x, y, antialias=False)
    agg.values = agg.values.astype(np.float32)
    zeroes_idx = np.where(agg.values == 0)
    agg.values = np.log(agg.values)
    agg.values[zeroes_idx] = np.nan
    agg = tf.spread(agg, name='spread 1px')
    fig = px.imshow(agg, origin='lower', labels={'color':'Log10(count)'})
    fig.update_traces(hoverongaps=False)
    return fig

def lineplot_dt(df: pl.DataFrame, x: str, y: str) -> Figure:
    fig = FigureResampler(Figure())
    data = df.select(x, y)
    data = data.sort(by=x, descending=False)
    fig.add_trace(go.Scattergl(showlegend=True), hf_x=data[x].to_pandas(), hf_y=data[y].to_pandas())
    return fig


def make_interactive_plot(x_columns: list[str], y_columns: list[str], title: str | None = None,
                          *, vdf: DataFrame = df) -> Dash:
    """
    Creates interactive plot for given x and y columns
    :param x_columns: x columns to display
    :param y_columns: y columns to display
    :param title: title of the plot
    :param df: dataframe, default is data
    :return: Dash with interactive plot
    """
    assert set(x_columns).issubset(set(vdf.columns)), 'x_columns must be in data.columns'
    assert set(y_columns).issubset(set(vdf.columns)), 'y_columns must be in data.columns'
    assert len(x_columns) > 0
    assert len(y_columns) > 0

    if title is None:
        title = ', '.join(x_columns) + ' to ' + ', '.join(y_columns)

    app = Dash(__name__)
    display_types = PairedDict({
        (FeatureType.NUMERICAL, FeatureType.NUMERICAL): scatterplot,
        (FeatureType.NUMERICAL, FeatureType.CATEGORICAL): boxplot,
        (FeatureType.CATEGORICAL, FeatureType.CATEGORICAL): px.bar,
        (FeatureType.DATETIME, FeatureType.DATETIME): lineplot_dt,
        (FeatureType.DATETIME, FeatureType.NUMERICAL): lineplot,
        (FeatureType.DATETIME, FeatureType.CATEGORICAL): boxplot
    })
    
    app.layout = html.Div([
        html.H5(title),
        html.A('Open in browser', href=f'http://localhost:{DASH_PORT}'),
        html.P('x-axis:'),
        dcc.RadioItems(
            id='x-axis',
            options=x_columns,
            value=x_columns[0],
            inline=True,
        ),
        html.P('y-axis:'),
        dcc.RadioItems(
            id='y-axis',
            options=y_columns,
            value=y_columns[0],
            inline=True,
        ),
        dcc.Graph(id='graph')
    ], style={'height': '1300px', 'overflowY': 'auto'},)

    @app.callback(
        Output('graph', 'figure'),
        Input('x-axis', 'value'),
        Input('y-axis', 'value')
    )
    def generate_chart(x_value: str, y_value: str) -> Figure:
        if x_value == y_value:
           return px.line(x=[0, 1], y=[0, 1])  # to avoid some weird datashader behaviour when plotting x to x
        x_type = get_feature_type(x_value)
        y_type = get_feature_type(y_value)
        if x_type == y_type == FeatureType.CATEGORICAL:
            return px.histogram(vdf, x=x_value, color=y_value, histnorm='probability density')
        return display_types[x_type, y_type](vdf, x=x_value, y=y_value)

    return app


app = make_interactive_plot(df.columns, df.columns, 'Title', vdf=df)
app.run_server(port=DASH_PORT, debug=True)

In [162]:
def get_ts_name(truncation: list[str], agg: list[str], truncation_value: str, agg_value: str) -> str | None:
    if len(truncation) == len(agg) == 1:
        return None
    result = ''
    if len(agg) > 1:
        result += agg_value
    if len(truncation) > 1:
        if len(result) > 0:
            result += ' '
        result += truncation_value
    return result


def timeseries_plot(df: pl.DataFrame, x: str, y: str, truncation: str | list[str] | None = None, 
                    agg: str | list[str] | None = None, log_x: bool = False, 
                    log_y: bool = False) -> Figure:
    assert x in datetime_columns
    assert (truncation is None) == (agg is None)

    fig = FigureResampler(Figure())
    data = df.select(x, y).sort(by=x)

    if isinstance(truncation, str):
        truncation = [truncation]
    if isinstance(agg, str):
        agg = [agg]
    
    if truncation is None:
        fig.add_trace(go.Scattergl(), hf_x=data[x].to_numpy().reshape(-1), hf_y=data[y].to_numpy().reshape(-1))
    else:
        for a in agg:
            for t in truncation:
                name = get_ts_name(truncation, agg, a, t)
                grouped = data.group_by_dynamic(x, every=t).agg(getattr(pl.col(y), a)())
                fig.add_trace(go.Scattergl(name=name), hf_x=grouped[x].to_numpy().reshape(-1), hf_y=grouped[y].to_numpy().reshape(-1))

    
    fig.update_layout(
        title=f'{x} vs {y}',
        title_x=0.45,
        xaxis_title=x,
        yaxis_title=y,
        legend_title='Legend',
    )
    if log_x:
        fig.update_xaxes(type='log')
    if log_y:
        fig.update_yaxes(type='log')

    return fig

In [163]:
timeseries_plot(df, 'date_time', 'is_booking', truncation='1mo', agg=['count', 'sum'], log_y=True)

In [167]:
timeseries_plot(df, 'date_time', 'is_booking', truncation='1mo', agg=['mean'], log_y=True)

In [164]:
timeseries_plot(df, 'date_time', 'co_ci_diff', truncation='1mo', agg='mean')

In [41]:
import plotly.graph_objects as go
import numpy as np


def get_box_data(df: pl.DataFrame, col: str) -> dict[str, list[float]]:
    s = df[col]
    
    res = {
        'q1': np.array([s.quantile(0.25)]),
        'q3': np.array([s.quantile(0.75)]),
        'median': np.array([s.median()]),
        'mean': np.array([s.mean()]),
        'sd': np.array([s.std()]),
        'notched': False
    }
    iqr = res['q3'] - res['q1']
    res['upperfence'] = res['q3'] + 1.5 * iqr
    res['lowerfence'] = res['q1'] - 1.5 * iqr
    return res


fig = Figure()
fig.add_trace(go.Box(**get_box_data(df, 'orig_destination_distance'), name='Box', x0=1))
fig.add_trace(go.Box(**get_box_data(df, 'orig_destination_distance'), name='Box', x0=2))
fig.update_layout(
    xaxis = dict(
        tickmode='array',
        tickvals=[1, 2],
        ticktext=['Oh', 'yeah']
    )
)
fig.show()

In [63]:
import datashader as ds
import datashader.transfer_functions as tf

canvas = ds.Canvas()
agg = canvas.points(df.select('orig_destination_distance', 'srch_children_cnt').to_pandas(), 'orig_destination_distance', 'srch_children_cnt')

In [64]:
visualization_df['srch_children_cnt'].value_counts()

srch_children_cnt
0    29766143
1     4219414
2     3014356
3      491086
4      138308
5       19573
6       15415
7        2699
8        2416
9         883
Name: count, dtype: int64

In [65]:
agg.values = agg.values.astype(np.float32)
zeroes_idx = np.where(agg.values == 0)
agg.values = np.log(agg.values)
agg.values[zeroes_idx] = np.nan


divide by zero encountered in log



In [66]:
import plotly.express as px

agg = tf.spread(agg, name='spread 1px')
fig = px.imshow(agg, origin='lower', labels={'color':'Log10(count)'})
fig.update_traces(hoverongaps=False)
fig.update_layout(coloraxis_colorbar={'title': 'Count', 'tickprefix': '1.e'})
fig.show()