In [4]:
# Number of rows drawn from the full frame for the profiling sample.
SAMPLE_SIZE = 100_000
# Column name -> dtype name for train.csv. Passed unchanged to pandas' `dtype=`
# and translated through `map_to_polars` for the Polars reader.
# NOTE(review): the unsigned widths presumably match each column's value range —
# confirm against the data before reusing on another file.
DTYPES = {
    'date_time': 'string',
    'site_name': 'uint8',
    'posa_continent': 'uint8',
    'user_location_country': 'uint8',
    'user_location_region': 'uint16',
    'user_location_city': 'uint16',
    'orig_destination_distance': 'float32',
    'user_id': 'uint32',
    'is_mobile': 'bool',
    'is_package': 'bool',
    'channel': 'uint8',
    'srch_ci': 'string',
    'srch_co': 'string',
    'srch_adults_cnt': 'uint8',
    'srch_children_cnt': 'uint8',
    'srch_rm_cnt': 'uint8',
    'srch_destination_id': 'uint16',
    'srch_destination_type_id': 'uint8',
    'is_booking': 'bool',
    'cnt': 'uint16',
    'hotel_continent': 'uint8',
    'hotel_country': 'uint8',
    'hotel_market': 'uint16',
    'hotel_cluster': 'uint8',
}

In [None]:
import numpy as np
import polars as pl


def map_to_polars(dtype: str):
    """
    Translate a dtype name used in ``DTYPES`` into the matching Polars dtype.

    :param dtype: one of 'string', 'uint8', 'uint16', 'uint32', 'float32', 'bool'
    :return: the corresponding Polars dtype
    :raises KeyError: if ``dtype`` is not one of the names listed above
    """
    polars_types = dict(
        string=pl.String,
        uint8=pl.UInt8,
        uint16=pl.UInt16,
        uint32=pl.UInt32,
        float32=pl.Float32,
        # 'bool' is mapped to an 8-bit unsigned integer rather than pl.Boolean.
        # NOTE(review): presumably because the CSV stores flags as 0/1 — confirm.
        bool=pl.UInt8,
    )
    return polars_types[dtype]

# Build the Polars schema from the shared DTYPES mapping, then load the raw CSV.
# NOTE(review): `df` is rebound to a pandas frame by the later pd.read_csv cell.
# NOTE(review): newer Polars releases may expect `schema_overrides` instead of
# `dtypes` — confirm against the installed version.
dtypes = {column: map_to_polars(type_name) for column, type_name in DTYPES.items()}
df = pl.read_csv(
    '../../data/raw/train.csv',
    dtypes=dtypes,
)

In [5]:
import pandas as pd

# Load the full training set, forcing the explicit column dtypes from DTYPES.
df = pd.read_csv(
    '../../data/raw/train.csv',
    dtype=DTYPES,
    low_memory=True,
)

In [6]:
# Preview the first ten rows of the loaded frame.
df.head(n=10)

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.26416,12,False,True,...,0,1,8250,1,False,3,2,50,628,1
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.26416,12,False,True,...,0,1,8250,1,True,1,2,50,628,1
2,2014-08-11 08:24:33,2,3,66,348,48862,2234.26416,12,False,False,...,0,1,8250,1,False,1,2,50,628,1
3,2014-08-09 18:05:16,2,3,66,442,35390,913.193176,93,False,False,...,0,1,14984,1,False,1,2,50,1457,80
4,2014-08-09 18:08:18,2,3,66,442,35390,913.625916,93,False,False,...,0,1,14984,1,False,1,2,50,1457,21
5,2014-08-09 18:13:12,2,3,66,442,35390,911.514221,93,False,False,...,0,1,14984,1,False,1,2,50,1457,92
6,2014-07-16 09:42:23,2,3,66,189,10067,,501,False,False,...,0,1,8267,1,False,2,2,50,675,41
7,2014-07-16 09:45:48,2,3,66,189,10067,,501,False,True,...,0,1,8267,1,False,1,2,50,675,41
8,2014-07-16 09:52:11,2,3,66,189,10067,,501,False,False,...,0,1,8267,1,False,1,2,50,675,69
9,2014-07-16 09:55:24,2,3,66,189,10067,,501,False,False,...,0,1,8267,1,False,1,2,50,675,70


In [7]:
# Draw the profiling sample with a fixed seed so the report is reproducible
# across kernel restarts. The size is capped at the number of rows because
# sampling without replacement cannot return more rows than the frame holds.
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)

In [None]:
from ydata_profiling import ProfileReport

# Build the profile from the sample (`df_sample`), not from the full frame.
profile_report = ProfileReport(
    df_sample,
    title='Data profile report',
    explorative=True,
)

In [None]:
# Write the generated profile to an HTML file under ../reports.
profile_report.to_file(
    '../reports/report.html',
)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
from plotly.express import bar

# Share of missing values per column, in percent, largest first.
missing_pct = (df.isnull().mean() * 100).sort_values(ascending=False)
# A Series is plotted in Plotly Express wide-form mode, where the axis labels
# are keyed by 'value' and 'index'; 'x'/'y' keys would be silently ignored.
bar(missing_pct, title='Missing values (in %, to 100)', orientation='h', labels={
    'value': 'Missing values (to 100%)',
    'index': 'Column'
})

In [8]:
# Split the columns by dtype name: 'bool' and 'string' columns are treated as
# categorical, every other dtype as numerical. Column order is preserved.
numerical_columns = [
    name for name, kind in df.dtypes.items()
    if str(kind) not in ('bool', 'string')
]
categorical_columns = [
    name for name, kind in df.dtypes.items()
    if str(kind) in ('bool', 'string')
]

In [9]:
# Small sample used by the interactive plots. The seed is fixed so the charts
# are reproducible, and the size is capped at the number of available rows.
visualization_df = df.sample(n=min(1000, len(df)), random_state=42)

In [11]:
from enum import Enum
from functools import partial

from dash import Dash, dcc, html, Input, Output
from pandas import DataFrame
from plotly.graph_objects import Figure

import plotly.express as px

# Port the Dash server listens on; also used to build the "Open in browser" link.
DASH_PORT = 8050
# Height passed to every Plotly Express figure created by the chart callback.
FIGURE_HEIGHT = 800


class FeatureType(Enum):
    """Kind of a dataframe column, used to choose which chart type to draw."""
    NUMERICAL = 'numerical'
    CATEGORICAL = 'categorical'


def get_feature_type(feature: str) -> FeatureType:
    """
    Classify a column as numerical or categorical.

    :param feature: column name to classify
    :return: FeatureType.NUMERICAL when the name is listed in ``numerical_columns``,
        FeatureType.CATEGORICAL otherwise
    """
    if feature in numerical_columns:
        return FeatureType.NUMERICAL
    return FeatureType.CATEGORICAL


def make_interactive_plot(x_columns: list[str], y_columns: list[str], title: str | None = None,
                          *, vdf: DataFrame | None = None) -> Dash:
    """
    Creates a Dash app with an interactive plot for the given x and y columns.

    The chart type follows the feature types of the two selected columns:
    numerical/numerical draws a scatter plot, a numerical/categorical mix draws
    a box plot, and categorical/categorical draws a histogram of the x column
    coloured by the y column.

    :param x_columns: columns selectable for the x-axis
    :param y_columns: columns selectable for the y-axis
    :param title: title of the plot, defaults to "<x columns> to <y columns>"
    :param vdf: dataframe to plot, defaults to the notebook-level ``df``
    :return: Dash app with the interactive plot (the server is not started)
    :raises AssertionError: if a column list is empty or names a column that is
        not in ``vdf``
    """
    if vdf is None:
        # Resolved at call time: a `vdf=df` default would be bound once at
        # definition time and keep pointing at whatever `df` was back then.
        vdf = df
    # Accept any iterable of names (e.g. a pandas Index) but hand plain lists
    # to the Dash components.
    x_columns = list(x_columns)
    y_columns = list(y_columns)

    assert set(x_columns).issubset(set(vdf.columns)), 'x_columns must be in data.columns'
    assert set(y_columns).issubset(set(vdf.columns)), 'y_columns must be in data.columns'
    assert len(x_columns) > 0, 'x_columns must not be empty'
    assert len(y_columns) > 0, 'y_columns must not be empty'

    if title is None:
        title = ', '.join(x_columns) + ' to ' + ', '.join(y_columns)

    app = Dash(__name__)
    # Chart factory per (x type, y type). The categorical/categorical pair is
    # absent on purpose: the callback draws it as a histogram with different
    # keyword arguments (color= instead of y=).
    display_types = {
        (FeatureType.NUMERICAL, FeatureType.NUMERICAL): partial(px.scatter, render_mode='webgl'),
        (FeatureType.NUMERICAL, FeatureType.CATEGORICAL): px.box,
        (FeatureType.CATEGORICAL, FeatureType.NUMERICAL): px.box,
    }
    app.layout = html.Div([
        html.H5(title),
        html.A('Open in browser', href=f'http://localhost:{DASH_PORT}'),
        html.P('x-axis:'),
        dcc.RadioItems(
            id='x-axis',
            options=x_columns,
            value=x_columns[0],
            inline=True,
        ),
        html.P('y-axis:'),
        dcc.RadioItems(
            id='y-axis',
            options=y_columns,
            value=y_columns[0],
            inline=True,
        ),
        dcc.Graph(id='graph')
    ], style={"height": "1300px", "overflowY": "auto"},)

    @app.callback(
        Output('graph', 'figure'),
        Input('x-axis', 'value'),
        Input('y-axis', 'value')
    )
    def generate_chart(x_value: str, y_value: str) -> Figure:
        """Redraw the figure whenever one of the axis selections changes."""
        x_type = get_feature_type(x_value)
        y_type = get_feature_type(y_value)
        if x_type == y_type == FeatureType.CATEGORICAL:
            return px.histogram(vdf, x=x_value, color=y_value, height=FIGURE_HEIGHT, histnorm='probability density')
        return display_types[x_type, y_type](vdf, x=x_value, y=y_value, height=FIGURE_HEIGHT)

    return app


# Pass plain lists of column names, matching the function's list[str] annotation.
plot_columns = list(df.columns)
app = make_interactive_plot(plot_columns, plot_columns, 'Title', vdf=visualization_df)
# `run_server` is deprecated in Dash 2 and removed in Dash 3 in favour of `run`;
# fall back to it only on older releases that do not provide `run` yet.
start_server = app.run if hasattr(app, 'run') else app.run_server
start_server(port=DASH_PORT, debug=True)

NameError: name 'visualization_df' is not defined