In [15]:
from pandas import read_csv

# Load the dataset used by every cell below. The path is relative, so it is
# resolved against the working directory the notebook kernel runs in.
data = read_csv('../../data/interim/data.csv')

In [16]:
# Name of the identifier column in `data`.
# NOTE(review): presumably one unique value per employee - not verified here,
# and the constant is not referenced by the cells visible in this notebook.
ID_COLUMN = 'EmployeeID'

In [17]:
# Summary statistics of the dataset; the cell's last expression is rendered
# as its output.
data.describe()

Unnamed: 0,EmployeeID,Age,DistanceFromHome,EmployeeCount,JobLevel,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,MeanWorkingTime,MedianWorkingTime,SkewWorkingTime
count,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4391.0,4410.0,4410.0,4410.0,4401.0,4410.0,4410.0,4410.0,4410.0,4409.0,4409.0,4409.0
mean,2205.5,36.92381,9.192517,1.0,2.063946,65029.312925,2.69483,15.209524,8.0,0.793878,11.279936,2.79932,7.008163,2.187755,4.123129,7.700866,7.700737,0.000892
std,1273.201673,9.133301,8.105026,0.0,1.106689,47068.888559,2.498887,3.659108,0.0,0.851883,7.782222,1.288978,6.125135,3.221699,3.567327,1.340361,1.340485,0.156875
min,1.0,18.0,1.0,1.0,1.0,10090.0,0.0,11.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,5.950504,5.9275,-0.546444
25%,1103.25,30.0,2.0,1.0,1.0,29110.0,1.0,12.0,8.0,0.0,6.0,2.0,3.0,0.0,2.0,6.673017,6.667083,-0.107794
50%,2205.5,36.0,7.0,1.0,2.0,49190.0,2.0,14.0,8.0,1.0,10.0,3.0,5.0,1.0,3.0,7.406965,7.408611,0.000228
75%,3307.75,43.0,14.0,1.0,3.0,83800.0,4.0,18.0,8.0,1.0,15.0,3.0,9.0,3.0,7.0,8.369604,8.365278,0.104997
max,4410.0,60.0,29.0,1.0,5.0,199990.0,9.0,25.0,8.0,3.0,40.0,6.0,40.0,15.0,17.0,11.03096,11.051667,0.60868


In [18]:
from plotly.express import bar

# Share of missing values per column, in percent, sorted ascending.
missing_percent = (data.isnull().sum().sort_values() * 100) / len(data)

# x and y are passed explicitly instead of handing the Series to `bar` as
# wide-form data: in wide-form mode plotly express names the axes
# 'index'/'value', so a `labels` mapping keyed by 'x'/'y' is never applied.
# With explicit array-likes the axes are named 'x' and 'y' and the labels
# below take effect.
bar(
    x=missing_percent.values,
    y=missing_percent.index,
    title='Missing values (in %, to 100)',
    orientation='h',
    labels={
        'x': 'Missing values (to 100%)',
        'y': 'Column'
    },
)

In [19]:
from ydata_profiling import ProfileReport

# Build the ydata-profiling report object for the whole dataset; it is
# written to disk in the next cell.
profile_report = ProfileReport(data, title='Data profile report')

In [20]:
# Export the profile report as HTML. The relative file name means it is
# written into the kernel's working directory.
profile_report.to_file('report.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
# Split the column names by dtype: `object` columns are treated as
# categorical, every other dtype as numerical.
numerical_columns = [name for name, dtype in data.dtypes.items() if dtype != object]
categorical_columns = [name for name, dtype in data.dtypes.items() if dtype == object]

In [22]:
from enum import Enum

from dash import Dash, dcc, html, Input, Output
from pandas import DataFrame
from plotly.graph_objects import Figure

import plotly.express as px

# Port the Dash server is started on; also used to build the
# "Open in browser" link inside the app layout.
DASH_PORT = 8050
# Height passed to every plotly express figure created by the callback.
FIGURE_HEIGHT = 800


class FeatureType(Enum):
    """
    Kind of a dataframe column; the pair of kinds on the x and y axes decides
    which chart type the interactive plot draws.
    """
    NUMERICAL = 'numerical'
    CATEGORICAL = 'categorical'


def get_feature_type(feature: str) -> FeatureType:
    """
    Classifies a column name as numerical or categorical
    :param feature: column name to classify
    :return: FeatureType.NUMERICAL if the name is listed in numerical_columns,
             FeatureType.CATEGORICAL otherwise
    """
    if feature in numerical_columns:
        return FeatureType.NUMERICAL
    # Every name that is not listed as numerical (unknown names included)
    # is reported as categorical.
    return FeatureType.CATEGORICAL


def make_interactive_plot(x_columns: list[str], y_columns: list[str], title: str | None = None,
                          *, df: DataFrame = data, heatmap_scale: str = 'viridis') -> Dash:
    """
    Creates a Dash app with an interactive plot for the given x and y columns.

    The user picks one x column and one y column with radio buttons; the chart
    type follows from the dtypes of the two selected columns in ``df``:
    numerical/numerical -> scatter, numerical/categorical (either order) -> box,
    categorical/categorical -> histogram of x coloured by y
    (``histnorm='probability density'``).

    :param x_columns: columns selectable for the x-axis; must be non-empty and present in df
    :param y_columns: columns selectable for the y-axis; must be non-empty and present in df
    :param title: title of the plot, default is "<x columns> to <y columns>"
    :param df: dataframe to plot, default is data
    :param heatmap_scale: currently unused by this function; kept so existing callers keep working
    :return: Dash app with the interactive plot (the server is not started here)
    :raises AssertionError: if a column list is empty or names a column missing from df
    """
    # Accept any iterable of names (e.g. a pandas Index) and hand plain lists to Dash.
    x_columns = list(x_columns)
    y_columns = list(y_columns)

    assert set(x_columns).issubset(set(df.columns)), 'x_columns must be in df.columns'
    assert set(y_columns).issubset(set(df.columns)), 'y_columns must be in df.columns'
    assert len(x_columns) > 0
    assert len(y_columns) > 0

    if title is None:
        title = ', '.join(x_columns) + ' to ' + ', '.join(y_columns)

    def feature_type_of(column: str) -> FeatureType:
        # Same rule as the notebook-level column split (object dtype means
        # categorical), but evaluated on `df` so that a dataframe other than
        # the global `data` is classified correctly.
        return FeatureType.CATEGORICAL if df[column].dtype == object else FeatureType.NUMERICAL

    app = Dash(__name__)
    # Chart factory for every type pair except categorical/categorical, which
    # the callback handles separately with a histogram.
    display_types = {
        (FeatureType.NUMERICAL, FeatureType.NUMERICAL): px.scatter,
        (FeatureType.NUMERICAL, FeatureType.CATEGORICAL): px.box,
        (FeatureType.CATEGORICAL, FeatureType.NUMERICAL): px.box,
    }
    app.layout = html.Div([
        html.H5(title),
        html.A('Open in browser', href=f'http://localhost:{DASH_PORT}'),
        html.P('x-axis:'),
        dcc.RadioItems(
            id='x-axis',
            options=x_columns,
            value=x_columns[0],
            inline=True,
        ),
        html.P('y-axis:'),
        dcc.RadioItems(
            id='y-axis',
            options=y_columns,
            value=y_columns[0],
            inline=True,
        ),
        dcc.Graph(id='graph')
    ], style={"height": "1300px", "overflowY": "auto"},)

    @app.callback(
        Output('graph', 'figure'),
        Input('x-axis', 'value'),
        Input('y-axis', 'value')
    )
    def generate_chart(x_value: str, y_value: str) -> Figure:
        """Redraws the figure whenever either axis selection changes."""
        x_type = feature_type_of(x_value)
        y_type = feature_type_of(y_value)
        if x_type == y_type == FeatureType.CATEGORICAL:
            # Two categorical columns: distribution of x, split by y through colour.
            return px.histogram(df, x=x_value, color=y_value, height=FIGURE_HEIGHT,
                                histnorm='probability density')
        # Plot the dataframe that was passed in, not the notebook-global one.
        return display_types[x_type, y_type](df, x=x_value, y=y_value, height=FIGURE_HEIGHT)

    return app


# Build the explorer with every column selectable on both axes, then serve it
# on DASH_PORT with Dash's debug mode switched on.
app = make_interactive_plot(
    x_columns=data.columns,
    y_columns=data.columns,
    title='Title',
    heatmap_scale='viridis',
)
app.run_server(debug=True, port=DASH_PORT)