# Comparison with Kaggle report

In [None]:
import os
import sys
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Union

import holoviews as hv
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from holoviews import opts as hv_opts
from IPython.core.display import display, HTML
from IPython.lib import deepreload

import kagglelib as kglib

# https://stackoverflow.com/questions/8391411/how-to-block-calls-to-print
class disabled_print:
    """Context manager that silences ``print`` inside a ``with`` block.

    On entry the current ``sys.stdout`` is remembered and replaced by a write
    handle on ``os.devnull``. On exit the remembered stream is put back and
    the devnull handle is closed. Exceptions raised in the body propagate.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own reference to the sink so __exit__ closes exactly this
        # handle, even if the body rebinds sys.stdout to something else.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so sys.stdout never points at a closed file.
        sys.stdout = self._original_stdout
        self._devnull.close()

# https://stackoverflow.com/questions/28101895/reloading-packages-and-their-submodules-recursively-in-python
def reload_kglib() -> None:
    """Deep-reload ``kglib`` with stdout silenced.

    Every entry of ``sys.modules`` whose name does not contain
    ``"kagglelib"`` is passed to ``deepreload.reload`` via ``exclude``.
    """
    with disabled_print():
        untouched = set()
        for module_name in sys.modules:
            if "kagglelib" not in module_name:
                untouched.add(module_name)
        deepreload.reload(kglib, exclude=untouched)

# Activate the HoloViews plotting backends; the logo banner is suppressed.
hv.extension("bokeh", "plotly", "matplotlib", logo=False)

# Wider numpy output lines and no cap on the number of displayed DataFrame columns.
np.set_printoptions(linewidth=200)
pd.options.display.max_columns = None

# Inject CSS so that elements with class "container" use the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Reload the library first so the cell picks up its latest code.
reload_kglib()

# Frame returned by load_udf, and its "Data Scientist" role subset.
udf = kglib.load_udf()
uds = kglib.load_role_df(udf, role="Data Scientist")

# The same pair, built from the output of filter_df.
fdf = kglib.filter_df(udf, print_filters=True)
fds = kglib.load_role_df(fdf, role="Data Scientist")

# Pass all four frames through keep_demo_cols.
udf, uds, fdf, fds = (kglib.keep_demo_cols(frame) for frame in (udf, uds, fdf, fds))

In [None]:
# The two frames that every comparison cell below reads from.
dataset1 = uds
dataset2 = fds

## Gender

Page 4

In [None]:
# Compare the "gender" column of dataset1 and dataset2, in a fixed category order.
column = "gender"
df = kglib.get_stacked_value_count_comparison(
    sr1=dataset1[column],
    sr2=dataset2[column],
    # as_percentage=True is requested, so the label names a percentage, not a
    # count (same label as the Country and Education cells).
    stack_label="participants (%)",
    as_percentage=True,
    order=["Man", "Woman", "Nonbinary", "Prefer not to say", "Prefer to self-describe"],
)

kglib.sns_plot_value_count_comparison(
    df=df,
    orientation="h",
    order_by_labels=False,
    legend_location="center right",
    title="Gender identity of data scientists",
)

## Age

Page 5

In [None]:
# Compare the "age" column of dataset1 and dataset2.
column = "age"
df = kglib.get_stacked_value_count_comparison(
    sr1=dataset1[column],
    sr2=dataset2[column],
    # as_percentage=True is requested, so the label names a percentage, not a
    # count (same label as the Country and Education cells).
    stack_label="participants (%)",
    as_percentage=True,
)
kglib.sns_plot_value_count_comparison(df, orientation="h")

## Country

Page 6

In [None]:
# Compare the "country" column of dataset1 and dataset2.
column = "country"
df = kglib.get_value_count_comparison(
    as_percentage=True,
    sr1=dataset1[column],
    sr2=dataset2[column],
)
# Keep only rows whose "original" value exceeds 1.4, sorted by that column.
df = df.loc[df["original"] > 1.4].sort_values(by="original")
df = kglib.stack_value_count_comparison(df, "participants (%)")
kglib.sns_plot_value_count_comparison(
    df,
    orientation="v",
    order_by_labels=False,
    x_ticklabels_rotation=40,
)

## Education

Page 7

In [None]:
reload_kglib()

# Compare the "education" column of dataset1 and dataset2, using an explicit
# category order instead of whatever order the data yields.
column = "education"
order = [
    'No formal education past high school',
    'Some college/university study without earning a bachelor’s degree',
    'Bachelor’s degree',
    'Master’s degree',
    'Doctoral degree',
    'Professional degree',
    'I prefer not to answer',
]

df = kglib.get_stacked_value_count_comparison(
    # Index through `column`, as every other comparison cell does, so the
    # variable assigned above is the single place that names the column.
    sr1=dataset1[column],
    sr2=dataset2[column],
    stack_label="participants (%)",
    as_percentage=True,
    order=order,
)
kglib.sns_plot_value_count_comparison(df, orientation="h", order_by_labels=True)

## Programming Experience

Partially covered on page 9

In [None]:
# Compare the "code_exp" column of dataset1 and dataset2.
column = "code_exp"
df = kglib.get_stacked_value_count_comparison(
    as_percentage=True,
    stack_label="participants",
    sr1=dataset1[column],
    sr2=dataset2[column],
)
kglib.sns_plot_value_count_comparison(
    df,
    title="Coding experience global dataset",
    orientation="h",
    order_by_labels=True,
)

## ML Experience

Partially covered on page 10

In [None]:
# Compare the "ml_exp" column of dataset1 and dataset2.
column = "ml_exp"
df = kglib.get_stacked_value_count_comparison(
    as_percentage=True,
    stack_label="participants",
    sr1=dataset1[column],
    sr2=dataset2[column],
)
kglib.sns_plot_value_count_comparison(
    df,
    title="ML Experience Global dataset",
    orientation="h",
    order_by_labels=True,
)

## Salary Global

Page 11

In [None]:
# Compare the "salary" column of dataset1 and dataset2 across all countries.
column = "salary"
df = kglib.get_stacked_value_count_comparison(
    as_percentage=True,
    stack_label="participants",
    sr1=dataset1[column],
    sr2=dataset2[column],
)
kglib.sns_plot_value_count_comparison(df, orientation="h", order_by_labels=True)

## Salary USA

Page 12

In [None]:
# Compare the "salary" column, restricted to rows whose country equals "USA".
column = "salary"
df = kglib.get_stacked_value_count_comparison(
    as_percentage=True,
    stack_label="participants",
    sr1=dataset1.loc[dataset1.country == "USA", column],
    sr2=dataset2.loc[dataset2.country == "USA", column],
)
kglib.sns_plot_value_count_comparison(df, orientation="h", order_by_labels=True)

## Salary India

Page 13

In [None]:
# Compare the "salary" column, restricted to rows whose country equals "India".
column = "salary"
df = kglib.get_stacked_value_count_comparison(
    as_percentage=True,
    stack_label="participants",
    sr1=dataset1.loc[dataset1.country == "India", column],
    sr2=dataset2.loc[dataset2.country == "India", column],
)
kglib.sns_plot_value_count_comparison(df, orientation="h", order_by_labels=True)