# Data science

In [None]:
# Load the Kedro IPython extension.
# NOTE(review): `catalog`, used in later cells, is not defined anywhere in this
# notebook, so it is presumably injected by this extension — confirm.
%load_ext kedro.ipython

In [None]:
import seaborn.objects as so
import seaborn as sns
import matplotlib as mpl
from wordcloud import WordCloud, STOPWORDS
import polars as pl

# Apply seaborn's default theme. `sns.set()` is only an alias of
# `sns.set_theme()`, which is the preferred interface, so call it directly.
sns.set_theme()

In [None]:
# Fetch the "openrepair-0_3" dataset from the catalog and preview the
# first five rows.
df = catalog.load("openrepair-0_3")
df.head(5)

In [None]:
# Number of rows in the dataset.
df.height

In [None]:
# Five most frequent values of "country", most common first.
# NOTE(review): the sort key "counts" is the column name produced by
# `value_counts()` in the polars release this notebook appears to target;
# newer releases may name it "count" — confirm against the pinned version.
df.get_column("country").value_counts().sort(by="counts", descending=True).head(5)

In [None]:
# Ten most frequent product categories, most common first.
(
    df.get_column("product_category")
    .value_counts()
    .sort("counts", descending=True)
    .head(n=10)
)

In [None]:
# Frequency of each value in "repair_barrier_if_end_of_life" (unsorted).
df.get_column("repair_barrier_if_end_of_life").value_counts()

In [None]:
# Row counts per one-year window of "event_date", split by "repair_status".
# The frame is sorted on "event_date" before the dynamic grouping and the
# aggregated result is sorted on it again.
repairs_by_year = (
    df.sort(by="event_date")
    .group_by_dynamic(index_column="event_date", every="1y", by="repair_status")
    .agg(pl.count())
    .sort(by="event_date")
)
repairs_by_year.head(5)

In [None]:
# Ignore every warning from here on (presumably to keep the cell outputs
# below uncluttered — note this also hides deprecation notices).
import warnings

warnings.simplefilter(action="ignore")

In [None]:
# Stacked bar chart of the yearly counts, coloured by repair status.
# A "year" column is derived from "event_date" and the polars frame is
# converted to pandas before being handed to seaborn.
so.Plot(
    data=repairs_by_year.with_columns(year=pl.col("event_date").dt.year()).to_pandas(),
    x="year",
    y="count",
    color="repair_status",
).add(so.Bar(), so.Stack())

In [None]:
# Row counts per one-year window of "event_date", split by the recorded
# end-of-life repair barrier. The result has three columns: "year",
# "barrier" and "count".
barriers_by_year = (
    df
    .sort("event_date")
    # Use `group_by_dynamic`, the same spelling as the `repairs_by_year` cell,
    # rather than the deprecated `groupby_dynamic` alias.
    .group_by_dynamic(
        "event_date",
        every="1y",
        by="repair_barrier_if_end_of_life",
    )
    .agg(pl.count())
    .select(
        pl.col("event_date").dt.year().alias("year"),
        # Rows with no recorded barrier are labelled "Unknown" so they form
        # their own group instead of staying null.
        pl.col("repair_barrier_if_end_of_life").fill_null("Unknown").alias("barrier"),
        pl.col("count"),
    )
    .sort("year")
)
barriers_by_year.head()

In [None]:
# Wide view of the barrier counts: one row per year, one column per barrier,
# with the counts summed in each cell.
barriers_by_year.pivot(
    values="count",
    index="year",
    columns="barrier",
    aggregate_function="sum",
)

In [None]:
# Stacked bar chart of yearly barrier counts, leaving out the "Unknown" rows.
# The polars frame is converted to pandas before plotting, as in the
# repairs-by-year chart, so the plot does not depend on seaborn accepting
# polars frames directly.
(
    so.Plot(
        data=barriers_by_year.filter(pl.col("barrier") != "Unknown").to_pandas(),
        x="year",
        y="count",
        color="barrier",
    ).add(so.Bar(), so.Stack())
)

In [None]:
# Stacked bar chart of each barrier's share of its year's total count.
# The yearly totals are computed before the "Unknown" rows are filtered out,
# so the plotted shares are fractions of all rows in that year, "Unknown"
# included. The frame is converted to pandas before plotting, as in the
# repairs-by-year chart.
(
    so.Plot(
        data=(
            barriers_by_year.with_columns(
                pl.col("count").sum().over("year").alias("year_totals"),
            )
            .with_columns((pl.col("count") / pl.col("year_totals")).alias("pct_count"))
            .filter(pl.col("barrier") != "Unknown")
            .to_pandas()
        ),
        x="year",
        y="pct_count",
        color="barrier",
    ).add(so.Bar(), so.Stack())
)

In [None]:
# Keep only the rows whose repair status is exactly "End of life" and
# preview the first five.
eol = df.filter(pl.col("repair_status").eq("End of life"))
eol.head(5)

In [None]:
# Non-null "problem" values of the end-of-life rows whose country is "GBR",
# collected into a plain Python list; show the first five.
problems_gbr = (
    eol.filter(pl.col("country") == "GBR")
    .get_column("problem")
    .drop_nulls()
    .to_list()
)
problems_gbr[:5]

In [None]:
# Word cloud built from the GBR end-of-life problem descriptions.
#
# `matplotlib.pyplot` is imported explicitly here: `import matplotlib as mpl`
# alone does not load the pyplot submodule, so `mpl.pyplot` only resolves if
# some other library happened to import it first.
import matplotlib.pyplot as plt

wordcloud = WordCloud(
    background_color="white",
    stopwords=set(STOPWORDS),
    collocation_threshold=1,
    colormap=plt.cm.Dark2,
    scale=3,
    random_state=42,  # fixed seed passed to WordCloud
).generate(" ".join(problems_gbr))  # all descriptions joined into one text

# Show the rendered cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 8))
ax.imshow(wordcloud)
ax.axis("off")