In [2]:
import numpy as np
import pandas as pd
import os

In [115]:
# Clean Users dataframe
# Importing dataframe from "dirty" folder
df = pd.read_parquet(f"dirty/users.parquet")

# Drop information about users with no adults
drop_indexes = df[
    ((df["count_adults"] == 0) & (df["count_people"] > 0))
    | ((df["count_adults"] == 0) & (df["count_pets"] > 0))
].index

df_clean = df.copy()
df_clean.drop(index=drop_indexes, inplace=True)


# 95% of rows with missing info about people
# Comparing df.isna() with df.notna(), similar user_segment and user_nuts1 distribution
# Fill every missing value following column distribution

missing_columns = [
    "count_adults",
    "count_children",
    "count_babies",
    "count_pets",
    "user_nuts1",
]

for column in missing_columns:
    missing_values = df_clean[column].isna()
    s = df_clean[column].value_counts(normalize=True)
    np.random.seed(99)
    df_clean.loc[missing_values, column] = np.random.choice(
        s.index, size=missing_values.sum(), p=s.values
    )

# Recalculating count_people column where np.nan
missing_values = df_clean["count_people"].isna()
df_clean.loc[missing_values, "count_people"] = (
    df_clean.loc[missing_values, "count_adults"]
    + df_clean.loc[missing_values, "count_children"]
    + df_clean.loc[missing_values, "count_babies"]
)

# Exporting dataframe to clean folder
df_clean.to_parquet(f"clean/users.parquet")

In [116]:
# Clean Inventory dataframe
# Importing dataframe from "dirty" folder
df = pd.read_parquet(f"dirty/inventory.parquet")

#Adding np.nan to products with price equal to zero
df["price"].mask(df["price"] == 0, np.nan, inplace=True)
df["compare_at_price"].mask(df["compare_at_price"] == 0, np.nan, inplace=True)
df.to_parquet(f"clean/inventory.parquet")
df.head()

#Ya se de donde has sacado los datos xDDD
df[df["price"]==df["price"].min()]

Unnamed: 0,variant_id,price,compare_at_price,vendor,product_type,tags
586,40099114188932,0.01,,bother,,"[discontinue, trade-swap]"


In [6]:
# Merging orders dataframe

df_orders = pd.read_parquet(f"dirty/orders.parquet")
df_users = pd.read_parquet(f"clean/users.parquet")
df_inventory = pd.read_parquet(f"clean/inventory.parquet")

df_final = df_orders.copy()


# Left join to complete users information (count adults, count children, etc.)
df_final = pd.merge(df_final, df_users, how="left", on=["user_id"])


# Explode list of items whitin orders
df_explode = df_final.explode("ordered_items")
df_explode.rename(columns={"ordered_items": "variant_id"}, inplace=True)

# Merging with inventory dataframe
df_explode = pd.merge(df_explode, df_inventory, how="left", on=["variant_id"])
df_explode.dropna(inplace=True)

# Grouping information added
total_price = df_explode.groupby("id")["price"].agg("sum")
max_price_article = df_explode.groupby("id")["price"].agg("max")
total_compare_at_price = df_explode.groupby("id")["compare_at_price"].agg("sum")
vendors = df_explode.groupby("id")["vendor"].agg(list)
product_types = df_explode.groupby("id")["product_type"].agg(list)
tags = df_explode.groupby("id")["tags"].agg(list)

# Adding grouped information added to final dataframe
df_final = pd.merge(df_final, total_price, how="left", on=["id"])
df_final = pd.merge(df_final, total_compare_at_price, how="left", on=["id"])
df_final = pd.merge(df_final, max_price_article, how="left", on=["id"])

df_final = pd.merge(df_final, vendors, how="left", on=["id"])
df_final = pd.merge(df_final, product_types, how="left", on=["id"])
df_final = pd.merge(df_final, tags, how="left", on=["id"])


df_final.rename(columns={"price_x": "mean_price", "price_y": "max_article_price"}, inplace=True)

# If prices are zero, add np.nan with mask
df_final["mean_price"].mask(df_final["mean_price"] == 0, np.nan, inplace=True)
df_final["compare_at_price"].mask(
    df_final["compare_at_price"] == 0, np.nan, inplace=True
)

# Export to clean folder
df_final.to_parquet(f"clean/orders.parquet")


In [7]:
#Analyzing orders
df_orders = pd.read_parquet(f"clean/orders.parquet")

monthly_sales = df_orders.groupby([df_orders.order_date.dt.month, df_orders.order_date.dt.year])[["mean_price", "count_people"]].agg(["sum", "mean"])
monthly_sales
#Starts selling in April 2020
#Growth during frist year
#Sales remained constant during March 2021 and September 2021
#Fast growth from September 2021 (x2 in October 2021)
#Best sales month: January 2022
#More people buying. Means relatively constants

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_price,mean_price,count_people,count_people
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean
order_date,order_date,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,2021,19288.75,50.100649,1092.0,2.8
1,2022,60955.33,56.492428,3015.0,2.796846
2,2021,18561.88,49.897527,1048.0,2.809651
2,2022,51480.89,58.368356,2506.0,2.81573
3,2021,22293.99,56.440481,1131.0,2.848866
3,2022,16745.7,60.453791,751.0,2.711191
4,2020,0.0,,12.0,4.0
4,2021,22612.99,57.833734,1192.0,3.048593
5,2020,659.44,27.476667,68.0,2.72
5,2021,24778.36,60.731275,1116.0,2.742015
