# dōTERRA Essential Oils EDA
Exploratory data analysis of the essential-oil data scraped from the dōTERRA German shop.

In [None]:
# !uv add pandas lxml --active

: 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Global plot styling: seaborn "whitegrid" theme, 10x6 figures at 100 dpi.
sns.set_theme(style="whitegrid")
plt.rcParams.update({
    'figure.figsize': [10, 6],
    'figure.dpi': 100,
})

: 

# Read sitemap

In [None]:
import pandas as pd

# Download the de_de sitemap and parse the XML document into a DataFrame.
# NOTE(review): this needs network access, and pd.read_xml uses the lxml parser
# by default — confirm lxml is installed in the kernel environment.
url = "https://www.doterra.com/sitemaps/de_de_sitemap.xml"
sitemap_df = pd.read_xml(url)
# Preview the first rows to see which columns the parser produced.
sitemap_df.head()


In [None]:
# Show how often each distinct value occurs in the sitemap metadata columns.
metadata_columns = ("lastmod", "changefreq", "priority")
for column_name in metadata_columns:
    column_counts = sitemap_df[column_name].value_counts()
    display(column_counts)

## Load Data

In [None]:
# Load the product table from a CSV file located relative to the working directory.
csv_path = 'doterra_oils_sitemap.csv'
df = pd.read_csv(csv_path)
# Report (rows, columns) and preview the first rows.
print(f"Dataset shape: {df.shape}")
df.head()

## Data Quality and Missing Values

In [None]:
# Per-column count and percentage of missing values.
missing = df.isna().sum()
pct_missing = missing.div(len(df)).mul(100)
missing_df = pd.DataFrame({'Missing': missing, 'Percentage': pct_missing})

# Only list the columns that actually contain gaps.
has_gaps = missing_df['Missing'].gt(0)
print(missing_df.loc[has_gaps])

# Bar chart of the missing-value percentage for every column (including complete ones).
plt.figure(figsize=(12, 6))
sns.barplot(x=missing_df.index, y=missing_df['Percentage'])
plt.xticks(rotation=45, ha='right')
plt.title("Percentage of Missing Values per Column")
plt.ylabel("Percentage (%)")
plt.show()

## Sourcing Locations
Where do these oils come from?

In [None]:
# Frequency of each sourcing location; rows without a location are dropped first.
locations = df['product_howitworks_location'].dropna().value_counts()
if locations.empty:
    print("No location data available.")
else:
    # Horizontal bars: one per location, coloured per location.
    plt.figure(figsize=(10, 8))
    sns.barplot(
        y=locations.index,
        x=locations.values,
        hue=locations.index,
        palette="viridis",
    )
    plt.title("Product Sourcing Locations (Top Values)")
    plt.xlabel("Count")
    plt.show()

## Text Length Analysis
Compare the richness of descriptions across products.

In [None]:
# Character length of each text column; missing text counts as length 0.
length_columns = (
    ('product_description', 'desc_len'),
    ('brand_lifestyle_description', 'lifestyle_len'),
)
for text_column, length_column in length_columns:
    df[length_column] = df[text_column].str.len().fillna(0)

# Side-by-side histograms (with KDE overlay) of the two length distributions.
panels = (
    ('desc_len', 'skyblue', "Distribution of Description Lengths"),
    ('lifestyle_len', 'salmon', "Distribution of Lifestyle Description Lengths"),
)
plt.figure(figsize=(12, 5))
for position, (length_column, bar_color, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    sns.histplot(df[length_column], bins=20, kde=True, color=bar_color)
    plt.title(panel_title)
plt.tight_layout()
plt.show()

## Product Categorization
Classify products as 'Single Oil', 'Touch', or 'Blend/Roll-On' based on the product name.

In [None]:
# Name-based masks; rows with a missing product name match neither pattern.
product_names = df['product_name']
is_touch = product_names.str.contains('Touch', na=False)
is_blend = product_names.str.contains('Roll-On|Mischung', na=False)

# Everything starts as 'Single Oil'. The blend assignment runs last, so a name
# matching both patterns ends up as 'Blend/Roll-On'.
df['type'] = 'Single Oil'
df.loc[is_touch, 'type'] = 'Touch'
df.loc[is_blend, 'type'] = 'Blend/Roll-On'

# Pie chart of the share of each category.
type_counts = df['type'].value_counts()
plt.figure(figsize=(8, 8))
plt.pie(
    type_counts,
    labels=type_counts.index,
    autopct='%1.1f%%',
    colors=sns.color_palette('pastel'),
)
plt.title("Product Categories (by Name)")
plt.show()

In [None]:
# Raw match result (True / False / NaN for a missing sub name). It is printed
# explicitly because a notebook cell only renders its last expression, so a bare
# expression in this position would be silently discarded.
print(df["product_sub_name"].str.contains("Mischung").value_counts(dropna=False))

# Flag rows whose sub name contains "Mischung". Rows with a missing sub name are
# flagged True as well. Passing `na=True` yields a proper boolean column directly,
# instead of filling an object-dtype column afterwards.
df["Mischung"] = df["product_sub_name"].str.contains("Mischung", na=True)
df["Mischung"].value_counts()

# Serialization

In [None]:
# serialize_row comes from the project-local `serialize` module (not shown here).
from serialize import serialize_row

# Call serialize_row once per row (axis=1) and store the result in a new column.
# NOTE(review): a later cell applies .str.len() to this column — confirm that
# serialize_row returns a string for every row.
df["serialized_text"] = df.apply(serialize_row, axis=1)

In [None]:
# Length of each serialized text, computed with the .str accessor.
df['serialized_text_length'] = df['serialized_text'].str.len()

In [None]:
# Keep rows typed 'Single Oil' that have a product name, excluding the
# 'Fraktioniertes Kokosöl' product, then renumber the index from 0.
is_single_oil = df.type == 'Single Oil'
is_not_coconut_oil = df.product_name != 'Fraktioniertes Kokosöl'
has_product_name = df.product_name.notnull()
single_oil_df = df[is_single_oil & is_not_coconut_oil & has_product_name].reset_index(drop=True)

In [None]:
# Preview the first rows of the filtered single-oil table.
single_oil_df.head()

In [None]:
# Sanity check: every single-oil row must have an image URL.
# An empty table is rejected explicitly instead of failing through a NaN mean.
assert not single_oil_df.empty, "single_oil_df is empty; nothing to validate"
# Collect the offending product names so a failure says which rows to fix.
missing_image = single_oil_df.loc[single_oil_df.product_image_url.isnull(), "product_name"]
assert missing_image.empty, f"Products without product_image_url: {missing_image.tolist()}"

In [None]:
# Print the serialized text of the third row (position 2) as a spot check.
sample_text = single_oil_df["serialized_text"].iloc[2]
print(sample_text)

In [None]:
# Histogram of serialized text lengths for the single-oil rows.
single_oil_df["serialized_text_length"].plot(kind="hist")

In [None]:
# Preview the name, blend flag, and serialized text for the first rows.
single_oil_df.loc[:, ["product_name", "Mischung", "serialized_text"]].head()

In [None]:
# List every column that will be written by the export below.
single_oil_df.columns

In [None]:
# Write the filtered single-oil table to CSV without the index column.
# The 'utf-8-sig' codec prepends a UTF-8 byte-order mark — presumably so that
# spreadsheet tools detect the encoding of non-ASCII characters; confirm with the consumer.
single_oil_df.to_csv('single_oil.csv', index=False, encoding='utf-8-sig')