# 2. Cluster Analysis

Using the clusters created in program 1, identify which clusters best fit the data and areas requiring cleanup.

In [1]:
# Initialize necessary packages
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Scroll through larger dataframes (more necessary for the HDBSCAN output
from IPython.display import display, HTML

## Load Clustered Data

Begin by loading in the Excel files containing the clustered data. Save them as pandas data frames.

In [2]:
# Load KMeans Clustered Data
kmeans_19 = pd.read_excel('/Volumes/Samsung1TB/programming/data-science/baby-product-dashboard/clustered-data/kmeans.xlsx',
                          sheet_name = "cluster_42")

kmeans_42 = pd.read_excel('/Volumes/Samsung1TB/programming/data-science/baby-product-dashboard/clustered-data/kmeans.xlsx',
                          sheet_name = "cluster_42")

# Load HDBSCAN
hdb_scan = pd.read_excel('/Volumes/Samsung1TB/programming/data-science/baby-product-dashboard/clustered-data/hdbscan.xlsx',
                         sheet_name = 'umap-cosine-hdbscan')

## Identify Most Commonly Used Words by Cluster

Removing stopwords, identify the most commonly used words by each cluster. This will provide us with a better sense of the categories the clusters grouped the baby products into.

Besides just word frequency, we can also use other methods to potentially pull more meaningful words from each cluster. Because of the pre-processing in program 1 as well as teh removal of stopwords here, this may not be necessary. However, we will use this method and analyze how it performs compared to raw word frequency.

In [3]:
# Define functions to perform the above actions

# The following function removes stopwords from the analysis (also does additional processing on text, but this
# was already taken care of in the first program, so it shouldn't do anything)
def preprocess(text):
    # Confirm text is a string
    if not isinstance(text, str):
        return []
    # Lowercase and remove non-alphabetic characters
    words = re.findall(r'\b[a-z]{2,}\b', text.lower())
    # Remove stop words
    return [word for word in words if word not in ENGLISH_STOP_WORDS]

# Define function for counting the most frequently used words
def get_top_words(df, cluster_col, tokens_col, top_n=3):
    top_words = {}
    for cluster in df[cluster_col].unique():
        words = [word for tokens in df[df[cluster_col] == cluster][tokens_col] for word in tokens]
        most_common = [word for word, _ in Counter(words).most_common(top_n)]
        top_words[cluster] = most_common
    return top_words


In [4]:
# Use TF-IDF to identify more meaningful frequently used words
def get_tfidf_top_words(df, cluster_col, tokens_col, top_n=3):
    tfidf_top_words = []

    for cluster in sorted(df[cluster_col].dropna().unique()):
        # Filter rows for this cluster
        cluster_df = df[df[cluster_col] == cluster]

        # Reconstruct raw text from tokens
        cluster_docs = cluster_df[tokens_col].apply(lambda tokens: ' '.join(tokens))

        # Skip empty clusters
        if cluster_docs.empty:
            continue

        # Vectorize using TF-IDF
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(cluster_docs)
        feature_names = vectorizer.get_feature_names_out()

        # Compute average TF-IDF per word in this cluster
        tfidf_means = tfidf_matrix.mean(axis=0).A1  # convert sparse matrix to array
        top_indices = tfidf_means.argsort()[::-1][:top_n]
        top_words = [feature_names[i] for i in top_indices]

        tfidf_top_words.append({
            'cluster': cluster,
            'top_words_tfidf': top_words
        })

    return pd.DataFrame(tfidf_top_words)



### K-Means: 19

Identify the most frequently used words for the K-Means = 19 clusters dataset.

In [6]:
# Pre-process text
kmeans_19['tokens'] = kmeans_19['text'].apply(preprocess)

# Identify the most frequently used words from each cluster
top_words_dict = get_top_words(kmeans_19, 'cluster', 'tokens', top_n=3)

# Map top words back to each row
kmeans_19['top_words'] = kmeans_19['cluster'].map(top_words_dict).apply(lambda words: ', '.join(words))

In [7]:
# Print out the top 3 words for each cluster

# Keep only necessary columns
df_cluster_words_19 = kmeans_19[['cluster', 'top_words']].copy()

# Convert top words to a tuple and only get unique instances of clusters and top words
df_cluster_words_19['top_words_tuple'] = df_cluster_words_19['top_words'].apply(lambda x: tuple(x))
df_cluster_words_19 = df_cluster_words_19.drop_duplicates(subset=['cluster', 'top_words_tuple'])

# Drop the tuple column
df_cluster_words_19 = df_cluster_words_19.drop(columns='top_words_tuple')

# Sort by cluster
df_cluster_words_19 = df_cluster_words_19.sort_values(by='cluster').reset_index(drop=True)

df_cluster_words_19.head(100)

Unnamed: 0,cluster,top_words
0,0,"cloth, diapers, diapering"
1,1,"storage, furniture, nursery"
2,2,"blankets, swaddling, nursery"
3,3,"feeding, solid, cups"
4,4,"pillows, nursery, maternity"
5,5,"wall, stickers, nursery"
6,6,"care, hair, accessories"
7,7,"gates, gate, safety"
8,8,"strollers, accessories, standard"
9,9,"care, health, kids"


Now us TF-IDF and compare results.

In [8]:
# Identify the most frequently used words from each cluster
top_words_dict_tfidf = get_tfidf_top_words(kmeans_19, 'cluster', 'tokens', top_n=3)

# SMerge tfidf top words back onto your original dataframe by cluster
kmeans_19 = kmeans_19.merge(top_words_dict_tfidf, on = 'cluster', how = 'left')

# Svae top_words_tfidf as a readable string
kmeans_19['top_words_tfidf'] = kmeans_19['top_words_tfidf'].apply(
    lambda words: ', '.join(words) if isinstance(words, list) else ''
)

In [9]:
# Print out the top 3 words for each cluster

# Keep only necessary columns
df_cluster_words_19_tfidf = kmeans_19[['cluster', 'top_words_tfidf']].copy()

# Convert top words to a tuple and only get unique instances of clusters and top words
df_cluster_words_19_tfidf['top_words_tuple'] = df_cluster_words_19_tfidf['top_words_tfidf'].apply(lambda x: tuple(x))
df_cluster_words_19_tfidf = df_cluster_words_19_tfidf.drop_duplicates(subset=['cluster', 'top_words_tuple'])

# Drop the tuple column
df_cluster_words_19_tfidf = df_cluster_words_19_tfidf.drop(columns='top_words_tuple')

# Sort by cluster
df_cluster_words_19_tfidf = df_cluster_words_19_tfidf.sort_values(by='cluster').reset_index(drop=True)

df_cluster_words_19_tfidf.head(100)

Unnamed: 0,cluster,top_words_tfidf
0,0,"diapers, cloth, covers"
1,1,"storage, organization, furniture"
2,2,"blankets, swaddling, bedding"
3,3,"feeding, cups, solid"
4,4,"pillows, nursery, maternity"
5,5,"nursery, stickers, wall"
6,6,"care, hair, accessories"
7,7,"gates, extensions, stair"
8,8,"strollers, accessories, standard"
9,9,"care, health, thermometers"


### K-Means: 42

Identify the most frequently used words for the K-Means = 42 clusters dataset.

In [10]:
# Pre-process text
kmeans_42['tokens'] = kmeans_42['text'].apply(preprocess)

# Identify the most frequently used words from each cluster
top_words_dict = get_top_words(kmeans_42, 'cluster', 'tokens', top_n=3)

# Map top words back to each row
kmeans_42['top_words'] = kmeans_42['cluster'].map(top_words_dict).apply(lambda words: ', '.join(words))

In [11]:
# Print out the top 3 words for each cluster

# Keep only necessary columns
df_cluster_words_42 = kmeans_42[['cluster', 'top_words']].copy()

# Convert top words to a tuple and only get unique instances of clusters and top words
df_cluster_words_42['top_words_tuple'] = df_cluster_words_42['top_words'].apply(lambda x: tuple(x))
df_cluster_words_42 = df_cluster_words_42.drop_duplicates(subset=['cluster', 'top_words_tuple'])

# Drop the tuple column
df_cluster_words_42 = df_cluster_words_42.drop(columns='top_words_tuple')

# Sort by cluster
df_cluster_words_42 = df_cluster_words_42.sort_values(by='cluster').reset_index(drop=True)

df_cluster_words_42.head(100)

Unnamed: 0,cluster,top_words
0,0,"cloth, diapers, diapering"
1,1,"storage, furniture, nursery"
2,2,"blankets, swaddling, nursery"
3,3,"feeding, solid, cups"
4,4,"pillows, nursery, maternity"
5,5,"wall, stickers, nursery"
6,6,"care, hair, accessories"
7,7,"gates, gate, safety"
8,8,"strollers, accessories, standard"
9,9,"care, health, kids"


Now use TF-IDF to compare results.

In [12]:
# Identify the most frequently used words from each cluster
top_words_dict_tfidf = get_tfidf_top_words(kmeans_42, 'cluster', 'tokens', top_n=3)

# SMerge tfidf top words back onto your original dataframe by cluster
kmeans_42 = kmeans_42.merge(top_words_dict_tfidf, on = 'cluster', how = 'left')

# Svae top_words_tfidf as a readable string
kmeans_42['top_words_tfidf'] = kmeans_42['top_words_tfidf'].apply(
    lambda words: ', '.join(words) if isinstance(words, list) else ''
)

In [13]:
# Print out the top 3 words for each cluster

# Keep only necessary columns
df_cluster_words_42_tfidf = kmeans_42[['cluster', 'top_words_tfidf']].copy()

# Convert top words to a tuple and only get unique instances of clusters and top words
df_cluster_words_42_tfidf['top_words_tuple'] = df_cluster_words_42_tfidf['top_words_tfidf'].apply(lambda x: tuple(x))
df_cluster_words_42_tfidf = df_cluster_words_42_tfidf.drop_duplicates(subset=['cluster', 'top_words_tuple'])

# Drop the tuple column
df_cluster_words_42_tfidf = df_cluster_words_42_tfidf.drop(columns='top_words_tuple')

# Sort by cluster
df_cluster_words_42_tfidf = df_cluster_words_42_tfidf.sort_values(by='cluster').reset_index(drop=True)

df_cluster_words_42_tfidf.head(100)

Unnamed: 0,cluster,top_words_tfidf
0,0,"diapers, cloth, covers"
1,1,"storage, organization, furniture"
2,2,"blankets, swaddling, bedding"
3,3,"feeding, cups, solid"
4,4,"pillows, nursery, maternity"
5,5,"nursery, stickers, wall"
6,6,"care, hair, accessories"
7,7,"gates, extensions, stair"
8,8,"strollers, accessories, standard"
9,9,"care, health, thermometers"


### HDBSCAN: 235

Identify the most frequently used words for the HDBSDCAN Procedure which produced 235 clusters.

In [14]:
# Pre-process text
hdb_scan['tokens'] = hdb_scan['text'].apply(preprocess)

# Identify the most frequently used words from each cluster
top_words_dict = get_top_words(hdb_scan, 'cluster', 'tokens', top_n=3)

# Map top words back to each row
hdb_scan['top_words'] = hdb_scan['cluster'].map(top_words_dict).apply(lambda words: ', '.join(words))

In [15]:
# Print out the top 3 words for each cluster

# Keep only necessary columns
df_cluster_words_hdbscan= hdb_scan[['cluster', 'top_words']].copy()

# Convert top words to a tuple and only get unique instances of clusters and top words
df_cluster_words_hdbscan['top_words_tuple'] = df_cluster_words_hdbscan['top_words'].apply(lambda x: tuple(x))
df_cluster_words_hdbscan = df_cluster_words_hdbscan.drop_duplicates(subset=['cluster', 'top_words_tuple'])

# Drop the tuple column
df_cluster_words_hdbscan = df_cluster_words_hdbscan.drop(columns='top_words_tuple')

# Sort by cluster
df_cluster_words_hdbscan = df_cluster_words_hdbscan.sort_values(by='cluster').reset_index(drop=True)

# Make it possible to scroll through this dataframe
# Display the DataFrame in a scrollable box
display(HTML(df_cluster_words_hdbscan.to_html(
    max_rows=300, notebook=True, index=False, justify='center',
    classes='scrollable-table', escape=False, table_id='cluster_table'
)))

# Inject some CSS to enable scrollable output
display(HTML('''
<style>
.scrollable-table {
    display: block;
    max-height: 400px;
    overflow-y: auto;
    border: 1px solid #ccc;
    padding: 10px;
}
</style>
'''))

cluster,top_words
-1,"bedding, nursery, accessories"
0,"nursery, blankets, swaddling"
1,"bedding, sheets, crib"
2,"care, hair, accessories"
3,"nursery, furniture, storage"
4,"gifts, keepsakes, hand"
5,"bedding, toddler, nursery"
6,"accessories, stroller, strollers"
7,"nursery, furniture, playards"
8,"accessories, stroller, strollers"


Now use TF-IDF.

In [17]:
# Identify the most frequently used words from each cluster
top_words_dict_tfidf = get_tfidf_top_words(hdb_scan, 'cluster', 'tokens', top_n=3)

# SMerge tfidf top words back onto your original dataframe by cluster
hdb_scan = hdb_scan.merge(top_words_dict_tfidf, on = 'cluster', how = 'left')

# Svae top_words_tfidf as a readable string
hdb_scan['top_words_tfidf'] = hdb_scan['top_words_tfidf'].apply(
    lambda words: ', '.join(words) if isinstance(words, list) else ''
)

In [18]:
# Print out the top 3 words for each cluster

# Keep only necessary columns
df_cluster_words_hdbscan_tfidf = hdb_scan[['cluster', 'top_words_tfidf']].copy()

# Convert top words to a tuple and only get unique instances of clusters and top words
df_cluster_words_hdbscan_tfidf['top_words_tuple'] = df_cluster_words_hdbscan_tfidf['top_words_tfidf'].apply(lambda x: tuple(x))
df_cluster_words_hdbscan_tfidf = df_cluster_words_hdbscan_tfidf.drop_duplicates(subset=['cluster', 'top_words_tuple'])

# Drop the tuple column
df_cluster_words_hdbscan_tfidf = df_cluster_words_hdbscan_tfidf.drop(columns='top_words_tuple')

# Sort by cluster
df_cluster_words_hdbscan_tfidf = df_cluster_words_hdbscan_tfidf.sort_values(by='cluster').reset_index(drop=True)

display(HTML(df_cluster_words_hdbscan_tfidf.to_html(
    max_rows=300, notebook=True, index=False, justify='center',
    classes='scrollable-table', escape=False, table_id='cluster_table'
)))

# Inject some CSS to enable scrollable output
display(HTML('''
<style>
.scrollable-table {
    display: block;
    max-height: 400px;
    overflow-y: auto;
    border: 1px solid #ccc;
    padding: 10px;
}
</style>
'''))

cluster,top_words_tfidf
-1,"bedding, accessories, nursery"
0,"bedding, swaddling, nursery"
1,"sheets, bedding, nursery"
2,"hair, care, clips"
3,"storage, stools, step"
4,"makers, keepsakes, hand"
5,"bedding, sets, nursery"
6,"accessories, strollers, hooks"
7,"playards, furniture, nursery"
8,"accessories, strollers, connectors"


## Identify "Outcasts"

Identify descriptions in each cluster that do not have obne of the top 3 most used words in their description.

In [19]:
# Create a function which flags records that do not have one of the top 3 words in their description
def lacks_top_words(row):
    # Ensure tokens is a list
    if not isinstance(row['tokens'], list):
        return True

    # Convert top_words and tfidf to lists if they are comma-separated strings
    top_words = row['top_words']
    tfidf_words = row['top_words_tfidf']

    if isinstance(top_words, str):
        top_words = [w.strip() for w in top_words.split(',')]
    if isinstance(tfidf_words, str):
        tfidf_words = [w.strip() for w in tfidf_words.split(',')]

    # Check if any top word or tfidf word appears in tokens
    return not any(word in row['tokens'] for word in top_words) and \
           not any(word in row['tokens'] for word in tfidf_words)


### K-Means: 19

Check the number of records which do not have a description with one of the top words from either the top words or TF-IDF top words check.

In [20]:
# Identify records in the kmeans_19 dataframe with descriptions lacking the top words of their respective cluster
# Apply the function to flag rows missing top words
kmeans_19['missing_top_word'] = kmeans_19.apply(lacks_top_words, axis=1)

In [21]:
# Get a sense of how many observations have the top words and how many do not
grouped_counts_19 = (
    kmeans_19
    .groupby(['cluster', 'top_words', 'top_words_tfidf', 'missing_top_word'])
    .size()
    .reset_index(name='count')
)

# Order by cluster
grouped_counts_19 = grouped_counts_19.sort_values(by = ['cluster', 'missing_top_word']).reset_index(drop = True)

# View the result
grouped_counts_19.head(100)


Unnamed: 0,cluster,top_words,top_words_tfidf,missing_top_word,count
0,0,"cloth, diapers, diapering","diapers, cloth, covers",False,6457
1,0,"cloth, diapers, diapering","diapers, cloth, covers",True,5
2,1,"storage, furniture, nursery","storage, organization, furniture",False,9510
3,1,"storage, furniture, nursery","storage, organization, furniture",True,3
4,2,"blankets, swaddling, nursery","blankets, swaddling, bedding",False,18371
...,...,...,...,...,...
67,39,"changing, portable, pads","diapering, pads, portable",False,1243
68,39,"changing, portable, pads","diapering, pads, portable",True,1
69,40,"towels, washcloths, bathing","towels, washcloths, hooded",False,3032
70,40,"towels, washcloths, bathing","towels, washcloths, hooded",True,4


In [22]:
# Count the number of "True" records in this dataframe
kmeans_19.missing_top_word.value_counts()


missing_top_word
False    207246
True      10478
Name: count, dtype: int64

In [23]:
# Percentage of cases missing one of the top words
counts = kmeans_19['missing_top_word'].value_counts()
percent_true = (counts.get(True, 0) / counts.sum()) * 100
percent_false = (counts.get(False, 0) / counts.sum()) * 100
print(f"Records WITHOUT the Top Words in their Description - {percent_true:.2f}%")
print(f"Records WITH the Top Words in their Description - {percent_false:.2f}%")

Records WITHOUT the Top Words in their Description - 4.81%
Records WITH the Top Words in their Description - 95.19%


### K-Means: 42

Identify the number of records without any of the top words in their descriptions for the K-Means 42 cluster dataset.

In [24]:
# Identify records in the kmeans_42 dataframe with descriptions lacking the top words of their respective cluster
# Apply the function to flag rows missing top words
kmeans_42['missing_top_word'] = kmeans_42.apply(lacks_top_words, axis=1)

In [26]:
# Get a sense of how many observations have the top words and how many do not
grouped_counts_42 = (
    kmeans_42
    .groupby(['cluster', 'top_words', 'top_words_tfidf', 'missing_top_word'])
    .size()
    .reset_index(name='count')
)

# Order by cluster
grouped_counts_42 = grouped_counts_42.sort_values(by = ['cluster', 'missing_top_word']).reset_index(drop = True)

# View the result
display(HTML(grouped_counts_42.to_html(
    max_rows=300, notebook=True, index=False, justify='center',
    classes='scrollable-table', escape=False, table_id='cluster_table'
)))

# Inject some CSS to enable scrollable output
display(HTML('''
<style>
.scrollable-table {
    display: block;
    max-height: 400px;
    overflow-y: auto;
    border: 1px solid #ccc;
    padding: 10px;
}
</style>
'''))

cluster,top_words,top_words_tfidf,missing_top_word,count
0,"cloth, diapers, diapering","diapers, cloth, covers",False,6457
0,"cloth, diapers, diapering","diapers, cloth, covers",True,5
1,"storage, furniture, nursery","storage, organization, furniture",False,9510
1,"storage, furniture, nursery","storage, organization, furniture",True,3
2,"blankets, swaddling, nursery","blankets, swaddling, bedding",False,18371
2,"blankets, swaddling, nursery","blankets, swaddling, bedding",True,15
3,"feeding, solid, cups","feeding, cups, solid",False,9013
3,"feeding, solid, cups","feeding, cups, solid",True,1
4,"pillows, nursery, maternity","pillows, nursery, maternity",False,2687
4,"pillows, nursery, maternity","pillows, nursery, maternity",True,49


In [27]:
# Count the number of "True" records in this dataframe
kmeans_42.missing_top_word.value_counts()

missing_top_word
False    207246
True      10478
Name: count, dtype: int64

In [28]:
# Percentage of cases missing one of the top words
counts = kmeans_42['missing_top_word'].value_counts()
percent_true = (counts.get(True, 0) / counts.sum()) * 100
percent_false = (counts.get(False, 0) / counts.sum()) * 100
print(f"Records WITHOUT the Top Words in their Description - {percent_true:.2f}%")
print(f"Records WITH the Top Words in their Description - {percent_false:.2f}%")

Records WITHOUT the Top Words in their Description - 4.81%
Records WITH the Top Words in their Description - 95.19%


### HDBSCAN: 235

Finally, go ahead and perform the same analysis with the HDBSCAN Clusters.

In [29]:
# Identify records in the hdb_scan dataframe with descriptions lacking the top words of their respective cluster
# Apply the function to flag rows missing top words
hdb_scan['missing_top_word'] = hdb_scan.apply(lacks_top_words, axis=1)

In [30]:
# Get a sense of how many observations have the top words and how many do not
grouped_counts_hdbscan = (
    hdb_scan
    .groupby(['cluster', 'top_words', 'top_words_tfidf', 'missing_top_word'])
    .size()
    .reset_index(name='count')
)

# Order by cluster
grouped_counts_hdbscan = grouped_counts_hdbscan.sort_values(by = ['cluster', 'missing_top_word']).reset_index(drop = True)

# View the result
display(HTML(grouped_counts_hdbscan.to_html(
    max_rows=8000, notebook=True, index=False, justify='center',
    classes='scrollable-table', escape=False, table_id='cluster_table'
)))

# Inject some CSS to enable scrollable output
display(HTML('''
<style>
.scrollable-table {
    display: block;
    max-height: 400px;
    overflow-y: auto;
    border: 1px solid #ccc;
    padding: 10px;
}
</style>
'''))

cluster,top_words,top_words_tfidf,missing_top_word,count
-1,"bedding, nursery, accessories","bedding, accessories, nursery",False,3569
-1,"bedding, nursery, accessories","bedding, accessories, nursery",True,8433
0,"nursery, blankets, swaddling","bedding, swaddling, nursery",False,2555
1,"bedding, sheets, crib","sheets, bedding, nursery",False,3490
2,"care, hair, accessories","hair, care, clips",False,1186
3,"nursery, furniture, storage","storage, stools, step",False,997
4,"gifts, keepsakes, hand","makers, keepsakes, hand",False,405
5,"bedding, toddler, nursery","bedding, sets, nursery",False,836
6,"accessories, stroller, strollers","accessories, strollers, hooks",False,464
7,"nursery, furniture, playards","playards, furniture, nursery",False,1140


In [31]:
# Count the number of "True" records in this dataframe
hdb_scan.missing_top_word.value_counts()

missing_top_word
False    206573
True      11151
Name: count, dtype: int64

In [32]:
# Percentage of cases missing one of the top words
counts = hdb_scan['missing_top_word'].value_counts()
percent_true = (counts.get(True, 0) / counts.sum()) * 100
percent_false = (counts.get(False, 0) / counts.sum()) * 100
print(f"Records WITHOUT the Top Words in their Description - {percent_true:.2f}%")
print(f"Records WITH the Top Words in their Description - {percent_false:.2f}%")

Records WITHOUT the Top Words in their Description - 5.12%
Records WITH the Top Words in their Description - 94.88%


### Save Non-Matches
Create dataframes of just the records whose descriptions do not match the top words.

In [33]:
# Save dataframes
kmeans_19_nonmatching = kmeans_19[kmeans_19['missing_top_word'] == True]
kmeans_42_nonmatching = kmeans_42[kmeans_42['missing_top_word'] == True]
hdb_scan_nonmatching = hdb_scan[hdb_scan['missing_top_word'] == True]

## Output to Excel

Output the results produced above to Excel. Specifically, focus on the datasets with the number of top words and the breakdown of True and False by topword. Also get the datasets of records without the top words in their descriptions.

In [34]:
# K-Means 19 Datasets
with pd.ExcelWriter('/Volumes/Samsung1TB/programming/data-science/baby-product-dashboard/clustered-data/kmeans-19-breakdown.xlsx', engine='openpyxl') as writer:
    kmeans_19_nonmatching.to_excel(writer, sheet_name='cluster_19_nonmatching', index=False)
    grouped_counts_19.to_excel(writer, sheet_name='cluster_19_counts', index=False)
    df_cluster_words_19.to_excel(writer, sheet_name='cluster-by-freq')
    df_cluster_words_19_tfidf.to_excel(writer, sheet_name='cluster-tfidf')

# K-Means 42 Datasets
with pd.ExcelWriter('/Volumes/Samsung1TB/programming/data-science/baby-product-dashboard/clustered-data/kmeans-42-breakdown.xlsx', engine='openpyxl') as writer:
    kmeans_42_nonmatching.to_excel(writer, sheet_name='cluster_42_nonmatching', index=False)
    grouped_counts_42.to_excel(writer, sheet_name='cluster_42_counts', index=False)
    df_cluster_words_42.to_excel(writer, sheet_name='cluster-by-freq')
    df_cluster_words_42_tfidf.to_excel(writer, sheet_name='cluster-tfidf')

# HDBSCAN Datasets
with pd.ExcelWriter('/Volumes/Samsung1TB/programming/data-science/baby-product-dashboard/clustered-data/hdbscan-breakdown.xlsx', engine='openpyxl') as writer:
    hdb_scan_nonmatching.to_excel(writer, sheet_name='cluster_hdbscan_nonmatching', index=False)
    grouped_counts_hdbscan.to_excel(writer, sheet_name='cluster_hdbscan_counts', index=False)
    df_cluster_words_hdbscan.to_excel(writer, sheet_name='cluster-by-freq')
    df_cluster_words_hdbscan_tfidf.to_excel(writer, sheet_name='cluster-tfidf')

    