In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Seed NumPy's global RNG so each run regenerates the exact same dataset.
np.random.seed(42)

# Dataset size and the number of latent groups rows are drawn from.
num_rows = 2710
clusters = 4  # must match the number of entries in cluster_properties

# Sampling bounds for each group, stored as (low, high) pairs.
# As written, higher-priced groups get lower sales, deeper discounts and
# lower marketing scores; adjacent groups overlap on some features.
cluster_properties = [
    dict(price_range=(50, 100), sales_range=(800, 1000), discount_range=(5, 20), marketing_score=(7, 10)),
    dict(price_range=(100, 150), sales_range=(400, 700), discount_range=(10, 30), marketing_score=(5, 8)),
    dict(price_range=(150, 200), sales_range=(100, 400), discount_range=(20, 40), marketing_score=(3, 6)),
    dict(price_range=(200, 250), sales_range=(50, 200), discount_range=(30, 50), marketing_score=(1, 4)),
]


def _draw_row():
    """Sample one synthetic record from a randomly chosen group.

    Returns a list ``[price, sale quantity, discount, marketing score,
    group]``. The random draws happen in a fixed order (group, price,
    sales, discount, score) so the seeded sequence is reproducible.
    """
    group = np.random.choice(clusters)
    bounds = cluster_properties[group]
    price = np.random.uniform(*bounds['price_range'])
    quantity = np.random.uniform(*bounds['sales_range'])
    discount = np.random.uniform(*bounds['discount_range'])
    score_low, score_high = bounds['marketing_score']
    # randint excludes its upper bound, hence +1 to make the score inclusive.
    appeal = np.random.randint(score_low, score_high + 1)
    # The group index is kept as the ground-truth label for later evaluation.
    return [price, quantity, discount, appeal, group]


data_list = [_draw_row() for _ in range(num_rows)]

column_names = [
    'Price',
    'Sale Quantity',
    'Discount (%)',
    'Marketing Appeal Score',
    'True Cluster',
]
data = pd.DataFrame(data_list, columns=column_names)

# Persist the dataset (without the index column) and report where it went.
csv_filename = 'nike_clustering_data.csv'
data.to_csv(csv_filename, index=False)
print(f"Synthetic dataset saved to {csv_filename}")

Synthetic dataset saved to nike_clustering_data.csv
