In [4]:
import pandas as pd
import networkx as nx
from sklearn.metrics import jaccard_score
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from grakel.kernels import GraphletSampling
from grakel.utils import graph_from_networkx
from sklearn.cluster import AgglomerativeClustering
import h5py


# i
# Load the selected datasets from the HDF5 store and stack them into one DataFrame.

# Lookup table: numeric region index -> region name.
region_map = {
    0: "Southeast Asia",
    1: "South Asia",
    2: "Oceania",
    3: "Eastern Asia",
    4: "West Asia",
    5: "West of USA",
    6: "US Center",
    7: "West Africa",
    8: "Central Africa",
    9: "North Africa",
    10: "Western Europe",
    11: "Northern Europe",
    12: "Central America",
    13: "Caribbean",
    14: "South America",
    15: "East Africa",
    16: "Southern Europe",
    17: "East of USA",
    18: "Canada",
    19: "Southern Africa",
    20: "Central Asia",
    21: "Eastern Europe",
    22: "South of USA",
}

# Names of the HDF5 datasets to read.
sheet_names = ['0', '1', '2', '3', '5', '6', '7', '9', '10', '12', '14', '16', '17', '22']

# One DataFrame per dataset, concatenated after the loop.
dataframes = []

# Open the file in a context manager so the handle is always closed, even if a
# dataset is missing. `[:]` and `.attrs[...]` copy the data into memory, so the
# resulting frames stay valid after the file is closed.
with h5py.File("market_data.h5", "r") as file:
    for sheet_name in sheet_names:
        h5_dataset = file[sheet_name]

        # Materialize the whole dataset as a DataFrame.
        dataset = pd.DataFrame(h5_dataset[:])

        # Column labels are stored in the dataset's "columns" attribute.
        column_names = h5_dataset.attrs["columns"]
        dataset.columns = column_names

        dataframes.append(dataset)

# Concatenate all DataFrames into a single DataFrame with a fresh 0..n-1 index.
df = pd.concat(dataframes, ignore_index=True)

# Rename "Region Index" to "Order Region", then translate the numeric region
# indices in that column into region names via region_map.
df = (
    df
    .rename(columns={"Region Index": "Order Region"})
    .assign(**{"Order Region": lambda frame: frame["Order Region"].map(region_map)})
)

df


Unnamed: 0,Order Id,Order Customer Id,Order Item Id,Order Item Product Price,Department Id,Category Id,shipping_month,Benefit per order,Product Card Id,Product Name,...,order_year,Order Item Discount,Department Name,Market,Order City,Days for shipment (scheduled),Customer Segment,Customer Full Name,Sales,Order Region
0,77202.0,20755.0,180517.0,327.750000,2.0,73.0,2.0,91.250000,1360.0,78.0,...,2018.0,13.110000,4.0,3.0,336.0,4.0,0.0,1876.0,327.750000,Southeast Asia
1,75929.0,19482.0,179244.0,327.750000,2.0,73.0,1.0,45.689999,1360.0,78.0,...,2018.0,59.000000,4.0,3.0,1962.0,2.0,0.0,4257.0,327.750000,Southeast Asia
2,75928.0,19481.0,179243.0,327.750000,2.0,73.0,1.0,21.760000,1360.0,78.0,...,2018.0,65.550003,4.0,3.0,1962.0,2.0,1.0,2170.0,327.750000,Southeast Asia
3,75905.0,19458.0,179220.0,327.750000,2.0,73.0,1.0,152.600006,1360.0,78.0,...,2018.0,9.830000,4.0,3.0,2981.0,1.0,1.0,1071.0,327.750000,Southeast Asia
4,75900.0,19453.0,179215.0,327.750000,2.0,73.0,1.0,143.160004,1360.0,78.0,...,2018.0,29.500000,4.0,3.0,2981.0,0.0,0.0,3138.0,327.750000,Southeast Asia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146277,36420.0,5728.0,90932.0,299.980011,7.0,43.0,6.0,84.379997,957.0,21.0,...,2016.0,9.000000,3.0,4.0,1729.0,2.0,2.0,12067.0,299.980011,South of USA
146278,35075.0,802.0,87597.0,39.990002,5.0,29.0,6.0,-117.779999,627.0,102.0,...,2016.0,6.400000,6.0,4.0,2083.0,4.0,0.0,9325.0,159.960007,South of USA
146279,35471.0,9173.0,88607.0,39.990002,5.0,29.0,6.0,2.910000,627.0,102.0,...,2016.0,14.400000,6.0,4.0,768.0,4.0,0.0,9558.0,159.960007,South of USA
146280,37874.0,11254.0,94538.0,59.990002,4.0,17.0,7.0,99.580002,365.0,71.0,...,2016.0,40.790001,0.0,4.0,178.0,4.0,0.0,12281.0,239.960007,South of USA


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import seaborn as sns

# Requires `df`, the combined DataFrame built by the data-loading cell above.

# Work on a copy so the encoding below leaves `df` untouched.
df_cluster = df.copy()

# Swap each "Order Region" value for an integer code; the fitted encoder is
# kept so codes can be translated back to region names later.
label_encoder = LabelEncoder()
region_codes = label_encoder.fit_transform(df_cluster['Order Region'])
df_cluster['Order Region'] = region_codes

# Distinct region codes present in the data.
unique_regions = df_cluster['Order Region'].unique()

# Visualize a Jaccard distance matrix for each Order Region separately.
#
# A pairwise distance matrix has n_rows ** 2 entries, so computing it on every
# row of a region (and annotating every cell) does not scale. Each region is
# therefore capped at MAX_ROWS_PER_REGION rows, drawn with a fixed seed so the
# figure is reproducible, and per-cell annotations are disabled.
MAX_ROWS_PER_REGION = 200  # upper bound on rows per heatmap
SAMPLE_SEED = 0            # seed for the per-region row sample
N_COLS = 6                 # heatmaps per row of the figure

# Size the grid from the number of regions instead of a fixed 4 x 6 layout.
n_regions = len(unique_regions)
n_rows = max(1, -(-n_regions // N_COLS))  # ceiling division
fig, axes = plt.subplots(n_rows, N_COLS, figsize=(15, 12), squeeze=False)
flat_axes = axes.ravel()

for ax, region in zip(flat_axes, sorted(unique_regions)):
    # Rows of the current region, without the region column itself.
    subset_df = df_cluster[df_cluster['Order Region'] == region].drop(columns=['Order Region'])
    if len(subset_df) > MAX_ROWS_PER_REGION:
        subset_df = subset_df.sample(n=MAX_ROWS_PER_REGION, random_state=SAMPLE_SEED)

    # NOTE(review): 'jaccard' is a boolean metric; scikit-learn is expected to
    # convert the features to booleans (non-zero -> True), so numeric
    # magnitudes are ignored. Confirm this is the intended notion of similarity.
    jaccard_distance_matrix = pairwise_distances(subset_df, metric='jaccard')

    # Title with the region name rather than the opaque encoder code.
    region_name = label_encoder.inverse_transform([region])[0]
    sns.heatmap(
        jaccard_distance_matrix,
        cmap="YlGnBu",
        xticklabels=False,
        yticklabels=False,
        ax=ax,
    )
    ax.set_title(f'Jaccard Distance - {region_name}')

# Hide grid cells that have no region assigned.
for ax in flat_axes[n_regions:]:
    ax.set_visible(False)

# Adjust layout and display the plot
plt.tight_layout()
plt.show()




: 