In [27]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
import plotly.graph_objects as go
# Load your data
cluster_num = 150
mode = "female"
epoch = 60
data_dir = "drama_sci-fi"
user_embeddings = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_embedding_{epoch}.npy')
cluster_labels = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_proto_label_{epoch}.npy')
cluster_embedding = np.load(f'res/{data_dir}/{cluster_num}/{mode}/cluster_embedding_{epoch}.npy')

# Perform PCA to reduce to 3 dimensions
pca = PCA(n_components=2)
embeddings_reduced = pca.fit_transform(user_embeddings)
cluster_embedding_reduced = pca.transform(cluster_embedding)
# Prepare the data for plotting
df_plot = pd.DataFrame(embeddings_reduced, columns=['x', 'y'])
df_plot['cluster'] = cluster_labels
cluster_range = sorted(list(df_plot.value_counts("cluster")[:10].index.values))
df_plot_filtered = df_plot[(df_plot['cluster'].isin(cluster_range))].copy().sort_values('cluster')
df_plot_filtered['cluster'] = df_plot_filtered['cluster'].astype(str)

df_clusters = pd.DataFrame(cluster_embedding_reduced[cluster_range], columns=['x', 'y'])
df_clusters['cluster'] = range(len(cluster_embedding_reduced[cluster_range]))  
fig = go.Figure()
symbols_for_gender = {'male': 'circle', 'female': 'square'}
colors_for_legend = px.colors.qualitative.G10
for i in range(len(cluster_range)):
    fig.add_trace(
        go.Scatter(
            x=[None], y=[None],  # No data points, so it doesn't show in the plot
            mode='markers',
            marker=dict(
                size=10,
                color=colors_for_legend[i % len(colors_for_legend)],  # Use cluster color
            ),
            name=f'Cluster {i}',
            legendgroup='cluster'
        )
    )

# Add a trace for each cluster-gender combination without adding them to legend
for i,cluster in enumerate(df_plot_filtered['cluster'].unique()):
    df_subset = df_plot_filtered[(df_plot_filtered['cluster'] == cluster)]
    fig.add_trace(
        go.Scatter(
            x=df_subset['x'],
            y=df_subset['y'],
            mode='markers',
            marker=dict(
                size=12,
                color=colors_for_legend[i % len(colors_for_legend)],  # Loop through colors
                symbol="circle",
                opacity=0.7
            ),
            # name=f'{cluster}, {gender}',
            # legendgroup=gender,
            showlegend=False  # Set showlegend to False
        )
    )
fig.update_traces(marker=dict(size=10, opacity=0.8))
for i, row in df_clusters.iterrows():
    fig.add_trace(
        go.Scatter(
            x=[row['x']],
            y=[row['y']],
            mode='markers',
            marker=dict(
                size=12,
                color=colors_for_legend[i % len(colors_for_legend)],  # Color for cluster centers
                line=dict(
                    color='DarkSlateGrey',
                    width=2
                ),
                symbol='x'
            ),
            showlegend=False
        )
    )
fig.update_layout(
   
    height=700,  # Adjust the height as needed
    width=900,   # Adjust the width as needed
)
fig.show()



The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.



In [30]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
import plotly.graph_objects as go
# Load your data
seed = 0
cluster_num = 100
mode = "male"
epoch = 60
cluster_range =(5,10) 
data_dir = "action_comedy"
user_embeddings = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_embedding_{epoch}.npy')
cluster_labels = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_proto_label_{epoch}.npy')
cluster_embedding = np.load(f'res/{data_dir}/{cluster_num}/{mode}/cluster_embedding_{epoch}.npy')

# Perform PCA to reduce to 3 dimensions
pca = PCA(n_components=2)
embeddings_reduced = pca.fit_transform(user_embeddings)
cluster_embedding_reduced = pca.transform(cluster_embedding)[cluster_range[0]:cluster_range[1]]
# Prepare the data for plotting
df_plot = pd.DataFrame(embeddings_reduced, columns=['x', 'y'])
df_plot['cluster'] = cluster_labels
df_plot_filtered = df_plot[(df_plot['cluster'] < cluster_range[1])&(df_plot['cluster']>=cluster_range[0])].copy()
df_plot_filtered['cluster'] = df_plot_filtered['cluster'].astype(str)

df_clusters = pd.DataFrame(cluster_embedding_reduced, columns=['x', 'y'])
df_clusters['cluster'] = range(len(cluster_embedding_reduced))  
# Create and show the 3D scatter plot
fig = px.scatter(df_plot_filtered, x='x', y='y',
                    color='cluster', 
                    title='2D Plot of User Embeddings',
                    labels={'cluster': 'Cluster Label'},
                    width=1000,  # Specify the width here
                    height=800,
)
fig.update_traces(marker=dict(size=10))
for i, row in df_clusters.iterrows():
    fig.add_trace(
        go.Scatter(
            x=[row['x']],
            y=[row['y']],
            mode='markers',
            marker=dict(
                size=8,
                color='black',  # Color for cluster centers
                line=dict(
                    color='DarkSlateGrey',
                    width=2
                ),
            ),
            showlegend=False
        )
    )
fig.show()


In [16]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
import plotly.graph_objects as go

cluster_num = 200
mode = "joint"
epoch = 60
data_dir = "drama_sci-fi"
# cluster_range = (5, 10)
user_embeddings = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_embedding_{epoch}.npy')
cluster_labels = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_proto_label_{epoch}.npy')
cluster_embedding = np.load(f'res/{data_dir}/{cluster_num}/{mode}/cluster_embedding_{epoch}.npy')
gender_labels = np.load(f'res/{data_dir}/{cluster_num}/{mode}/gender_label_{epoch}.npy')

mask = gender_labels==1
male =np.unique(cluster_labels[mask], return_counts=True)
female =np.unique(cluster_labels[~mask], return_counts=True)
male_label,male_count = male[0], male[1]
female_label,female_count = female[0], female[1]
# cluster_range = sorted(list(set(male_label[np.where(male_count<10)]).intersection(set(female_label[np.where(female_count>10)]))))
set(male_label[np.where(male_count<20)]).intersection(set(female_label[np.where(female_count>20)]))
intersec_label = np.array(list(set(male_label).intersection(set(female_label))))
inter_female_count = female_count[[ True if l in intersec_label else False for l in female_label ]]
inter_male_count = male_count[[ True if l in intersec_label else False for l in male_label ]]
# cluster_range = intersec_label[[True if fc>mc else False for fc,mc in zip(inter_female_count,inter_male_count) ]]
target_label = intersec_label[[True if fc>mc else False for fc,mc in zip(inter_female_count,inter_male_count) ]]
diff = [fc/mc  for fc,mc in zip(inter_female_count,inter_male_count) if fc>mc]
res = [(l,dif) for l,dif in zip(target_label,diff)]
res.sort(key=lambda x:x[1],reverse=True)
cluster_range = sorted([l for l,d in res])
print(cluster_range)
pca = PCA(n_components=2)
embeddings_reduced = pca.fit_transform(user_embeddings)
cluster_embedding_reduced = pca.transform(cluster_embedding)[cluster_range]
df_plot = pd.DataFrame(embeddings_reduced, columns=['x', 'y'])
df_plot['cluster'] = cluster_labels
df_plot['gender'] = gender_labels
df_plot['gender'] = df_plot['gender'].apply(lambda x: "male" if x == 1 else "female")
# print("number of male data:",len(df_plot[df_plot['gender']=="male"]))
df_plot=df_plot[df_plot['gender']=="female"].reset_index(drop=True)
df_plot_filtered = df_plot[(df_plot['cluster'].isin(cluster_range))].sort_values('cluster')
target_index = df_plot_filtered.index   
print("number of female data:",len(df_plot_filtered))


mode = "female"
user_embeddings = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_embedding_{epoch}.npy')
cluster_labels = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_proto_label_{epoch}.npy')
cluster_embedding = np.load(f'res/{data_dir}/{cluster_num}/{mode}/cluster_embedding_{epoch}.npy')

# Perform PCA to reduce to 3 dimensions
pca = PCA(n_components=2)
embeddings_reduced = pca.fit_transform(user_embeddings)
cluster_embedding_reduced = pca.transform(cluster_embedding)
# Prepare the data for plotting
df_plot = pd.DataFrame(embeddings_reduced, columns=['x', 'y'])
df_plot['cluster'] = cluster_labels
df_plot = df_plot.iloc[target_index]

df_clusters = pd.DataFrame(cluster_embedding_reduced[df_plot['cluster'].unique()], columns=['x', 'y'])
df_clusters['cluster'] = range(len(cluster_embedding_reduced[df_plot['cluster'].unique()]))  

df_plot['cluster'] = df_plot['cluster'].astype(str)
# Create and show the 3D scatter plot
# fig = px.scatter(df_plot_filtered, x='x', y='y',
#                     color='cluster', 
#                     title='2D Plot of User Embeddings',
#                     labels={'cluster': 'Cluster Label'},
#                     width=1000,  # Specify the width here
#                     height=800,
# )
fig = go.Figure()
symbols_for_gender = {'male': 'circle', 'female': 'square'}
colors_for_legend = px.colors.qualitative.G10
for i in range(len(df_clusters)):
    fig.add_trace(
        go.Scatter(
            x=[None], y=[None],  # No data points, so it doesn't show in the plot
            mode='markers',
            marker=dict(
                size=10,
                color=colors_for_legend[i % len(colors_for_legend)],  # Use cluster color
            ),
            name=f'Cluster {i}',
            legendgroup='cluster'
        )
    )

# Add a trace for each cluster-gender combination without adding them to legend
for i, cluster in enumerate(df_plot['cluster'].unique()):
    df_subset = df_plot[(df_plot['cluster'] == cluster)]
    fig.add_trace(
        go.Scatter(
            x=df_subset['x'],
            y=df_subset['y'],
            mode='markers',
            marker=dict(
                size=12,
                color=colors_for_legend[i % len(colors_for_legend)],  # Loop through colors
                symbol="circle",
                opacity=0.7
            ),
            # name=f'{cluster}, {gender}',
            # legendgroup=gender,
            showlegend=False  # Set showlegend to False
        )
    )
fig.update_traces(marker=dict(size=10, opacity=0.8))
for i, row in df_clusters.iterrows():
    fig.add_trace(
        go.Scatter(
            x=[row['x']],
            y=[row['y']],
            mode='markers',
            marker=dict(
                size=12,
                color='black',  # Color for cluster centers
                line=dict(
                    color=colors_for_legend[i % len(colors_for_legend)],
                    width=2
                ),
                symbol='x'
            ),
            showlegend=False
        )
    )
fig.update_layout(
   
    height=700,  # Adjust the height as needed
    width=900,   # Adjust the width as needed
)
fig.show()


[2, 10, 45, 121, 251, 313, 324, 326]
number of female data: 190


## Plot separate and joint cluster 
*  the number of female embedding in the cluster is the top10 highest


In [12]:
# Perform PCA to reduce to 2 dimensions

import plotly.express as px
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.manifold import TSNE
#"action_comedy", "sci-fi_thriller", "drama_sci-fi", "comedy_drama"
for data_dir in ["action_comedy"]:
    for cluster_num in [50,100,150,200,250]:
        print("data_dir:",  data_dir)
        print("cluster_num:",  cluster_num)
        print("mode: separate")
        mode = "female"
        epoch = 40
        # data_dir = "drama_sci-fi"
        user_embeddings = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_embedding_{epoch}.npy')
        cluster_labels = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_proto_label_{epoch}.npy')
        cluster_embedding = np.load(f'res/{data_dir}/{cluster_num}/{mode}/cluster_embedding_{epoch}.npy')

        # Perform PCA to reduce to 3 dimensions

        tsne = TSNE(n_components=2)
        embeddings_reduced = tsne.fit_transform(user_embeddings)
        cluster_embedding_reduced = tsne.fit_transform(cluster_embedding)
        # Prepare the data for plotting
        df_plot = pd.DataFrame(embeddings_reduced, columns=['x', 'y'])
        df_plot['cluster'] = cluster_labels
        cluster_range = sorted(list(df_plot.value_counts("cluster")[:10].index.values))
        df_plot_filtered = df_plot[(df_plot['cluster'].isin(cluster_range))].copy().sort_values('cluster')
        df_plot_filtered['cluster'] = df_plot_filtered['cluster'].astype(str)

        df_clusters = pd.DataFrame(cluster_embedding_reduced[cluster_range], columns=['x', 'y'])
        df_clusters['cluster'] = range(len(cluster_embedding_reduced[cluster_range]))  
        fig = go.Figure()
        symbols_for_gender = {'male': 'circle', 'female': 'square'}
        colors_for_legend = px.colors.qualitative.G10
        
        # fig.add_trace(
        #     go.Scatter(
        #         x=[None],
        #         y=[None],
        #         mode='markers',
        #         marker=dict(
        #             size=12,
        #             color='black',  # Color for cluster centers
        #             line=dict(
        #                 color='black',
        #                 width=2
        #             ),
        #             symbol='x',
        #         ),
        #         showlegend=True,
        #         name= "Centroid"
        #     )
        # )
        for i in range(1,len(cluster_range)+1):
            fig.add_trace(
                go.Scatter(
                    x=[None], y=[None],  # No data points, so it doesn't show in the plot
                    mode='markers',
                    marker=dict(
                        size=12,
                        color=colors_for_legend[i % len(colors_for_legend)],  # Use cluster color
                    ),
                    name=f'Cluster {i}',
                    legendgroup='cluster'
                )
            )

        # Add a trace for each cluster-gender combination without adding them to legend
        for i,cluster in enumerate(df_plot_filtered['cluster'].unique()):
            df_subset = df_plot_filtered[(df_plot_filtered['cluster'] == cluster)]
            fig.add_trace(
                go.Scatter(
                    x=df_subset['x'],
                    y=df_subset['y'],
                    mode='markers',
                    marker=dict(
                        size=12,
                        color=colors_for_legend[i % len(colors_for_legend)],  # Loop through colors
                        symbol="circle",
                        opacity=0.8
                    ),
                    # name=f'{cluster}, {gender}',
                    # legendgroup=gender,
                    showlegend=False  # Set showlegend to False
                )
            )
        fig.update_traces(marker=dict(size=12, opacity=0.8))
        # for i, row in df_clusters.iterrows():
        #     fig.add_trace(
        #         go.Scatter(
        #             x=[row['x']],
        #             y=[row['y']],
        #             mode='markers',
        #             marker=dict(
        #                 size=12,
        #                 color=colors_for_legend[i % len(colors_for_legend)],  # Color for cluster centers
        #                 line=dict(
        #                     color='DarkSlateGrey',
        #                     width=2
        #                 ),
        #                 symbol='x'
        #             ),
        #             showlegend=False
        #         )
        #     )
        
        fig.update_layout(
             legend=dict(font=dict(size=15)),
            height=700,  # Adjust the height as needed
            width=900,   # Adjust the width as needed
        )
        fig.show()
        print("mode: joint")
        mode = "joint"
        user_embeddings = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_embedding_{epoch}.npy')
        cluster_labels = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_proto_label_{epoch}.npy')
        cluster_embedding = np.load(f'res/{data_dir}/{cluster_num}/{mode}/cluster_embedding_{epoch}.npy')
        gender_labels = np.load(f'res/{data_dir}/{cluster_num}/{mode}/gender_label_{epoch}.npy')

        mask = gender_labels==1
        male =np.unique(cluster_labels[mask], return_counts=True)
        female =np.unique(cluster_labels[~mask], return_counts=True)
        male_label,male_count = male[0], male[1]
        female_label,female_count = female[0], female[1]
        # cluster_range = sorted(list(set(male_label[np.where(male_count<10)]).intersection(set(female_label[np.where(female_count>10)]))))
        # set(male_label[np.where(male_count<20)]).intersection(set(female_label[np.where(female_count>20)]))
        intersec_label = np.array(list(set(male_label).intersection(set(female_label))))
        inter_female_count = female_count[[ True if l in intersec_label else False for l in female_label ]]
        inter_male_count = male_count[[ True if l in intersec_label else False for l in male_label ]]
        # cluster_range = intersec_label[[True if fc>mc else False for fc,mc in zip(inter_female_count,inter_male_count) ]]
        target_label = intersec_label[[True if fc>mc else False for fc,mc in zip(inter_female_count,inter_male_count) ]]
        diff = [fc  for fc,mc in zip(inter_female_count,inter_male_count) if fc>mc]
        res = [(l,dif) for l,dif in zip(target_label,diff)] 
        res.sort(key=lambda x:x[1],reverse=True)
        cluster_range = sorted([l for l,d in res[:]])
        tsne = TSNE(n_components=2)
        embeddings_reduced = tsne.fit_transform(user_embeddings)
        tsne = TSNE(n_components=2)
        cluster_embedding_reduced = tsne.fit_transform(cluster_embedding)

        # Prepare the data for plotting
        df_plot = pd.DataFrame(embeddings_reduced, columns=['x', 'y'])
        df_plot['cluster'] = cluster_labels
        df_plot['gender'] = gender_labels
        df_plot['gender'] = df_plot['gender'].apply(lambda x: "male" if x == 1 else "female")
        
        df_plot = df_plot[df_plot['gender']=='female']
        cluster_range = sorted(list(df_plot.value_counts("cluster")[:10].index.values))
        df_plot_filtered = df_plot[(df_plot['cluster'].isin(cluster_range))].sort_values('cluster')
        
        df_clusters = pd.DataFrame(cluster_embedding_reduced[cluster_range], columns=['x', 'y'])
        df_clusters['cluster'] = range(len(cluster_embedding_reduced[cluster_range]))  
        # Create the user embedding scatter plot
        fig = go.Figure()
        symbols_for_gender = {'male': 'circle', 'female': 'circle'}
        colors_for_legend = px.colors.qualitative.G10
        # Add dummy traces for gender legend
        # for gender, symbol in symbols_for_gender.items():
        #     fig.add_trace(
        #         go.Scatter(
        #             x=[None], y=[None],  # No data points, so it doesn't show in the plot
        #             mode='markers',
        #             marker=dict(
        #                 size=10,
        #                 symbol=symbol,
        #                 color='black',
        #             ),
        #             name=str(gender),
        #             legendgroup='gender',
        #         )
        #     )

        
        # fig.add_trace(
        #     go.Scatter(
        #         x=[None],
        #         y=[None],
        #         mode='markers',
        #         marker=dict(
        #             size=15,
        #             color='black',  # Color for cluster centers
        #             line=dict(
        #                 color='black',
        #                 width=2
        #             ),
        #             symbol='x',
        #         ),
        #         showlegend=True,
        #         name= "Centroid"
        #     )
        # )

    

        # Add a trace for each cluster-gender combination without adding them to legend
        for i,cluster in enumerate(df_plot_filtered['cluster'].unique()):
            for gender, symbol in symbols_for_gender.items():
                df_subset = df_plot_filtered[(df_plot_filtered['cluster'] == cluster) & (df_plot_filtered['gender'] == gender)]
                fig.add_trace(
                    go.Scatter(
                        x=df_subset['x'],
                        y=df_subset['y'],
                        mode='markers',
                        marker=dict(
                            size=12,
                            color=colors_for_legend[i % len(colors_for_legend)],  # Loop through colors
                            symbol=symbol,
                            opacity=0.8
                        ),
                        name=f'{cluster}, {gender}',
                        legendgroup=gender,
                        showlegend=False  # Set showlegend to False
                    )
                )
        # Add dummy traces for cluster legend
        for i in range(1,len(cluster_range)+1):
            fig.add_trace(
                go.Scatter(
                    x=[None], y=[None],  # No data points, so it doesn't show in the plot
                    mode='markers',
                    marker=dict(
                        size=12,
                        color=colors_for_legend[i % len(colors_for_legend)],  # Use cluster color
                    ),
                    name=f'Cluster {i}',
                    legendgroup='cluster'
                )
            )
        # for i, row in df_clusters.iterrows():
        #     fig.add_trace(
        #         go.Scatter(
        #             x=[row['x']],
        #             y=[row['y']],
        #             mode='markers',
        #             marker=dict(
        #                 size=12,
        #                 color='black',  # Color for cluster centers
        #                 line=dict(
        #                     color=colors_for_legend[i % len(colors_for_legend)],
        #                     width=2
        #                 ),
        #                 symbol='x',
        #             ),
        #             showlegend=False
        #         )
        #     )
        fig.update_layout(
            legend=dict(font=dict(size=15)),
            # legend=dict(
            #     # yanchor="top",
            #     # y=0.99,
            #     # xanchor="left",
            #     # x=0.80,
            #     title=dict(text=''),  # Empty title for cleaner look
            #     traceorder='grouped',
                
            # ),
            height=700,  # Adjust the height as needed
            width=900,   # Adjust the width as needed
        )

        # Show figure
        fig.show()

data_dir: action_comedy
cluster_num: 50
mode: separate



The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.



mode: joint



The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.



data_dir: action_comedy
cluster_num: 100
mode: separate



The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.



mode: joint



The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.



data_dir: action_comedy
cluster_num: 150
mode: separate



The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.



mode: joint



The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.



data_dir: action_comedy
cluster_num: 200
mode: separate



The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.



mode: joint



The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.



data_dir: action_comedy
cluster_num: 250
mode: separate



The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.



mode: joint



The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.



## plot 3d cluster embedding for joint method

In [16]:
# Perform PCA to reduce to 3 dimensions
import plotly.express as px
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import plotly.graph_objects as go
cluster_num = 100
mode = "joint"
epoch = 40
data_dir = "sci-fi_thriller"
user_embeddings = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_embedding_{epoch}.npy')
cluster_labels = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_proto_label_{epoch}.npy')
cluster_embedding = np.load(f'res/{data_dir}/{cluster_num}/{mode}/cluster_embedding_{epoch}.npy')
gender_labels = np.load(f'res/{data_dir}/{cluster_num}/{mode}/gender_label_{epoch}.npy')


# Load data
# ...

# Perform PCA to reduce to 3 dimensions for user embeddings
pca = PCA(n_components=3)
embeddings_reduced = pca.fit_transform(user_embeddings)

# Apply the same PCA transformation to the cluster centers
cluster_embedding_reduced = pca.transform(cluster_embedding)[5:10]

# Prepare the data for plotting
df_plot = pd.DataFrame(embeddings_reduced, columns=['x', 'y', 'z'])
df_plot['cluster'] = cluster_labels
df_plot['gender'] = gender_labels
df_plot['gender'] = df_plot['gender'].apply(lambda x: "male" if x == 1 else "female")

# Filter if necessary
df_plot_filtered = df_plot[(df_plot['cluster'] >= 5) & (df_plot['cluster'] < 10)]
df_plot_filtered['cluster'] = df_plot_filtered['cluster'].astype(str)
# Create a DataFrame for cluster centers for plotting
df_clusters = pd.DataFrame(cluster_embedding_reduced, columns=['x', 'y', 'z'])
df_clusters['cluster'] = range(len(cluster_embedding_reduced))  # Assuming there's one center per cluster

# Create the user embedding scatter plot
colors_for_legend = px.colors.qualitative.G10 

symbols_for_gender = {'male': 'circle', 'female': 'square'}  # Define symbols for genders


# ... (previous code stays the same)

# Initialize figure
fig = go.Figure()

# Add dummy traces for gender legend
for gender, symbol in symbols_for_gender.items():
    fig.add_trace(
        go.Scatter3d(
            x=[None], y=[None], z=[None],  # No data points, so it doesn't show in the plot
            mode='markers',
            marker=dict(
                size=10,
                symbol=symbol,
                color='black',
            ),
            name=str(gender),
            legendgroup='gender',
         
        )
    )
for i, row in df_clusters.iterrows():
    fig.add_trace(
        go.Scatter3d(
            x=[row['x']],
            y=[row['y']],
            z=[row['z']],
            mode='markers',
            marker=dict(
                size=8,
                color='black',  # Color for cluster centers
                line=dict(
                    color='DarkSlateGrey',
                    width=2
                ),
            ),
            showlegend=False
        )
    )
# Add dummy traces for cluster legend
for i in range(5, 10):
    fig.add_trace(
        go.Scatter3d(
            x=[None], y=[None], z=[None],  # No data points, so it doesn't show in the plot
            mode='markers',
            marker=dict(
                size=10,
                color=colors_for_legend[i % len(colors_for_legend)],  # Use cluster color
            ),
            name=f'Cluster {i}',
            legendgroup='cluster'
        )
    )

# Add a trace for each cluster-gender combination without adding them to legend
for cluster in df_plot_filtered['cluster'].unique():
    for gender, symbol in symbols_for_gender.items():
        df_subset = df_plot_filtered[(df_plot_filtered['cluster'] == cluster) & (df_plot_filtered['gender'] == gender)]
        fig.add_trace(
            go.Scatter3d(
                x=df_subset['x'],
                y=df_subset['y'],
                z=df_subset['z'],
                mode='markers',
                marker=dict(
                    size=6,
                    color=colors_for_legend[int(cluster) % len(colors_for_legend)],  # Loop through colors
                    symbol=symbol,
                ),
                name=f'{cluster}, {gender}',
                legendgroup=gender,
                showlegend=False  # Set showlegend to False
            )
        )
fig.update_layout(
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01,
        title=dict(text=''),  # Empty title for cleaner look
        traceorder='grouped'
    ),
    height=1000,  # Set a larger height for the plot
    width=1200    # Set a larger width for the plot
)

# Show figure
fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [2]:
from sklearn.metrics import silhouette_score, davies_bouldin_score
import glob 
import pandas as pd
import numpy as np
folder_list = glob.glob("res/*")
folder_list = [folder.split("/")[-1] for folder in folder_list]
df = pd.DataFrame()
for data_dir in folder_list:
    print(" ")
    print(f"Data : {data_dir}")
    cluster_num = 200
    epoch = 60
    mode = "joint"
    user_embeddings = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_embedding_{epoch}.npy')
    cluster_labels = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_proto_label_{epoch}.npy')
    gender_label = np.load(f'res/{data_dir}/{cluster_num}/{mode}/gender_label_{epoch}.npy')
    mask = gender_label==1
    male_user_embeddings = user_embeddings[mask]
    male_cluster_labels = cluster_labels[mask]
    female_user_embeddings = user_embeddings[~mask]
    female_cluster_labels = cluster_labels[~mask]

    mode = "male"
    male_sep_user_embeddings = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_embedding_{epoch}.npy')
    male_sep_cluster_labels = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_proto_label_{epoch}.npy')
    mode = "female"
    female_seq_user_embeddings = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_embedding_{epoch}.npy')
    female_seq_cluster_labels = np.load(f'res/{data_dir}/{cluster_num}/{mode}/user_proto_label_{epoch}.npy')

    male_silhouette_avg = silhouette_score(male_user_embeddings, male_cluster_labels)
    print(f'joint :  male Silhouette Score: {male_silhouette_avg}')
    female_silhouette_avg = silhouette_score(female_user_embeddings, female_cluster_labels)
    df = pd.concat([df, pd.DataFrame({"data_dir":data_dir,"method":"joint","male":male_silhouette_avg,"female":female_silhouette_avg},index=[0])])
    print(f'joint : female Silhouette Score: {female_silhouette_avg}')
   

    # print("-"*50)
    # davies_bouldin_idx = davies_bouldin_score(male_user_embeddings, male_cluster_labels)
    # print(f'joint :  male Davies-Bouldin Index: {davies_bouldin_idx}')
    # davies_bouldin_idx = davies_bouldin_score(male_sep_user_embeddings, male_sep_cluster_labels)
    # print(f'Separate : male Davies-Bouldin Index: {davies_bouldin_idx}')


    print("-"*50)
    male_silhouette_avg = silhouette_score(male_sep_user_embeddings, male_sep_cluster_labels)
    print(f'Separate : male Silhouette Score: {male_silhouette_avg}')
    female_silhouette_avg = silhouette_score(female_seq_user_embeddings, female_seq_cluster_labels)
    df = pd.concat([df, pd.DataFrame({"data_dir":data_dir,"method":"separate","male":male_silhouette_avg,"female":female_silhouette_avg},index=[0])])
    print(f'Separate : female Silhouette Score: {female_silhouette_avg}')
    # print("-"*50)
    # davies_bouldin_idx = davies_bouldin_score(female_user_embeddings, female_cluster_labels)
    # print(f'joint : female Davies-Bouldin Index: {davies_bouldin_idx}')
    # davies_bouldin_idx = davies_bouldin_score(female_seq_user_embeddings, female_seq_cluster_labels)
    # print(f'Separate : female Davies-Bouldin Index: {davies_bouldin_idx}')


 
Data : drama_sci-fi
joint :  male Silhouette Score: 0.04134579747915268
joint : female Silhouette Score: 0.02880140021443367
--------------------------------------------------
Separate : male Silhouette Score: 0.04892382025718689
Separate : female Silhouette Score: 0.05061156302690506
 
Data : sci-fi_thriller
joint :  male Silhouette Score: 0.03873197361826897
joint : female Silhouette Score: 0.033666688948869705
--------------------------------------------------
Separate : male Silhouette Score: 0.04612148553133011
Separate : female Silhouette Score: 0.04418221861124039
 
Data : comedy_drama
joint :  male Silhouette Score: 0.04651302471756935
joint : female Silhouette Score: 0.043561097234487534
--------------------------------------------------
Separate : male Silhouette Score: 0.05434872955083847
Separate : female Silhouette Score: 0.05843405798077583
 
Data : action_comedy
joint :  male Silhouette Score: 0.045495279133319855
joint : female Silhouette Score: 0.038512714207172394
-