In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.manifold import TSNE
import plotly.express as px
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import umap
from sklearn.cluster import DBSCAN

In [3]:
# Load the dataset
df = pd.read_csv('final_df.csv')

In [4]:
df.columns

Index(['series', 'identifier_swisscollections', 'subject_form',
       'library_call_number', 'resource_type',
       'date_of_publication_or_production_standardised', 'place_standardised',
       'physical_description', 'language', 'title', 'found_date',
       'Languages_Clean', 'Languages_Clean_other_mapped',
       'corrected_subject_form', 'res_Clean_other_mapped',
       'subject_Clean_other_mapped', 'corrected_resource_type',
       'resource_Clean_other_mapped', 'city_info', 'city', 'lat', 'lng',
       'country'],
      dtype='object')

In [None]:
# Process MultiLabelBinarizer for specified columns
def process_multilabel_columns(df, columns):
    label_encoded_X = []
    mlb_list = []
    for col in columns:
        mlb = MultiLabelBinarizer()
        label_encoded = mlb.fit_transform(df[col])  # Ensure correct list handling
        label_encoded_X.append(label_encoded)
        mlb_list.append(mlb)
    return np.hstack(label_encoded_X), mlb_list

# Multi-label encoding
multi_label_columns = ['subject_Clean_other_mapped', 'resource_Clean_other_mapped', 'Languages_Clean_other_mapped']
X_combined, mlb_list = process_multilabel_columns(df, multi_label_columns)

In [6]:
# UMAP dimensionality reduction
umap_reducer = umap.UMAP(n_components=2, random_state=42)
X_umap = umap_reducer.fit_transform(X_combined)  # Use a subset for visualization

  warn(


In [33]:
# Apply DBSCAN clustering
dbscan = DBSCAN( eps=0.5, min_samples=150)
# Add UMAP results to the DataFrame
umap_df = pd.DataFrame(X_umap, columns=['UMAP 1', 'UMAP 2'])
umap_df['index'] = df.index
umap_df['DBSCAN_Cluster'] = dbscan.fit_predict(umap_df[['UMAP 1', 'UMAP 2']])

# Merge and process DataFrame
umap_df = umap_df.merge(df, left_on='index', right_index=True)
umap_df['found_date'] = pd.to_datetime(umap_df['found_date'], format='%Y', errors='coerce')
umap_df = umap_df.dropna(subset=['found_date'])


In [None]:
# Ensure discrete colors with a larger color palette for DBSCAN clusters
import plotly.colors as colors


# Map DBSCAN cluster IDs to specific colors
umap_df['Cluster Color'] = umap_df['DBSCAN_Cluster'].map(lambda x: color_scale[x % len(color_scale)] if x != -1 else 'black')

# Plot using Plotly with 20 different colors
fig = px.scatter(
    umap_df,
    x='UMAP 1',
    y='UMAP 2',
    color=umap_df['DBSCAN_Cluster'].astype(str),  # Ensure clusters are treated as discrete categories
    color_discrete_sequence=color_scale[:20],  # Use only the first 20 colors
    title="UMAP dimensionality reduction",
    labels={'color': 'Cluster Label', 'Languages_Clean_other_mapped':'Lang'},
    hover_data={'UMAP 1': True, 'UMAP 2': True, 'DBSCAN_Cluster': True, 'Languages_Clean_other_mapped':True,'subject_Clean_other_mapped':True, 
               , 'resource_Clean_other_mapped':True}
)

fig.update_layout(
    xaxis_title="UMAP 1",
    yaxis_title="UMAP 2",
    legend_title="Cluster",
    hoverlabel=dict(bgcolor="white", font_size=12)
)

fig.show()


SyntaxError: invalid syntax (2168242991.py, line 20)

In [98]:
import dash
from dash import dcc, html, Input, Output
import plotly.express as px
# Create a larger discrete color scale
color_scale = colors.qualitative.Dark24+colors.qualitative.Light24+colors.qualitative.Bold

# Summarize cluster attributes (from previous steps)
cluster_summaries = (
    umap_df.groupby('DBSCAN_Cluster')
    .apply(summarize_cluster)
    .apply(pd.Series)
)

cluster_summaries['Cluster Name'] = (
    'Lang: ' + cluster_summaries['Languages'] +
    ', Subj: ' + cluster_summaries['Subject'] +
    ', Res: ' + cluster_summaries['Resource']
)

cluster_name_map = cluster_summaries['Cluster Name'].to_dict()
umap_df['Cluster Name'] = umap_df['DBSCAN_Cluster'].map(cluster_name_map)
umap_df.loc[umap_df['DBSCAN_Cluster'] == -1, 'Cluster Name'] = 'Unclustered'

# Create initial scatter plot
fig = px.scatter(
    umap_df,
    x='UMAP 1',
    y='UMAP 2',
    color='Cluster Name',
    color_discrete_sequence=color_scale,
    title="UMAP Dimensionality Reduction with Cluster Names",
    hover_data={
        'UMAP 1': True,
        'UMAP 2': True,
        'DBSCAN_Cluster': True,
        'Languages_Clean_other_mapped': True,
        'subject_Clean_other_mapped': True,
        'resource_Clean_other_mapped': True
    },
    labels={'color': 'Cluster Label'}
)

fig.update_layout(
    legend=dict(
        orientation="v",  # Vertical orientation for better scrolling
        yanchor="top",
        y=1.0,
        xanchor="left",
        x=1.02,  # Position the legend outside the plot
        font=dict(
            size=5  # Adjust font size
        ),
        traceorder="normal",  # Keep the order as is
        title=dict(text="Cluster Labels"),  # Optional: Add a legend title
        itemclick="toggle",  # Allow toggling traces by clicking legend items
        itemdoubleclick="toggleothers",  # Double-click behavior
    ),
    margin=dict(r=200),  # Add margin to accommodate the legend
)

# Add a fixed height to allow scrolling in the legend
fig.update_layout(
    legend=dict(
        itemsizing="constant",
        valign="top",
    ),
   
)



# Dash App
app = dash.Dash(__name__)

app.layout = html.Div([
    dcc.Graph(
        id='umap-plot',
        figure=fig
    ),
    html.Div(id='cluster-info', style={'padding': '20px', 'font-size': '18px'})
])

# Callback to display cluster information
@app.callback(
    Output('cluster-info', 'children'),
    Input('umap-plot', 'clickData')
)
def display_cluster_info(click_data):
    if click_data is None:
        return "Click on a cluster point to view detailed information."

    # Extract cluster name from clicked point
    cluster_name = click_data['points'][0]['customdata'][-1]  # Assuming 'Cluster Name' is the last hover data field
    cluster_id = click_data['points'][0]['customdata'][2]  # Assuming 'DBSCAN_Cluster' is the 3rd hover data field

    # Retrieve detailed information about the cluster
    cluster_details = cluster_summaries.loc[cluster_id]

    # Format cluster information for display
    details = f"""
    **Cluster Name:** {cluster_name}
    - Dominant Language: {cluster_details['Languages']}
    - Dominant Subject: {cluster_details['Subject']}
    - Dominant Resource: {cluster_details['Resource']}
    """
    return details

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)






In [117]:
umap_df[['UMAP 1', 'UMAP 2', 'index', 'DBSCAN_Cluster', 'series',
       'identifier_swisscollections', 'subject_form', 'library_call_number',
        'date_of_publication_or_production_standardised',
        'physical_description',  'title','Cluster Name',
       'found_date', 'Languages_Clean_other_mapped',
       'subject_Clean_other_mapped',
       'resource_Clean_other_mapped' , 'lat', 'lng', 'Cluster Color', 'city']].to_csv('final_umap_included.csv')

In [114]:
umap_df['title']=umap_df['title'].apply(lambda x: x.replace('\"', '').replace('\'', ''))

In [116]:
umap_df['title']

0              Brief an Breitkopf und Härtel
1               Brief an [Gottlieb Hufeland]
2                            Brief an Hofrat
3                         Brief an Unbekannt
4         Brief an [Christian Gottlob] Voigt
                         ...                
119288        6 Briefe an Gustav Teichmüller
119289        3 Briefe an Gustav Teichmüller
119290        2 Briefe an Gustav Teichmüller
119291                         Brief an Behm
119292        2 Briefe an Gustav Teichmüller
Name: title, Length: 119253, dtype: object

In [107]:
umap_df['DBSCAN_Cluster'].unique()

array([ 0,  1,  3,  4,  5, -1,  6,  7,  8,  9, 10, 12, 11, 13, 14, 15, 16,
       17,  2, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
       33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46],
      dtype=int64)