In [1]:
import ast

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.compose import ColumnTransformer
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler, OneHotEncoder

In [2]:
# Read final_df.csv (path is relative to the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='final_df.csv')

In [3]:
# Display the column labels of the loaded DataFrame
df.columns

Index(['series', 'identifier_swisscollections', 'subject_form',
       'library_call_number', 'resource_type',
       'date_of_publication_or_production_standardised', 'place_standardised',
       'physical_description', 'language', 'title', 'found_date',
       'Languages_Clean', 'Languages_Clean_other_mapped',
       'corrected_subject_form', 'res_Clean_other_mapped',
       'subject_Clean_other_mapped', 'corrected_resource_type',
       'resource_Clean_other_mapped', 'city_info', 'city', 'lat', 'lng',
       'country'],
      dtype='object')

In [20]:
# Multi-hot encode the specified columns with one MultiLabelBinarizer per column
def _as_label_list(value):
    """Coerce a single cell into a list of labels.

    MultiLabelBinarizer iterates over every sample, so a raw string would be
    split into characters and a NaN float would raise. This helper normalises
    the shapes a cell can take after a CSV round-trip:

    * list / tuple / set / ndarray -> list of its elements
    * string holding a Python list/tuple/set literal -> the parsed elements
    * any other non-empty string -> a single label (the stripped text)
    * empty string or missing value (None, NaN, pd.NA) -> no labels
    * any other scalar -> a single label
    """
    if isinstance(value, (list, tuple, set, frozenset)):
        return list(value)
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, str):
        text = value.strip()
        if not text:
            return []
        if text[0] in '[({':
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError):
                # Looks like a literal but is not valid; keep it as one label.
                return [text]
            if isinstance(parsed, (list, tuple, set)):
                return list(parsed)
        # Plain text (or a literal that is not a collection of labels).
        return [text]
    # Containers and strings are handled above, so this is a scalar check.
    if pd.isna(value):
        return []
    return [value]


def process_multilabel_columns(df, columns):
    """Multi-hot encode several multi-label columns and stack the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    columns : list of str
        Names of the multi-label columns. Cells may be real collections or
        their string representation (as produced by writing lists to CSV).

    Returns
    -------
    encoded : numpy.ndarray
        Indicator matrix with one row per row of ``df``; the per-column
        blocks are concatenated horizontally in the order of ``columns``.
        Has shape ``(len(df), 0)`` when ``columns`` is empty.
    mlb_list : list of MultiLabelBinarizer
        The fitted binarizer for each column, in the same order.
    """
    label_encoded_X = []
    mlb_list = []
    for col in columns:
        mlb = MultiLabelBinarizer()
        # Normalise every cell to a list first so strings are not split
        # into characters and missing values do not raise.
        label_encoded_X.append(mlb.fit_transform(df[col].map(_as_label_list)))
        mlb_list.append(mlb)
    if not label_encoded_X:
        # np.hstack refuses an empty sequence; return a zero-width block.
        return np.empty((len(df), 0), dtype=int), mlb_list
    return np.hstack(label_encoded_X), mlb_list

# Multi-label encoding: every listed column is binarised separately and the
# indicator blocks are concatenated column-wise.
multi_label_columns = ['subject_Clean_other_mapped', 'resource_Clean_other_mapped', 'Languages_Clean_other_mapped']
label_encoded_combined, mlb_list = process_multilabel_columns(df, multi_label_columns)

# Define numerical and categorical features (both empty for now, so the
# transformers below select no columns).
numerical_features = []
categorical_features = []  # Add categorical columns if needed

# Preprocessing pipeline.
# sparse_threshold=0 makes the ColumnTransformer always return a dense array:
# OneHotEncoder emits sparse output by default, and a sparse result could not
# be combined with the dense label matrix via np.hstack below.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features),
    ],
    sparse_threshold=0,
)

# Apply preprocessing to numerical and categorical features
X_preprocessed = preprocessor.fit_transform(df[numerical_features + categorical_features])

# Combine preprocessed data with label-encoded data
X_combined = np.hstack((X_preprocessed, label_encoded_combined))

In [24]:
import umap

# Reduce the feature matrix to two dimensions with UMAP. Only the first
# 10000 rows are embedded (a subset, for visualization); the seed is fixed.
umap_reducer = umap.UMAP(
    random_state=42,
    n_components=2,
)
X_umap = umap_reducer.fit_transform(X_combined[0:10000])



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [25]:
# Wrap the 2-D UMAP embedding in a DataFrame. The columns are named 't-SNE 1' /
# 't-SNE 2' because the plotting cells select them by those names.
tsne_df = pd.DataFrame(X_umap, columns=['t-SNE 1', 't-SNE 2'])
# The embedding was fitted on the leading rows of df; take the slice length
# from X_umap itself so this cell stays in sync if the subset size changes.
tsne_df['index'] = df.index[:len(X_umap)]

# Attach the original columns so they are available as interactive tooltips
tsne_df = tsne_df.merge(df, left_on='index', right_index=True)
# Parse found_date with the '%Y' format; values that cannot be parsed are
# coerced to NaT and those rows are dropped.
# NOTE(review): years outside the range pandas Timestamps can represent
# (roughly 1677-2262) are also coerced to NaT and dropped - confirm intended.
# NOTE(review): if found_date was read from the CSV as float (e.g. 1850.0),
# it may not match '%Y' at all - verify the column dtype.
tsne_df['found_date'] = pd.to_datetime(tsne_df['found_date'], format='%Y', errors='coerce')
tsne_df = tsne_df.dropna(subset=['found_date'])

In [26]:
# Interactive scatter of the 2-D embedding. The coordinates come from UMAP
# (X_umap), so the title and axis titles say UMAP even though the DataFrame
# columns are named 't-SNE 1' / 't-SNE 2'.
fig = px.scatter(
    tsne_df,
    x='t-SNE 1',
    y='t-SNE 2',
    hover_data={
        'subject_Clean_other_mapped': True,
        'Languages_Clean_other_mapped': True,
        'resource_Clean_other_mapped': True,
        'lat': True,
        'lng': True,
        'city_info': True,
        'index': False  # Hide index from tooltip
    },
    title="UMAP Visualization of Custom Classification Dataset"
)

fig.update_layout(
    xaxis_title="First UMAP Component",
    yaxis_title="Second UMAP Component",
    hoverlabel=dict(bgcolor="white", font_size=12)
)

# Show interactive plot
fig.show()

In [None]:

import seaborn as sns
from matplotlib import pyplot as plt

# Static scatter of the same embedding. Note that `fig` is rebound here and no
# longer refers to the plotly figure created earlier.
fig, ax = plt.subplots(nrows=1, ncols=1, dpi=200)
# Draw on the axes created above explicitly instead of relying on the
# implicit "current axes".
sns.scatterplot(data=tsne_df, x='t-SNE 1', y='t-SNE 2', s=1, ax=ax)
# The coordinates come from UMAP, so override the default labels that would
# otherwise be taken from the 't-SNE' column names.
ax.set(
    title="UMAP Visualization of Custom Classification Dataset",
    xlabel="First UMAP Component",
    ylabel="Second UMAP Component",
);