In [None]:
import ast

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.compose import ColumnTransformer
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler, OneHotEncoder

In [None]:
# Read the prepared dataset; the path is resolved relative to the working directory.
csv_path = 'final_df.csv'
df = pd.read_csv(csv_path)

In [None]:
# Multi-label binarisation helpers for the specified columns
def _parse_label_cell(value):
    """Normalise one cell of a multi-label column to a plain list of labels.

    Cells read back from a CSV file arrive as text (for example the string
    "['a', 'b']") or as NaN, not as Python lists, so they are converted here
    before being handed to the binarizer.

    - list / tuple / set / ndarray -> list of its items
    - string holding a Python literal list/tuple/set -> that collection
    - any other non-empty string -> a single label (whitespace stripped)
    - None / NaN / empty string -> no labels
    - any other scalar -> a single label
    """
    if isinstance(value, (list, tuple, set, frozenset)):
        return list(value)
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, str):
        text = value.strip()
        if not text:
            return []
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: treat the text itself as one label.
            return [text]
        if isinstance(parsed, (list, tuple, set, frozenset)):
            return list(parsed)
        return [text]
    if value is None or pd.isna(value):
        return []
    return [value]


def process_multilabel_columns(df, columns):
    """Binarise several multi-label columns and stack the results side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the multi-label columns.
    columns : list of str
        Names of the columns to encode. Each cell may be a list-like of labels,
        the string representation of one, a single label, or missing.

    Returns
    -------
    encoded : numpy.ndarray
        Indicator matrix with one row per row of ``df``; the per-column
        matrices are concatenated horizontally in the order of ``columns``.
    mlb_list : list of MultiLabelBinarizer
        The fitted binarizer for each column, in the same order.
    """
    label_encoded_X = []
    mlb_list = []
    for col in columns:
        mlb = MultiLabelBinarizer()
        # Without this parsing step a string cell would be iterated character
        # by character and a NaN cell would raise a TypeError.
        label_lists = [_parse_label_cell(cell) for cell in df[col]]
        label_encoded_X.append(mlb.fit_transform(label_lists))
        mlb_list.append(mlb)

    if not label_encoded_X:
        # np.hstack rejects an empty sequence; return a zero-width block instead.
        return np.empty((len(df), 0), dtype=int), mlb_list
    return np.hstack(label_encoded_X), mlb_list

# Multi-label encoding: one indicator block per column, stacked horizontally.
multi_label_columns = ['corrected_resource_type', 'corrected_subject_form', 'corrected_lang']
label_encoded_combined, mlb_list = process_multilabel_columns(df, multi_label_columns)

# Define numerical and categorical features
numerical_features = ['found_date', 'lat', 'lng']
categorical_features = ['library_city']  # Add categorical columns if needed

# Preprocessing pipeline: standardise the numeric columns and one-hot encode the
# categorical ones.
# sparse_threshold=0 forces a dense ndarray. With the default (0.3) the one-hot
# block can make the transformer return a SciPy sparse matrix, which np.hstack
# below cannot combine with the dense multi-label matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features),
    ],
    sparse_threshold=0,
)

# Apply preprocessing to numerical and categorical features
X_preprocessed = preprocessor.fit_transform(df[numerical_features + categorical_features])

# Combine preprocessed data with label-encoded data (column-wise, same row order)
X_combined = np.hstack((X_preprocessed, label_encoded_combined))
assert X_combined.shape[0] == len(df), "feature matrix must keep one row per record"


In [None]:
# Project the combined feature matrix onto two dimensions with t-SNE.
# Only the first 10000 rows are embedded, to keep the visualisation manageable.
X_subset = X_combined[:10000]
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_subset)


In [None]:

# Wrap the embedding in a DataFrame and remember which source row each point
# came from (the same first-10000 slice that was fed to t-SNE).
embedding_df = pd.DataFrame(X_tsne, columns=['t-SNE 1', 't-SNE 2'])
embedding_df['index'] = df.index[:10000]

# Attach the original record to every point so it can be shown in tooltips,
# parse the year into a timestamp, and drop points whose year cannot be parsed.
tsne_df = (
    embedding_df
    .merge(df, left_on='index', right_index=True)
    .assign(
        found_date=lambda frame: pd.to_datetime(
            frame['found_date'], format='%Y', errors='coerce'
        )
    )
    .dropna(subset=['found_date'])
)


In [None]:
# Interactive scatter of the 2-D embedding; hovering a point shows its metadata.
hover_columns = [
    'corrected_resource_type',
    'corrected_subject_form',
    'corrected_lang',
    'lat',
    'lng',
    'city_info',
]
hover_spec = {column: True for column in hover_columns}
hover_spec['index'] = False  # keep the helper index column out of the tooltip

fig = px.scatter(
    tsne_df,
    x='t-SNE 1',
    y='t-SNE 2',
    hover_data=hover_spec,
    title="t-SNE Visualization of Custom Classification Dataset",
)

# Axis titles, tooltip styling and figure size in a single layout update.
fig.update_layout(
    xaxis_title="First t-SNE Component",
    yaxis_title="Second t-SNE Component",
    hoverlabel=dict(bgcolor="white", font_size=12),
    width=1200,
    height=800,
)
fig.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Static, high-resolution overview of the embedding with very small markers.
fig, ax = plt.subplots(dpi=200)
sns.scatterplot(data=tsne_df, x='t-SNE 1', y='t-SNE 2', s=1, ax=ax)

## Clustering

In [None]:
# K-Means setup: the clustering input is just the two embedding coordinates,
# taken from the cleaned frame as a plain NumPy array.
from sklearn.cluster import KMeans

embedding_columns = ['t-SNE 1', 't-SNE 2']
C_tsne = tsne_df[embedding_columns].to_numpy()

In [None]:
# Finding the optimal k with the elbow method: fit K-Means for every candidate
# cluster count and record the within-cluster sum of squares (inertia).
k_values = range(1, 11)  # Test for k from 1 to 10

# The comprehension keeps its loop variable and the throw-away models local, so
# (re-)running this cell never overwrites the notebook-level `k` / `kmeans`
# that the final K-Means fit relies on.
wcss = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(C_tsne).inertia_
    for n_clusters in k_values
]

# Plot the elbow graph
plt.figure(figsize=(8, 5))
plt.plot(k_values, wcss, marker='o', linestyle='--', color='b')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.xticks(k_values)
plt.grid()
plt.show()

In [None]:
# Number of clusters passed to the final K-Means fit below.
# NOTE(review): presumably read off the elbow plot above — confirm the choice.
k=5

In [None]:
from sklearn.cluster import KMeans

# Fit the final K-Means model with the chosen k and keep one cluster id per
# embedded point (same row order as C_tsne).
kmeans = KMeans(n_clusters=k, random_state=42).fit(C_tsne)
cluster_labels = kmeans.labels_

In [None]:
# Static view of the embedding, coloured by K-Means cluster id.
x_coords, y_coords = C_tsne[:, 0], C_tsne[:, 1]

plt.figure(figsize=(12, 8))
plt.scatter(x_coords, y_coords, c=cluster_labels)
plt.title("Scatter Plot after applying Kmeans")
plt.show()

In [None]:
#Using Plotly

In [None]:
# Interactive version of the K-Means plot: colour by cluster id, metadata on hover.
hover_columns = [
    'corrected_resource_type',
    'corrected_subject_form',
    'corrected_lang',
    'lat',
    'lng',
    'city_info',
]
hover_spec = dict.fromkeys(hover_columns, True)
hover_spec['index'] = False  # keep the helper index column out of the tooltip

fig = px.scatter(
    tsne_df,
    x='t-SNE 1',
    y='t-SNE 2',
    color=cluster_labels,
    hover_data=hover_spec,
    title="Scatter Plot after applying Kmeans",
)

# Axis titles, tooltip styling and figure size in a single layout update.
fig.update_layout(
    xaxis_title="First t-SNE Component",
    yaxis_title="Second t-SNE Component",
    hoverlabel=dict(bgcolor="white", font_size=12),
    width=1200,
    height=800,
)
fig.show()

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# Density-based clustering of the embedding with DBSCAN.
# fit() returns the estimator itself, so the model and its labels come from one call chain.
dbscan = DBSCAN(eps=0.5, min_samples=5).fit(C_tsne)
cluster_labels_db = dbscan.labels_

In [None]:
# Static view of the embedding, coloured by DBSCAN label.
first_component = C_tsne[:, 0]
second_component = C_tsne[:, 1]

plt.figure(figsize=(12, 8))
plt.scatter(first_component, second_component, c=cluster_labels_db)
plt.title("Scatter Plot after applying DBSCAN")
plt.show()

In [None]:
# Interactive version of the DBSCAN plot: colour by label, metadata on hover.
tooltip_fields = (
    'corrected_resource_type',
    'corrected_subject_form',
    'corrected_lang',
    'lat',
    'lng',
    'city_info',
)
tooltip_spec = {field: True for field in tooltip_fields}
tooltip_spec['index'] = False  # keep the helper index column out of the tooltip

fig = px.scatter(
    tsne_df,
    x='t-SNE 1',
    y='t-SNE 2',
    color=cluster_labels_db,
    hover_data=tooltip_spec,
    title="Scatter Plot after applying DBSCAN",
)

# Axis titles, tooltip styling and figure size in a single layout update.
fig.update_layout(
    xaxis_title="First t-SNE Component",
    yaxis_title="Second t-SNE Component",
    hoverlabel=dict(bgcolor="white", font_size=12),
    width=1200,
    height=800,
)
fig.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage 
from scipy.cluster.hierarchy import fcluster

In [None]:
# Hierarchical clustering: single-linkage merge tree over the embedded points.
linked = linkage(C_tsne, 'single')

# Show only the last `p` merges. C_tsne can hold up to 10000 points (the t-SNE
# subset), and a dendrogram with one leaf per point is unreadable.
# With truncation, show_leaf_counts=True prints the size of each collapsed
# subtree in parentheses.
# NOTE(review): an untruncated plot of a deep single-linkage tree may also
# exceed Python's recursion limit — confirm against the SciPy version in use.
plt.figure(figsize=(16, 12))
dendrogram(linked,
           truncate_mode='lastp',
           p=50,  # number of merged clusters kept as leaves; adjust to taste
           orientation='top',
           distance_sort='descending',
           show_leaf_counts=True)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample index or (cluster size)')
plt.ylabel('Distance')
plt.show()

# Cut the full (untruncated) tree at a fixed merge distance to obtain flat
# cluster labels, one per row of C_tsne.
max_d = 5  # Example maximum distance to cut the dendrogram, adjust based on your data
clusters_labels_hierarchical = fcluster(linked, max_d, criterion='distance')



In [None]:
# Static view of the embedding, coloured by hierarchical cluster id.
x_values, y_values = C_tsne[:, 0], C_tsne[:, 1]

plt.figure(figsize=(12, 6))
scatter = plt.scatter(
    x_values,
    y_values,
    c=clusters_labels_hierarchical,
    cmap='viridis',
)
plt.title('Scatter Plot after applying Hierarchical Clustering')
plt.show()

In [None]:
# Interactive version of the hierarchical-clustering plot: colour by cluster id,
# metadata on hover.
shown_on_hover = [
    'corrected_resource_type',
    'corrected_subject_form',
    'corrected_lang',
    'lat',
    'lng',
    'city_info',
]
hover_config = {name: True for name in shown_on_hover}
hover_config['index'] = False  # keep the helper index column out of the tooltip

fig = px.scatter(
    tsne_df,
    x='t-SNE 1',
    y='t-SNE 2',
    color=clusters_labels_hierarchical,
    hover_data=hover_config,
    title="Scatter Plot after applying Hierarchical Clustering",
)

# Axis titles, tooltip styling and figure size in a single layout update.
fig.update_layout(
    xaxis_title="First t-SNE Component",
    yaxis_title="Second t-SNE Component",
    hoverlabel=dict(bgcolor="white", font_size=12),
    width=1200,
    height=800,
)
fig.show()