In [None]:
import os
import gzip
import json
import isbnlib
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

In [None]:
# Base directory of the extracted data files. The path is relative, so it is
# resolved against the notebook's current working directory.
data_base_path = "../data/"

In [None]:
# Registry of the pre-extracted input files, keyed by record type. Each entry
# holds the file's path under `data_base_path` and its line count (used as the
# progress-bar total when the file is read).
extracted_files = {
    record_type: {
        "file_path": os.path.join(data_base_path, file_name),
        "total_lines": line_count,
    }
    for record_type, file_name, line_count in (
        ("editions", "editions_after_2015.txt", 4259552),
        ("works", "filtered_works_with_editions.txt", 909883),
        ("authors", "filtered_authors_with_works_and_editions.txt", 853086),
    )
}

**TODO:** add the shell scripts that were used to produce the extracted files listed above.

## Data Cleaning

In [None]:
def read_txt(file_path, total_lines, valid_cols):
    """
    Reads a tab-separated text file and extracts JSON data from the fifth column,
    keeping only the keys listed in `valid_cols`.

    A line is skipped when it has fewer than five tab-separated fields, when its
    fifth field is not valid JSON, or when the decoded JSON is not a non-empty
    object (e.g. a list, a number, ``null`` or ``{}``).

    Parameters
    ----------
    file_path : str
        Path to the input text file.
    total_lines : int
        Estimated total number of lines in the file (used for tqdm progress bar).
    valid_cols : list of str
        List of valid keys to extract from the JSON data.

    Returns
    -------
    dict
        A dictionary where each key is the zero-based index of a line in the file
        and the value is a dictionary containing the JSON fields of that line
        that match `valid_cols`. Keys of `valid_cols` that are absent from a
        record are omitted from its dictionary rather than filled in.
    """

    df_dict = {}

    with open(file_path, "r", encoding="utf-8") as infile:
        for idx, line in enumerate(tqdm(infile, total=total_lines, desc="Processing lines")):
            parts = line.strip().split("\t")

            if len(parts) < 5:
                continue

            try:
                json_data = json.loads(parts[4])
            except json.JSONDecodeError:
                continue

            # Valid JSON is not necessarily an object: lists, strings, numbers and
            # null have no keys to extract, and an empty object contributes nothing.
            if not isinstance(json_data, dict) or not json_data:
                continue

            # Build the record once, in `valid_cols` order.
            df_dict[idx] = {key: json_data[key] for key in valid_cols if key in json_data}

    return df_dict

In [None]:
# Load the edition records, keeping only the JSON fields named in `ed_cols`.
record_type = "editions"

ed_cols = [
    "key",
    "works",
    "title",
    "publishers",
    "description",
    "first_sentence",
    "subjects",
    "languages",
    "publish_date",
    "publish_country",
    "number_of_pages",
    "latest_revision",
    "revision",
]

file_path = extracted_files[record_type]["file_path"]
total_lines = extracted_files[record_type]["total_lines"]
editions_dict = read_txt(file_path, total_lines, ed_cols)

In [None]:
# One row per entry of `editions_dict` (row label = its key, i.e. the line
# index assigned by `read_txt`), one column per extracted JSON field.
editions_df = pd.DataFrame.from_dict(editions_dict, orient="index")
editions_df.head()

In [None]:
# The records now live in `editions_df`; drop the name so the raw dict can be
# garbage-collected.
del editions_dict

In [None]:
# (rows, columns) of the loaded editions frame.
editions_df.shape

In [None]:
# Percentage of missing values in each column.
editions_df.isna().sum() / len(editions_df) * 100

Keep only the columns in which fewer than 70% of the values are missing.

In [None]:
# Columns retained for the rest of the analysis.
final_cols = ["key", "works", "title", "publishers", "publish_date", "languages", "subjects"]

# Take an explicit copy so `drp_df` is an independent frame that later cells can
# modify without tripping pandas' SettingWithCopyWarning.
drp_df = editions_df[final_cols].copy()
del editions_df

In [None]:
# Flatten the nested JSON fields into scalar columns and drop unusable rows.
# Everything is derived with `assign`, so `drp_df` itself is never mutated and a
# half-run cell cannot leave it in a partially converted state.
final_editions_df = (
    drp_df
    .assign(
        # First listed publisher; a missing or empty list becomes "Self-published".
        # The result is assigned back instead of using a chained
        # `fillna(inplace=True)`, which does not update the frame under pandas
        # Copy-on-Write.
        publishers=drp_df["publishers"].str[0].fillna("Self-published"),
        # First four-digit run in the date string. Missing or unparsable dates
        # become NaN (and are dropped below) instead of raising.
        publish_year=pd.to_numeric(
            drp_df["publish_date"].str.extract(r"(\d{4})", expand=False),
            errors="coerce",
        ),
        # List of subjects -> single comma-separated string.
        subjects=drp_df["subjects"].str.join(", "),
        # Key of the first linked work, or None when there is none.
        works=drp_df["works"].apply(lambda x: x[0]["key"] if isinstance(x, list) and x else None),
        # Last path segment of the first language key (e.g. ".../jpn" -> "jpn").
        languages=drp_df["languages"].apply(
            lambda x: x[0]["key"].split("/")[-1] if isinstance(x, list) and x and "key" in x[0] else np.nan
        ),
    )
    .dropna(subset=["title", "languages", "subjects", "publish_year"])
    # Rows without a year are gone, so the column can be stored as integers again.
    .astype({"publish_year": "int64"})
)

del drp_df

In [None]:
# Remaining missing values per column after cleaning.
final_editions_df.isna().sum()

In [None]:
# Display the cleaned editions frame.
final_editions_df

In [None]:
# "text" is "<title> <subjects>". Concatenating the two columns directly gives
# the same strings as the row-wise `agg(" ".join, axis=1)` without calling
# Python once per row.
final_editions_df["text"] = (
    final_editions_df["title"].astype(str) + " " + final_editions_df["subjects"].astype(str)
)

In [None]:
# Keep only editions whose language code is jpn, kor or chi. The explicit copy
# makes this an independent frame, so the cluster-label columns added later are
# written to it without a SettingWithCopyWarning.
east_asian_editions_df = final_editions_df[
    final_editions_df["languages"].isin(["jpn", "kor", "chi"])
].copy()
east_asian_editions_df

## Clustering

In [None]:
from sentence_transformers import SentenceTransformer

# Encode the "text" value of every East Asian edition with a multilingual
# sentence-transformer model, 32 texts per batch. The rows of `embeddings`
# follow the row order of `east_asian_editions_df`.
model = SentenceTransformer("sentence-transformers/stsb-xlm-r-multilingual")
embeddings = model.encode(east_asian_editions_df["text"].tolist(), batch_size=32, show_progress_bar=True)

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

# Exploratory fit with the largest rank requested here, used only to read off
# the cumulative explained variance. `random_state` is fixed so that reruns
# give the same decomposition.
svd = TruncatedSVD(n_components=min(embeddings.shape) - 1, random_state=42)
X_svd = svd.fit_transform(embeddings)

# Determine components for 90% variance
explained_variance = np.cumsum(svd.explained_variance_ratio_)
# searchsorted returns len(explained_variance) when the 90% mark is never
# reached; cap the result so it is always a component count this fit produced.
n_components = min(int(np.searchsorted(explained_variance, 0.90)) + 1, len(explained_variance))
print(f"Number of components to retain 90% variance: {n_components}")

In [None]:
# Refit keeping only the selected number of components. `random_state` is fixed
# so the projection, and every clustering built on it, is reproducible.
svd = TruncatedSVD(n_components=n_components, random_state=42)
X_svd = svd.fit_transform(embeddings)

In [None]:
# Feature matrix for clustering: the SVD components plus the publication year.
X_final = pd.DataFrame(X_svd, columns=[f"SV {i+1}" for i in range(X_svd.shape[1])])
# `X_final` has a fresh 0..n-1 index, while `east_asian_editions_df` keeps the
# row labels of the frame it was filtered from. Assigning the Series directly
# would align on those labels and fill the column with NaN / mismatched years,
# so assign the values positionally (both follow the same row order).
X_final["publish_year"] = east_asian_editions_df["publish_year"].to_numpy()
# Standardize every feature so no single column dominates the distances.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_final)

In [None]:
# Quick look at the first two standardized feature columns before clustering.
# The figure is labelled so it can be read on its own, and plt.show() keeps the
# cell from ending on a bare artist repr.
plt.figure(figsize=(10, 6))
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], s=0.5)
plt.xlabel("Feature 1 (standardized)")
plt.ylabel("Feature 2 (standardized)")
plt.title("Editions on the first two standardized features")
plt.show()

### DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

# Cluster the standardized features with DBSCAN and store the label returned
# for each row in a new "dbscan_cluster" column.
# NOTE(review): eps=0.5 / min_samples=5 are hard-coded; confirm they were tuned
# for this feature space.
dbscan = DBSCAN(eps=0.5, min_samples=5)
east_asian_editions_df["dbscan_cluster"] = dbscan.fit_predict(X_scaled)

# Scatter of the first two feature columns, coloured by DBSCAN label.
plt.figure(figsize=(10, 6))
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=east_asian_editions_df["dbscan_cluster"], cmap="Spectral", alpha=0.5)
plt.colorbar()
plt.title("Book Clusters Visualization")
plt.show()

### HDBSCAN

In [None]:
import hdbscan

# Cluster the same standardized features with HDBSCAN (Euclidean metric, "eom"
# cluster selection, clusters of at least 50 points) and store the label of each
# row in a new "hdbscan_cluster" column.
clusterer = hdbscan.HDBSCAN(min_cluster_size=50, metric="euclidean", cluster_selection_method="eom")
east_asian_editions_df["hdbscan_cluster"] = clusterer.fit_predict(X_scaled)

### Agglomerative

In [None]:
import scipy.cluster.hierarchy as sch

# Build the hierarchical-clustering linkage over all rows of `X_scaled` using
# Ward's method.
linkage_matrix = sch.linkage(X_scaled, method="ward")

# Plot dendrogram (truncated: truncate_mode="level", p=5)
plt.figure(figsize=(10, 6))
sch.dendrogram(linkage_matrix, truncate_mode="level", p=5)
plt.title("Ward - Hierarchical Clustering Dendrogram")
plt.xlabel("Book Samples")
plt.ylabel("Distance")
plt.show()

In [None]:
# Cut the Ward linkage into flat clusters using a distance criterion with
# threshold t=120 and store the label of each row.
# NOTE(review): 120 was presumably read off the dendrogram above — confirm.
east_asian_editions_df["hierarchical_cluster"] = sch.fcluster(linkage_matrix, t=120, criterion="distance")

# Scatter of the first two feature columns, coloured by hierarchical cluster.
plt.figure(figsize=(10, 6))
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=east_asian_editions_df["hierarchical_cluster"])
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title("Ward Linkage Clustering")
plt.show()