In [1]:
!pip install datasets
!pip install pandas
!pip install numpy
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting filelock (from datasets)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting numpy>=1.17 (from datasets)
  Downloading numpy-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.1.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadat

In [2]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import json

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Load the embeddings data
with open('../data/processed/meanpool__intfloat__multilingual-e5-large-instruct_identify_author/meanpool__intfloat__multilingual-e5-large-instruct_identify_author.json', 'r') as f:
    embeddings_data = [json.loads(line) for line in f]

# Convert embeddings to a DataFrame
embeddings_df = pd.DataFrame(embeddings_data)
embeddings_df['embedding'] = embeddings_df['embedding'].apply(np.array)

# Load metadata from Hugging Face dataset
metadata = load_dataset('chcaa/memo-canonical-novels')['train'].to_pandas()

# Merge datasets
merged_df = pd.merge(metadata, embeddings_df, left_on='FILENAME', right_on='filename')

# Create a new column 'classes' as a copy of 'CATEGORY'
merged_df['classes'] = merged_df['CATEGORY']

# Rename specific values in the 'classes' column
merged_df['classes'] = merged_df['classes'].replace({
    'CANON_HISTORICAL': 'CANON',
    'CE_CANON': 'CANON',
    'LEX_CANON': 'CANON'
})

README.md:   0%|          | 0.00/4.32k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/17.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/839 [00:00<?, ? examples/s]

In [4]:
merged_df.groupby('classes')['classes'].count()

classes
CANON         114
HISTORICAL     57
O             667
Name: classes, dtype: int64

In [8]:
merged_df['average_sentence_length'] = merged_df['TEXT'].apply(
    lambda x: sum(len(sentence) for sentence in x.split('.')) / len(x.split('.')) if x.strip() else 0)

## With historical novels

In [9]:
# Assume df is your DataFrame with features and target
# 'classes' is the target column
class_column = 'classes'

# Step 1: Find the minimum class size
min_class_size = merged_df[class_column].value_counts().min()

# Step 2: Down-sample each class
balanced_df = (
    merged_df.groupby(class_column)
    .apply(lambda x: x.sample(n=min_class_size, random_state=42))
    .reset_index(drop=True)
)

# Step 3: Shuffle the dataset
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the class distribution
print(balanced_df[class_column].value_counts())

classes
HISTORICAL    57
CANON         57
O             57
Name: count, dtype: int64


  .apply(lambda x: x.sample(n=min_class_size, random_state=42))


## The whole shebang, including historical novels

In [50]:
# Number of iterations
num_iterations = 20

# Define feature combinations
feature_combinations = {
    'embeddings': lambda df: np.stack(df['embedding'].values),
    'price': lambda df: df['PRICE'].values.reshape(-1, 1),
    'publisher': lambda df: publisher_encoder.fit_transform(df['PUBLISHER'].values.reshape(-1, 1)),
    'embeddings_price': lambda df: np.hstack([np.stack(df['embedding'].values), df['PRICE'].values.reshape(-1, 1)]),
    'embeddings_publisher': lambda df: np.hstack([np.stack(df['embedding'].values), 
                                                  publisher_encoder.fit_transform(df['PUBLISHER'].values.reshape(-1, 1))]),
    'publisher_price': lambda df: np.hstack([publisher_encoder.fit_transform(df['PUBLISHER'].values.reshape(-1, 1)),
                                             df['PRICE'].values.reshape(-1, 1)]),
    'embeddings_publisher_price': lambda df: np.hstack([np.stack(df['embedding'].values), 
                                                        publisher_encoder.fit_transform(df['PUBLISHER'].values.reshape(-1, 1)),
                                                        df['PRICE'].values.reshape(-1, 1)])
}

# Dictionary to store class-wise metrics for all feature combinations
results = {feature_set: {} for feature_set in feature_combinations}

# Column names for classes and features
class_column = 'classes'

# OneHotEncoder for the 'publisher' feature
publisher_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

for feature_set_name, feature_set_func in feature_combinations.items():
    print(f"Evaluating feature set: {feature_set_name}")
    
    # Initialize storage for class-wise metrics
    class_performance = {}
    
    for i in range(num_iterations):
        # Step 1: Find the minimum class size
        min_class_size = merged_df[class_column].value_counts().min()

        # Step 2: Down-sample each class
        balanced_df = (
            merged_df.groupby(class_column)
            .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
            .reset_index(drop=True)
        )

        # Step 3: Shuffle the dataset
        balanced_df = balanced_df.sample(frac=1, random_state=i).reset_index(drop=True)

        # Step 4: Create feature matrix and target array
        X = feature_set_func(balanced_df)
        y = balanced_df['classes'].values

        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

        # Train Random Forest Classifier
        clf = RandomForestClassifier(n_estimators=100, random_state=i)
        clf.fit(X_train, y_train)

        # Evaluate the model
        y_pred = clf.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)  # Get report as a dictionary

        # Store class-wise scores
        for class_name, metrics in report.items():
            if class_name in ['accuracy', 'macro avg', 'weighted avg']:
                continue  # Skip non-class entries

            if class_name not in class_performance:
                class_performance[class_name] = {'precision': [], 'recall': [], 'f1-score': []}

            class_performance[class_name]['precision'].append(metrics['precision'])
            class_performance[class_name]['recall'].append(metrics['recall'])
            class_performance[class_name]['f1-score'].append(metrics['f1-score'])
    
    # Calculate mean performance for each class and store results
    results[feature_set_name] = {
        class_name: {
            'mean_precision': np.mean(scores['precision']),
            'mean_recall': np.mean(scores['recall']),
            'mean_f1': np.mean(scores['f1-score'])
        }
        for class_name, scores in class_performance.items()
    }

# Display results
for feature_set_name, class_metrics in results.items():
    print(f"Feature Set: {feature_set_name}")
    for class_name, metrics in class_metrics.items():
        print(f"  Class {class_name}:")
        print(f"    Mean Precision: {metrics['mean_precision']:.3f}")
        print(f"    Mean Recall: {metrics['mean_recall']:.3f}")
        print(f"    Mean F1-Score: {metrics['mean_f1']:.3f}")
    print()

Evaluating feature set: embeddings


  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary rando

Evaluating feature set: price


  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary rando

Evaluating feature set: publisher


  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary rando

Evaluating feature set: embeddings_price


  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary rando

Evaluating feature set: embeddings_publisher


  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary rando

Evaluating feature set: publisher_price


  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary rando

Evaluating feature set: embeddings_publisher_price


  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary rando

Feature Set: embeddings
  Class CANON:
    Mean Precision: 0.623
    Mean Recall: 0.667
    Mean F1-Score: 0.631
  Class HISTORICAL:
    Mean Precision: 0.817
    Mean Recall: 0.767
    Mean F1-Score: 0.778
  Class O:
    Mean Precision: 0.513
    Mean Recall: 0.513
    Mean F1-Score: 0.499

Feature Set: price
  Class CANON:
    Mean Precision: 0.401
    Mean Recall: 0.403
    Mean F1-Score: 0.390
  Class HISTORICAL:
    Mean Precision: 0.375
    Mean Recall: 0.345
    Mean F1-Score: 0.345
  Class O:
    Mean Precision: 0.354
    Mean Recall: 0.387
    Mean F1-Score: 0.344

Feature Set: publisher
  Class CANON:
    Mean Precision: 0.542
    Mean Recall: 0.788
    Mean F1-Score: 0.633
  Class HISTORICAL:
    Mean Precision: 0.544
    Mean Recall: 0.353
    Mean F1-Score: 0.379
  Class O:
    Mean Precision: 0.463
    Mean Recall: 0.346
    Mean F1-Score: 0.347

Feature Set: embeddings_price
  Class CANON:
    Mean Precision: 0.633
    Mean Recall: 0.693
    Mean F1-Score: 0.644
  Class 

### Embeddings as feature

In [45]:
y = balanced_df['classes'].values
clf = RandomForestClassifier(n_estimators=100, random_state=42)

#### Mean over x iterations

In [13]:
# Number of iterations
num_iterations = 20

# Dictionary to store class-wise metrics
class_performance = {}

# Column names for classes and features
class_column = 'classes'

for i in range(num_iterations):
    # Step 1: Find the minimum class size
    min_class_size = merged_df[class_column].value_counts().min()

    # Step 2: Down-sample each class
    balanced_df = (
        merged_df_without.groupby(class_column)
        .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
        .reset_index(drop=True)
    )

    # Step 3: Shuffle the dataset
    balanced_df = balanced_df.sample(frac=1, random_state=i).reset_index(drop=True)

    # Extract features and target
    X = np.stack(balanced_df['average_sentence_length'].values.reshape(-1, 1))
    y = balanced_df['classes'].values

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

    # Train Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=i)
    clf.fit(X_train, y_train)

    # Evaluate the model
    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)  # Get report as a dictionary

    # Store class-wise scores
    for class_name, metrics in report.items():
        if class_name in ['accuracy', 'macro avg', 'weighted avg']:
            continue  # Skip non-class entries

        if class_name not in class_performance:
            class_performance[class_name] = {'precision': [], 'recall': [], 'f1-score': []}

        class_performance[class_name]['precision'].append(metrics['precision'])
        class_performance[class_name]['recall'].append(metrics['recall'])
        class_performance[class_name]['f1-score'].append(metrics['f1-score'])

# Calculate mean performance for each class
mean_class_performance = {
    class_name: {
        'mean_precision': np.mean(scores['precision']),
        'mean_recall': np.mean(scores['recall']),
        'mean_f1': np.mean(scores['f1-score'])
    }
    for class_name, scores in class_performance.items()
}

# Display results
for class_name, metrics in mean_class_performance.items():
    print(f"Class {class_name}:")
    print(f"  Mean Precision: {metrics['mean_precision']:.3f}")
    print(f"  Mean Recall: {metrics['mean_recall']:.3f}")
    print(f"  Mean F1-Score: {metrics['mean_f1']:.3f}")
    print()

  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary rando

Class CANON:
  Mean Precision: 0.510
  Mean Recall: 0.522
  Mean F1-Score: 0.502

Class O:
  Mean Precision: 0.544
  Mean Recall: 0.554
  Mean F1-Score: 0.533



  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state


#### Single run

In [27]:
# Extract features and target
X = np.stack(balanced_df['embedding'].values)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       CANON       0.80      0.92      0.86        13
  HISTORICAL       0.82      0.82      0.82        11
           O       0.78      0.64      0.70        11

    accuracy                           0.80        35
   macro avg       0.80      0.79      0.79        35
weighted avg       0.80      0.80      0.80        35



### Price as feature

In [28]:
# Extract features and target
balanced_df['PRICE'] = balanced_df['PRICE'].fillna(0)
X = balanced_df['PRICE'].values.reshape(-1, 1)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       CANON       0.42      0.38      0.40        13
  HISTORICAL       0.50      0.36      0.42        11
           O       0.40      0.55      0.46        11

    accuracy                           0.43        35
   macro avg       0.44      0.43      0.43        35
weighted avg       0.44      0.43      0.43        35



### Publishing house

In [29]:
# Process 'PUBLISHER' column: Convert strings to one-hot encoding
publisher_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
publisher_encoded = publisher_encoder.fit_transform(balanced_df['PUBLISHER'].values.reshape(-1, 1))

# Combine embeddings, PRICE, and PUBLISHER features into a single array
X = publisher_encoded

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       CANON       0.69      0.69      0.69        13
  HISTORICAL       0.42      0.45      0.43        11
           O       0.30      0.27      0.29        11

    accuracy                           0.49        35
   macro avg       0.47      0.47      0.47        35
weighted avg       0.48      0.49      0.48        35



### Combining embeddings and book price

In [30]:
# Extract embeddings and reshape PRICE
embeddings = np.stack(balanced_df['embedding'].values)  # Convert embeddings to a 2D array
price = balanced_df['PRICE'].values.reshape(-1, 1)  # Reshape PRICE into a 2D array

# Combine embeddings and PRICE into a single feature array
X = np.hstack((embeddings, price))  # Horizontally stack embeddings and PRICE

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

       CANON       0.77      0.77      0.77        13
  HISTORICAL       0.71      0.91      0.80        11
           O       0.75      0.55      0.63        11

    accuracy                           0.74        35
   macro avg       0.74      0.74      0.73        35
weighted avg       0.75      0.74      0.74        35



### Adding publishing house

In [31]:
# Process 'PUBLISHER' column: Convert strings to one-hot encoding
publisher_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
publisher_encoded = publisher_encoder.fit_transform(balanced_df['PUBLISHER'].values.reshape(-1, 1))

# Combine embeddings, PRICE, and PUBLISHER features into a single array
X = np.hstack((embeddings, price, publisher_encoded))  # Horizontally stack embeddings, PRICE, and encoded PUBLISHER

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       CANON       0.77      0.77      0.77        13
  HISTORICAL       0.83      0.91      0.87        11
           O       0.60      0.55      0.57        11

    accuracy                           0.74        35
   macro avg       0.73      0.74      0.74        35
weighted avg       0.74      0.74      0.74        35



### Only price and publishing house

In [32]:
# Combine embeddings, PRICE, and PUBLISHER features into a single array
X = np.hstack((price, publisher_encoded))  # Horizontally stack embeddings, PRICE, and encoded PUBLISHER

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       CANON       0.71      0.38      0.50        13
  HISTORICAL       0.50      0.64      0.56        11
           O       0.43      0.55      0.48        11

    accuracy                           0.51        35
   macro avg       0.55      0.52      0.51        35
weighted avg       0.56      0.51      0.51        35



### Only embeddings and publishing house

In [33]:
# Combine embeddings, PRICE, and PUBLISHER features into a single array
X = np.hstack((embeddings, publisher_encoded))  # Horizontally stack embeddings, PRICE, and encoded PUBLISHER

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       CANON       0.71      0.77      0.74        13
  HISTORICAL       0.82      0.82      0.82        11
           O       0.60      0.55      0.57        11

    accuracy                           0.71        35
   macro avg       0.71      0.71      0.71        35
weighted avg       0.71      0.71      0.71        35



## Without historical novels

In [12]:
merged_df_without = merged_df[merged_df['classes'] != 'HISTORICAL']
merged_df_without.shape

(781, 42)

In [52]:
# Step 1: Find the minimum class size
min_class_size = merged_df_without[class_column].value_counts().min()

# Step 2: Down-sample each class
balanced_df = (
    merged_df_without.groupby(class_column)
    .apply(lambda x: x.sample(n=min_class_size, random_state=42))
    .reset_index(drop=True)
)

# Step 3: Shuffle the dataset
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the class distribution
print(balanced_df[class_column].value_counts())

classes
O        114
CANON    114
Name: count, dtype: int64


  .apply(lambda x: x.sample(n=min_class_size, random_state=42))


## The whole shebang, without historical novels


In [54]:
# Number of iterations
num_iterations = 20

# Define feature combinations
feature_combinations = {
    'embeddings': lambda df: np.stack(df['embedding'].values),
    'price': lambda df: df['PRICE'].values.reshape(-1, 1),
    'publisher': lambda df: publisher_encoder.fit_transform(df['PUBLISHER'].values.reshape(-1, 1)),
    'embeddings_price': lambda df: np.hstack([np.stack(df['embedding'].values), df['PRICE'].values.reshape(-1, 1)]),
    'embeddings_publisher': lambda df: np.hstack([np.stack(df['embedding'].values), 
                                                  publisher_encoder.fit_transform(df['PUBLISHER'].values.reshape(-1, 1))]),
    'publisher_price': lambda df: np.hstack([publisher_encoder.fit_transform(df['PUBLISHER'].values.reshape(-1, 1)),
                                             df['PRICE'].values.reshape(-1, 1)]),
    'embeddings_publisher_price': lambda df: np.hstack([np.stack(df['embedding'].values), 
                                                        publisher_encoder.fit_transform(df['PUBLISHER'].values.reshape(-1, 1)),
                                                        df['PRICE'].values.reshape(-1, 1)])
}

# Dictionary to store class-wise metrics for all feature combinations
results = {feature_set: {} for feature_set in feature_combinations}

# Column names for classes and features
class_column = 'classes'

# OneHotEncoder for the 'publisher' feature
publisher_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

for feature_set_name, feature_set_func in feature_combinations.items():
    print(f"Evaluating feature set: {feature_set_name}")
    
    # Initialize storage for class-wise metrics
    class_performance = {}
    
    for i in range(num_iterations):
        # Step 1: Find the minimum class size
        min_class_size = merged_df[class_column].value_counts().min()

        # Step 2: Down-sample each class
        balanced_df = (
            merged_df_without.groupby(class_column)
            .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
            .reset_index(drop=True)
        )

        # Step 3: Shuffle the dataset
        balanced_df = balanced_df.sample(frac=1, random_state=i).reset_index(drop=True)

        # Step 4: Create feature matrix and target array
        X = feature_set_func(balanced_df)
        y = balanced_df['classes'].values

        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

        # Train Random Forest Classifier
        clf = RandomForestClassifier(n_estimators=100, random_state=i)
        clf.fit(X_train, y_train)

        # Evaluate the model
        y_pred = clf.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)  # Get report as a dictionary

        # Store class-wise scores
        for class_name, metrics in report.items():
            if class_name in ['accuracy', 'macro avg', 'weighted avg']:
                continue  # Skip non-class entries

            if class_name not in class_performance:
                class_performance[class_name] = {'precision': [], 'recall': [], 'f1-score': []}

            class_performance[class_name]['precision'].append(metrics['precision'])
            class_performance[class_name]['recall'].append(metrics['recall'])
            class_performance[class_name]['f1-score'].append(metrics['f1-score'])
    
    # Calculate mean performance for each class and store results
    results[feature_set_name] = {
        class_name: {
            'mean_precision': np.mean(scores['precision']),
            'mean_recall': np.mean(scores['recall']),
            'mean_f1': np.mean(scores['f1-score'])
        }
        for class_name, scores in class_performance.items()
    }

# Display results
for feature_set_name, class_metrics in results.items():
    print(f"Feature Set: {feature_set_name}")
    for class_name, metrics in class_metrics.items():
        print(f"  Class {class_name}:")
        print(f"    Mean Precision: {metrics['mean_precision']:.3f}")
        print(f"    Mean Recall: {metrics['mean_recall']:.3f}")
        print(f"    Mean F1-Score: {metrics['mean_f1']:.3f}")
    print()

Evaluating feature set: embeddings


  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary rando

Evaluating feature set: price


  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary rando

Evaluating feature set: publisher


  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary rando

Evaluating feature set: embeddings_price


  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary rando

Evaluating feature set: embeddings_publisher


  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary rando

Evaluating feature set: publisher_price


  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary rando

Evaluating feature set: embeddings_publisher_price


  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary random_state
  .apply(lambda x: x.sample(n=min_class_size, random_state=i))  # Vary rando

Feature Set: embeddings
  Class CANON:
    Mean Precision: 0.677
    Mean Recall: 0.687
    Mean F1-Score: 0.662
  Class O:
    Mean Precision: 0.687
    Mean Recall: 0.667
    Mean F1-Score: 0.659

Feature Set: price
  Class CANON:
    Mean Precision: 0.541
    Mean Recall: 0.521
    Mean F1-Score: 0.504
  Class O:
    Mean Precision: 0.523
    Mean Recall: 0.542
    Mean F1-Score: 0.510

Feature Set: publisher
  Class CANON:
    Mean Precision: 0.680
    Mean Recall: 0.848
    Mean F1-Score: 0.750
  Class O:
    Mean Precision: 0.788
    Mean Recall: 0.597
    Mean F1-Score: 0.671

Feature Set: embeddings_price
  Class CANON:
    Mean Precision: 0.683
    Mean Recall: 0.696
    Mean F1-Score: 0.672
  Class O:
    Mean Precision: 0.699
    Mean Recall: 0.677
    Mean F1-Score: 0.673

Feature Set: embeddings_publisher
  Class CANON:
    Mean Precision: 0.667
    Mean Recall: 0.681
    Mean F1-Score: 0.655
  Class O:
    Mean Precision: 0.689
    Mean Recall: 0.663
    Mean F1-Score: 0.

In [36]:
y = balanced_df['classes'].values

In [37]:
# Extract features and target
X = np.stack(balanced_df['embedding'].values)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       CANON       0.78      0.82      0.80        22
           O       0.83      0.79      0.81        24

    accuracy                           0.80        46
   macro avg       0.80      0.80      0.80        46
weighted avg       0.81      0.80      0.80        46



### Book price

In [38]:
# Extract features and target
balanced_df['PRICE'] = balanced_df['PRICE'].fillna(0)
X = balanced_df['PRICE'].values.reshape(-1, 1)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       CANON       0.60      0.27      0.38        22
           O       0.56      0.83      0.67        24

    accuracy                           0.57        46
   macro avg       0.58      0.55      0.52        46
weighted avg       0.58      0.57      0.53        46



### Publishing house

In [39]:
# Process 'PUBLISHER' column: Convert strings to one-hot encoding
publisher_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
publisher_encoded = publisher_encoder.fit_transform(balanced_df['PUBLISHER'].values.reshape(-1, 1))

# Combine embeddings, PRICE, and PUBLISHER features into a single array
X = publisher_encoded

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       CANON       0.69      0.91      0.78        22
           O       0.88      0.62      0.73        24

    accuracy                           0.76        46
   macro avg       0.79      0.77      0.76        46
weighted avg       0.79      0.76      0.76        46



### Combining embeddings and book price

In [40]:
# Extract embeddings and reshape PRICE
embeddings = np.stack(balanced_df['embedding'].values)  # Convert embeddings to a 2D array
price = balanced_df['PRICE'].values.reshape(-1, 1)  # Reshape PRICE into a 2D array

# Combine embeddings and PRICE into a single feature array
X = np.hstack((embeddings, price))  # Horizontally stack embeddings and PRICE

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       CANON       0.77      0.91      0.83        22
           O       0.90      0.75      0.82        24

    accuracy                           0.83        46
   macro avg       0.83      0.83      0.83        46
weighted avg       0.84      0.83      0.83        46



### Adding publishing house

In [41]:
# Process 'PUBLISHER' column: Convert strings to one-hot encoding
publisher_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
publisher_encoded = publisher_encoder.fit_transform(balanced_df['PUBLISHER'].values.reshape(-1, 1))

# Combine embeddings, PRICE, and PUBLISHER features into a single array
X = np.hstack((embeddings, price, publisher_encoded))  # Horizontally stack embeddings, PRICE, and encoded PUBLISHER

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       CANON       0.77      0.91      0.83        22
           O       0.90      0.75      0.82        24

    accuracy                           0.83        46
   macro avg       0.83      0.83      0.83        46
weighted avg       0.84      0.83      0.83        46



### Only price and publishing house

In [42]:
# Combine embeddings, PRICE, and PUBLISHER features into a single array
X = np.hstack((price, publisher_encoded))  # Horizontally stack embeddings, PRICE, and encoded PUBLISHER

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       CANON       0.75      0.82      0.78        22
           O       0.82      0.75      0.78        24

    accuracy                           0.78        46
   macro avg       0.78      0.78      0.78        46
weighted avg       0.79      0.78      0.78        46



### Only embeddings and publishing house

In [43]:
# Combine embeddings, PRICE, and PUBLISHER features into a single array
X = np.hstack((embeddings, publisher_encoded))  # Horizontally stack embeddings, PRICE, and encoded PUBLISHER

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       CANON       0.82      0.82      0.82        22
           O       0.83      0.83      0.83        24

    accuracy                           0.83        46
   macro avg       0.83      0.83      0.83        46
weighted avg       0.83      0.83      0.83        46

