### Install and import the necessary libraries

In [2]:
!pip install pandas
!pip install transformers
!pip install torch
!pip install numpy

Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/52/11/9eac327a38834f162b8250aab32a6781339c69afe7574368fffe46387edf/pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Using cached pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting numpy>=1.23.2 (from pandas)
  Obtaining dependency information for numpy>=1.23.2 from https://files.pythonhosted.org/packages/8d/29/076999b69bd9264b8df5e56f2be18da2de6b2a2d0e10737e5307592e01de/numpy-2.2.3-cp311-cp311-macosx_14_0_arm64.whl.metadata
  Downloading numpy-2.2.3-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m482.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting pytz>=2020.1 (from pandas)
  Obtaining dependency information for pytz>=2020.1 from https://files.pythonhosted.org/packages/eb/38/ac33370d784287baa1c3d538978b5e2ea064d4c1b93ffbd12826c190dd10/pytz

In [9]:
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import numpy as np
import os

In [6]:
# Every listed column is read as a string; pandas does no dtype inference for them.
dtype_spec = {
    column: str
    for column in (
        'ISBN',
        'Book-Title',
        'Book-Author',
        'Year-Of-Publication',
        'Publisher',
        'Image-URL-S',
        'Image-URL-M',
        'Image-URL-L',
    )
}

# The source file is semicolon-delimited and Latin-1 encoded; malformed lines are dropped.
books_df = pd.read_csv(
    "books.csv",
    encoding='latin1',
    delimiter=';',
    on_bad_lines='skip',
    dtype=dtype_spec,
)


In [3]:
# Load the pre-trained DistilBERT model and its matching tokenizer from the same checkpoint
checkpoint = 'distilbert-base-uncased'
model = DistilBertModel.from_pretrained(checkpoint)
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)

In [4]:
# Function to get BERT embeddings for a single text
def get_bert_embedding(text):
    """Return the DistilBERT [CLS] embedding of ``text`` as a NumPy array.

    Relies on the notebook-level ``tokenizer`` and ``model``. Input is truncated
    to at most 512 tokens.

    Args:
        text: The string to embed. ``None`` and float NaN (how pandas represents
            a missing cell) are embedded as the empty string, because the
            tokenizer does not accept non-string input.

    Returns:
        numpy.ndarray: last-layer hidden state of the first token ([CLS]) of the
        first sequence in the batch.
    """
    # Missing values coming out of a DataFrame column are float NaN, not str
    if text is None or (isinstance(text, float) and np.isnan(text)):
        text = ""

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Keep the inputs on the same device as the model (no-op when both are on CPU)
    inputs = inputs.to(model.device)

    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model(**inputs)

    # Get the [CLS] token's embedding; .cpu() is required before .numpy() when
    # the tensor lives on an accelerator and is a no-op otherwise
    cls_embedding = outputs.last_hidden_state[0, 0].cpu().numpy()
    return cls_embedding

In [12]:
# Embed the book titles in small batches, appending each finished batch to
# books_with_embeddings.csv and recording the next batch index in progress.txt
# so an interrupted run can be resumed by re-running this cell.
batch_size = 10
# Ceiling division: avoids scheduling an extra, empty batch when the number of
# rows is an exact multiple of batch_size
num_batches = (len(books_df) + batch_size - 1) // batch_size

# Check if there is a progress file
progress_file = "progress.txt"
start_batch = 0
if os.path.exists(progress_file):
    with open(progress_file, "r") as f:
        start_batch = int(f.read().strip())

# Index of the first batch whose rows are NOT yet in the output CSV. It is
# advanced only after a batch has been written, so the interrupt handler never
# records a batch as pending once its rows are already on disk (which would
# duplicate them on resume), and it is defined even if the loop never starts.
next_batch = start_batch

try:
    for i in range(start_batch, num_batches):
        batch_df = books_df.iloc[i*batch_size:(i+1)*batch_size].copy()
        # Convert each numpy embedding to a plain list before writing it to CSV
        batch_df['embedding'] = batch_df['Book-Title'].apply(
            lambda title: get_bert_embedding(title).tolist()
        )

        if i == 0 and start_batch == 0:
            # Fresh run: (re)create the file and write the header row
            batch_df.to_csv("books_with_embeddings.csv", index=False, mode='w')
        else:
            # Resumed or later batch: append rows only
            batch_df.to_csv("books_with_embeddings.csv", index=False, mode='a', header=False)
        next_batch = i + 1

        # Save progress
        with open(progress_file, "w") as f:
            f.write(str(next_batch))

        print(f"Processed batch {i+1}/{num_batches}")

except KeyboardInterrupt:
    # Save progress on interruption
    with open(progress_file, "w") as f:
        f.write(str(next_batch))
    print(f"Process interrupted. Progress saved at batch {next_batch}.")

Processed batch 159/272
Processed batch 160/272
Processed batch 161/272
Processed batch 162/272
Processed batch 163/272
Processed batch 164/272
Process interrupted. Progress saved at batch 164.


In [18]:
import pandas as pd

# Load the CSV file
# books_with_embeddings.csv is written by DataFrame.to_csv with its default
# settings (comma-separated, UTF-8), so it is read back with the read_csv
# defaults. Reading it with delimiter=';' / encoding='latin1' (the settings of
# the raw books.csv) mis-parses the columns, and on_bad_lines='skip' would
# silently drop rows and make the count below unreliable.
books_df = pd.read_csv("books_with_embeddings.csv")

# Get the number of rows
num_rows = len(books_df)

print(f"The number of rows in the CSV file is: {num_rows}")

The number of rows in the CSV file is: 1832
