In [4]:
!pip install pandas sentence-transformers faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [42]:
import csv
import gc
import os
import pickle
import time
from urllib.parse import quote

import faiss
import numpy as np
import pandas as pd
import requests
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [None]:
# Location of the source CSV.
# NOTE(review): 'path' looks like a placeholder - point it at the real file before running.
dataset_path = 'path'
# The file is decoded as ISO-8859-1 instead of pandas' default encoding.
dataset = pd.read_csv(dataset_path, encoding='ISO-8859-1')


In [None]:
# Draw a reproducible (fixed-seed) random sample of 30,000 rows from the full dataset.
df_sample = dataset.sample(n=30000, random_state=42)
# Keep an independent copy holding only the person name and the Wikidata ID.
df = df_sample.loc[:, ['name', 'wikidata_code']].copy()
len(df)

30000

In [None]:

# Function to fetch the long description (English Wikipedia summary) of a Wikidata entity
def fetch_long_wikidata_description(wikidata_code, timeout=10):
    """
    Return the English Wikipedia summary extract for a Wikidata entity.

    Two HTTP requests are made: the Wikidata `wbgetentities` API is asked for the
    entity's English Wikipedia sitelink, then the Wikipedia REST `page/summary`
    endpoint is queried for that page title.

    Args:
        wikidata_code (str): Wikidata entity ID, e.g. "Q42".
        timeout (float): Seconds to wait on each HTTP request before giving up.

    Returns:
        str | None: The summary `extract`, or None when the code is missing or
        not a string, the entity has no English Wikipedia page, or a request
        fails / returns an unparsable body.
    """
    # Missing IDs come out of pandas as NaN (a float); never send those to the API.
    if not isinstance(wikidata_code, str) or not wikidata_code.strip():
        return None
    wikidata_code = wikidata_code.strip()

    wikidata_url = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbgetentities',
        'ids': wikidata_code,
        'format': 'json',
        'props': 'sitelinks',
        # Only the English Wikipedia sitelink is needed.
        'sitefilter': 'enwiki',
    }

    try:
        # Without a timeout a single stalled connection would block the whole run.
        response = requests.get(wikidata_url, params=params, timeout=timeout)
        response.raise_for_status()
        entities = response.json().get('entities', {})

        entity = entities.get(wikidata_code)
        if entity is None and len(entities) == 1:
            # Exactly one ID was requested, so a lone entity under another key
            # (e.g. the target of a redirected ID) is the one we asked for.
            entity = next(iter(entities.values()))

        sitelinks = (entity or {}).get('sitelinks', {})
        en_wikipedia_title = sitelinks.get('enwiki', {}).get('title')
        if not en_wikipedia_title:
            return None

        # The title is a single URL path segment: characters such as '/', '?' and '#'
        # must be percent-encoded or the request hits a different (or invalid) URL.
        encoded_title = quote(en_wikipedia_title.replace(' ', '_'), safe='')
        wikipedia_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{encoded_title}"
        response = requests.get(wikipedia_url, timeout=timeout)
        response.raise_for_status()

        return response.json().get('extract', None)
    except (requests.RequestException, ValueError):
        # ValueError covers a non-JSON response body on requests versions whose
        # JSON decode error is not a RequestException.
        return None

# Function to fetch a short description from Wikidata directly
def get_wikidata_description(wikidata_code, timeout=10):
    """
    Return the short English description stored on a Wikidata entity.

    Args:
        wikidata_code (str): Wikidata entity ID, e.g. "Q42".
        timeout (float): Seconds to wait on the HTTP request before giving up.

    Returns:
        str: The English description on success. On failure a status message is
        returned instead of raising: "Entity not found.", "Description not
        available." or "Error fetching data: <reason>". Callers must therefore
        not treat every non-empty return value as a real description.
    """
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_code}.json"
    try:
        # Without a timeout a single stalled connection would block the caller forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        entities = response.json().get('entities', {})
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON response body on requests versions whose
        # JSON decode error is not a RequestException.
        return f"Error fetching data: {e}"

    entity_data = entities.get(wikidata_code, {})
    if not entity_data and len(entities) == 1:
        # One entity was requested, so a lone entity stored under another key
        # (e.g. the target of a redirected ID) is the one we asked for.
        entity_data = next(iter(entities.values()))

    if not entity_data:
        return "Entity not found."

    return entity_data.get('descriptions', {}).get('en', {}).get('value', "Description not available.")

# Process dataset and fetch biographies

# Marker stored for rows whose biography could not be retrieved.
_FAILED_BIOGRAPHY = "Failed to fetch biography"
# Status messages that get_wikidata_description returns in place of a description.
_ENTITY_NOT_FOUND = "Entity not found."
_DESCRIPTION_UNAVAILABLE = "Description not available."
_FETCH_ERROR_PREFIX = "Error fetching data:"


def _is_real_description(text):
    """Return True when `text` is an actual description rather than a status message."""
    if not isinstance(text, str) or not text.strip():
        return False
    if text in (_ENTITY_NOT_FOUND, _DESCRIPTION_UNAVAILABLE):
        return False
    return not text.startswith(_FETCH_ERROR_PREFIX)


def _fetch_biography(wikidata_code, rate_limit, max_retries):
    """Try the long, then the short description up to `max_retries` times; None if all fail."""
    for attempt in range(max_retries):
        biography = fetch_long_wikidata_description(wikidata_code)
        if biography:
            return biography

        short_description = get_wikidata_description(wikidata_code)
        if _is_real_description(short_description):
            return short_description
        if short_description == _ENTITY_NOT_FOUND:
            # The entity does not exist, so asking again cannot succeed.
            return None

        if attempt < max_retries - 1:
            time.sleep(rate_limit)  # Wait before retrying
    return None


def add_biographies_to_dataset(df, wikidata_column, rate_limit=0.1, max_retries=3):
    """
    Adds a biography column to the dataset by fetching descriptions for each Wikidata code,
    with a progress bar and rate limiting.

    For every row the long (Wikipedia summary) description is tried first and the
    short Wikidata description second. The status messages the short-description
    helper returns on failure ("Entity not found.", "Description not available.",
    "Error fetching data: ...") count as failures: they trigger a retry and are
    never stored as a biography.

    Args:
        df (pd.DataFrame): The input dataset. It is modified in place.
        wikidata_column (str): The name of the column with Wikidata codes.
        rate_limit (float): Delay between requests in seconds.
        max_retries (int): Maximum fetch attempts per Wikidata code.

    Returns:
        pd.DataFrame: The same `df` object with a new 'biography' column. Rows that
        could not be resolved (including rows whose code is missing or not a
        string) contain "Failed to fetch biography".
    """
    biographies = []

    for wikidata_code in tqdm(df[wikidata_column], desc="Fetching biographies"):
        biography = None

        # Missing codes are NaN floats; skip them without any network traffic.
        if isinstance(wikidata_code, str) and wikidata_code.strip():
            biography = _fetch_biography(wikidata_code.strip(), rate_limit, max_retries)
            time.sleep(rate_limit)  # Rate limiting to avoid overwhelming the server

        biographies.append(biography if biography else _FAILED_BIOGRAPHY)

    df['biography'] = biographies
    return df

In [None]:
# Fetch a biography for every sampled row, waiting 0.2 s between requests.
# NOTE: this is slow - the recorded run took close to 9 hours for 30,000 rows.
df = add_biographies_to_dataset(df, 'wikidata_code', rate_limit=0.2)

Fetching biographies: 100%|██████████| 30000/30000 [8:54:03<00:00,  1.07s/it]


In [None]:
# Destination CSV for the DataFrame with its fetched biographies.
# NOTE(review): "path" looks like a placeholder - set it to the real file location.
path = "path"

# Save the DataFrame as CSV; index=False leaves the row index out of the file.

df.to_csv(path, index=False)

In [6]:
# Reload the DataFrame from the CSV at `path`, replacing the in-memory `df`.

df = pd.read_csv(path)

In [8]:
# Load the 'all-MiniLM-L6-v2' sentence-embedding model.
# device='cuda' places it on the GPU, so this cell needs a CUDA-capable runtime.
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
# Plain Python list of biography texts, in DataFrame row order.
# astype(str) guarantees every entry is a string (a missing value becomes the text 'nan').
biographies = df["biography"].astype(str).tolist()

In [16]:
# Embeddings

# Encode the biographies in fixed-size batches so progress can be reported and the
# per-batch temporaries can be released before the next batch is processed.
encoding_batch_size = 512

embeddings = []

# The context manager closes the progress bar even if encoding fails part-way through.
with tqdm(total=len(biographies), desc="Encoding biographies") as encoding_pbar:
    for i in range(0, len(biographies), encoding_batch_size):
        batch = biographies[i:i + encoding_batch_size]
        batch_embeddings = model.encode(batch, batch_size=encoding_batch_size, show_progress_bar=False)
        embeddings.extend(batch_embeddings)
        encoding_pbar.update(len(batch))

        # Drop the references to this batch before the next one is created.
        del batch, batch_embeddings
        gc.collect()

Encoding biographies: 100%|██████████| 30000/30000 [01:26<00:00, 347.45it/s]


In [17]:
# Stack the per-biography vectors into a single 2-D array.
# FAISS only accepts float32 input, so the dtype is fixed here.
embeddings = np.array(embeddings, dtype='float32')

# Sanity check: one row per biography, one column per embedding dimension.
print(f"Shape of embeddings: {embeddings.shape}")
print(f"embeddings size: {len(embeddings)}")

Shape of embeddings: (30000, 384)
embeddings size: 30000


In [None]:
# Save the embedding matrix to disk with pickle so it can be reloaded without re-encoding.
# NOTE(review): 'path' looks like a placeholder - the recorded output below shows a
# Google Drive location; set the real destination before running.

with open('path', 'wb') as f:
    pickle.dump(embeddings, f)

print("Embeddings saved to 'path'")

Embeddings saved to '/content/drive/MyDrive/Mineria/embeddings.pkl'


In [23]:
# Create an empty FAISS IndexFlatL2 whose dimension matches the embedding width.
# (The vectors themselves are added in the next cell.)

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
# Report the index's is_trained flag; the recorded run printed True.
print(f"Is the index trained? {index.is_trained}")

Is the index trained? True


In [24]:

# Add every embedding to the index; vector i in the index corresponds to row i of `embeddings`.
# NOTE: re-running this cell adds the same vectors again - recreate the index first.
index.add(embeddings)
# index.ntotal should equal the number of embeddings that were added.
print(f"Number of vectors in the index: {index.ntotal}")

Number of vectors in the index: 30000


In [None]:
# Save the populated FAISS index to disk so it can be reloaded with faiss.read_index.
# NOTE(review): 'path' looks like a placeholder - set the real destination file.

faiss.write_index(index, 'path')
print("FAISS index saved successfully.")

FAISS index saved successfully.


In [49]:
# Initializer helper function

def _is_npy_path(file_path):
    """Return True when `file_path` has a .npy extension (case-insensitive)."""
    return os.path.splitext(os.fspath(file_path))[1].lower() == '.npy'


def _load_embeddings(embeddings_path):
    """Load an embedding matrix from a .npy file or, for any other extension, a pickle file."""
    if _is_npy_path(embeddings_path):
        loaded = np.load(embeddings_path)
    else:
        # SECURITY: unpickling can execute arbitrary code - only load files you created yourself.
        with open(embeddings_path, 'rb') as f:
            loaded = pickle.load(f)
    # FAISS expects a contiguous float32 matrix.
    return np.ascontiguousarray(loaded, dtype='float32')


def _save_embeddings(embeddings_path, embeddings):
    """Save embeddings in the format `_load_embeddings` reads back for the same path."""
    if _is_npy_path(embeddings_path):
        np.save(embeddings_path, embeddings)
    else:
        # np.save would silently append '.npy' to any other file name, so the file
        # could never be found again under `embeddings_path`; use pickle instead.
        with open(embeddings_path, 'wb') as f:
            pickle.dump(embeddings, f)


def initialize_components(model_name='all-MiniLM-L6-v2',
                          data_path=None,
                          embeddings_path=None,
                          faiss_index_path=None):
    """
    Initialize and return the necessary components:
    - model: Embedding model.
    - index: FAISS index.
    - df: DataFrame with 'name' and 'biography'.
    - embeddings: Numpy array of embeddings.

    Parameters:
    - model_name (str): The name of the SentenceTransformer model to load.
    - data_path (str): Path to the CSV file containing 'name' and 'biography' columns.
                       Required, and the file must exist.
    - embeddings_path (str): Path to precomputed embeddings ('.npy' files are read with
                              numpy, any other extension with pickle). If None or the
                              file does not exist, embeddings are generated from the
                              DataFrame and, when a path was given, saved there.
    - faiss_index_path (str): Path to the FAISS index file.
                               If None or the file does not exist, a new index is built
                               from the embeddings and, when a path was given, saved there.

    Returns:
    - model: Pre-trained SentenceTransformer model.
    - index: FAISS index (loaded from disk or built from the embeddings).
    - df: Pandas DataFrame containing 'name' and 'biography'.
    - embeddings: float32 Numpy array of embeddings.

    Raises:
    - ValueError: If `data_path` is not given or the CSV lacks the required columns.
    - FileNotFoundError: If `data_path` does not exist.
    """
    # Fail fast, before the (slow) model load, when there is no data to work on.
    if not data_path:
        raise ValueError("data_path is required: no DataFrame can be loaded without it.")
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Data file not found: {data_path}")

    # Initialize the embedding model
    print("Loading the embedding model...")
    model = SentenceTransformer(model_name)

    # Load your DataFrame
    print(f"Loading the DataFrame from {data_path}...")
    df = pd.read_csv(data_path)
    if not {'name', 'biography'}.issubset(df.columns):
        raise ValueError("DataFrame must contain 'name' and 'biography' columns.")

    # Load or generate embeddings
    if embeddings_path and os.path.exists(embeddings_path):
        print(f"Loading embeddings from {embeddings_path}...")
        embeddings = _load_embeddings(embeddings_path)
    else:
        print("Generating embeddings for the DataFrame entries...")
        # astype(str) keeps missing biographies (NaN floats) from reaching the encoder.
        embeddings = model.encode(df['biography'].astype(str).tolist(), show_progress_bar=True)
        embeddings = np.array(embeddings).astype('float32')
        if embeddings_path:
            print(f"Saving embeddings to {embeddings_path}...")
            _save_embeddings(embeddings_path, embeddings)
            print("Embeddings saved.")

    # Load or build FAISS index
    if faiss_index_path and os.path.exists(faiss_index_path):
        print(f"Loading FAISS index from {faiss_index_path}...")
        index = faiss.read_index(faiss_index_path)
    else:
        print("Building the FAISS index...")
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings)
        print(f"FAISS index built with {index.ntotal} vectors.")
        if faiss_index_path:
            print(f"Saving FAISS index to {faiss_index_path}...")
            faiss.write_index(index, faiss_index_path)
            print("FAISS index saved.")

    # Search results are mapped back to DataFrame rows by position, so all three
    # components must describe the same rows in the same order.
    if not (len(df) == len(embeddings) == index.ntotal):
        print(f"Warning: size mismatch - DataFrame has {len(df)} rows, "
              f"embeddings has {len(embeddings)} rows, index has {index.ntotal} vectors.")

    return model, index, df, embeddings

In [50]:
# Search helper function

def _prompt_for_queries():
    """Read queries from the user until 'DONE' is entered; blank entries are rejected."""
    queries = []
    while True:
        query = input("Enter your query (or type 'DONE' to finish): ").strip()
        if query.upper() == 'DONE':
            return queries
        if query:
            queries.append(query)
        else:
            print("Empty query. Please enter a valid query.")


def _write_tsv(file_path, rows):
    """Write `rows` (an iterable of sequences) to `file_path` as tab-separated values."""
    directory = os.path.dirname(file_path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(file_path, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f, delimiter='\t').writerows(rows)


def interactive_faiss_search(model, index, df, embeddings, top_k=10,
                            embeddings_output='embedding_projector_embeddings.tsv',
                            metadata_output='embedding_projector_metadata.tsv'):
    """
    Interactive function to perform FAISS search based on user input queries.
    Prompts the user to enter queries, performs the search, and saves the results.

    The embeddings and names of all hits are written as two header-less TSV files
    (one row per hit, same order in both files).

    Parameters:
    - model: The embedding model with an `encode` method.
    - index: FAISS index object.
    - df: DataFrame containing the data with 'name' and 'biography' columns.
          Row i must correspond to vector i of the index.
    - embeddings: Numpy array of all embeddings used in FAISS.
    - top_k (int): Number of top results to retrieve per query.
    - embeddings_output (str): Filename for saving embeddings.
    - metadata_output (str): Filename for saving metadata.

    Returns:
    - results_df: DataFrame containing all search results (columns: query, rank, name,
      distance, biography, embedding). It is empty, and no files are written, when no
      query produced a hit. None is returned when no query was entered or an error
      occurred (the error is printed, not raised).
    """
    try:
        print("\n--- Interactive FAISS Search ---")
        print("Enter your queries one by one. Type 'DONE' when you are finished.\n")

        queries = _prompt_for_queries()
        if not queries:
            print("No queries entered. Exiting the search function.")
            return None

        print(f"\nEncoding {len(queries)} query(ies)...")
        query_embeddings = model.encode(queries, batch_size=5, show_progress_bar=True)
        query_embeddings = np.array(query_embeddings).astype('float32')

        # A hit is only usable when it exists in both the DataFrame and the embeddings.
        valid_rows = min(len(df), len(embeddings))
        all_results = []

        print("\nProcessing Queries...")
        for i, query in enumerate(tqdm(queries, desc="Processing", unit="query")):
            print(f"\nQuery {i+1}: {query}")

            # Reshape for FAISS
            q_embedding = query_embeddings[i].reshape(1, -1)

            # Perform the search
            distances, indices = index.search(q_embedding, top_k)

            # Retrieve and display the top-K results
            for rank, (distance, idx) in enumerate(zip(distances[0], indices[0]), 1):
                idx = int(idx)
                if idx < 0:
                    # FAISS pads with -1 when fewer than top_k neighbours exist.
                    # df.iloc[-1] would silently return the LAST row as a bogus hit.
                    continue
                if idx >= valid_rows:
                    print(f"Warning: Index {idx} is out of bounds for the DataFrame.")
                    continue

                row = df.iloc[idx]
                person_name = row['name']
                all_results.append({
                    'query': query,
                    'rank': rank,
                    'name': person_name,
                    'distance': distance,
                    'biography': row['biography'],
                    'embedding': embeddings[idx].tolist()
                })
                print(f"Rank {rank}: {person_name} (Distance: {distance:.4f})")

        # Create a DataFrame from the results; explicit columns keep the schema stable
        # even when there are no hits.
        results_df = pd.DataFrame(
            all_results,
            columns=['query', 'rank', 'name', 'distance', 'biography', 'embedding']
        )
        if results_df.empty:
            print("No results were found for the given queries.")
            return results_df

        # Save embeddings to TSV
        print(f"\nSaving embeddings to '{embeddings_output}'...")
        _write_tsv(embeddings_output, results_df['embedding'].tolist())
        print(f"Embeddings successfully saved to '{embeddings_output}'.")

        # Save metadata to TSV without header
        print(f"Saving metadata to '{metadata_output}' without a header row...")
        _write_tsv(metadata_output, ([name] for name in results_df['name'].tolist()))
        print(f"Metadata successfully saved to '{metadata_output}' without a header row.")

        print("\n--- FAISS Search Completed Successfully ---")
        return results_df

    except Exception as e:
        # Best-effort by design: report the problem instead of raising, so the
        # interactive session is not interrupted by a traceback.
        print(f"An error occurred during the FAISS search process: {e}")
        return None


In [None]:
# Interactive Experience

def main():
    """
    Main function to initialize components and execute the interactive FAISS search.

    Returns:
    - The value returned by `interactive_faiss_search`: the results DataFrame, or
      None when no query was entered or the search failed.
    """
    # NOTE(review): these three 'path' values look like placeholders - set them to the
    # CSV, embeddings and FAISS index files before running.
    data_path = 'path'
    embeddings_path = 'path'
    faiss_index_path = 'path'

    # Initialize components
    model, index, df, embeddings = initialize_components(
        model_name='all-MiniLM-L6-v2',
        data_path=data_path,
        embeddings_path=embeddings_path,
        faiss_index_path=faiss_index_path
    )

    # Execute the interactive FAISS search and hand the results back to the caller
    # instead of discarding them.
    return interactive_faiss_search(
        model=model,
        index=index,
        df=df,
        embeddings=embeddings,
        top_k=10,
        embeddings_output='embedding_projector_embeddings.tsv',
        metadata_output='embedding_projector_metadata.tsv'
    )

if __name__ == "__main__":
    # Keep the results so later cells can inspect them.
    results_df = main()

Loading the embedding model...
Loading the DataFrame from /content/drive/MyDrive/Mineria/df.csv...
Loading embeddings from /content/drive/MyDrive/Mineria/embeddings.pkl...
Loading FAISS index from /content/drive/MyDrive/Mineria/notable_people_faiss.index...

--- Interactive FAISS Search ---
Enter your queries one by one. Type 'DONE' when you are finished.

Enter your query (or type 'DONE' to finish): man who discovered calculus
Enter your query (or type 'DONE' to finish): DONE

Encoding 1 query(ies)...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Processing Queries...


Processing: 100%|██████████| 1/1 [00:00<00:00, 48.32query/s]


Query 1: man who discovered calculus
Rank 1: George_B._Thomas (Distance: 0.7123)
Rank 2: Stephen_M._Robinson (Distance: 0.7549)
Rank 3: Max_Steck (Distance: 0.8000)
Rank 4: Albert_Edrei (Distance: 0.8490)
Rank 5: John_Speidell (Distance: 0.8642)
Rank 6: William_Fogg_Osgood (Distance: 0.8681)
Rank 7: Alexander_Weinstein (Distance: 0.8811)
Rank 8: Leslie_Fox (Distance: 0.8902)
Rank 9: Julio_Cesar_Firrufino (Distance: 0.9068)
Rank 10: Corrado_Ciamberlini (Distance: 0.9343)

Saving embeddings to 'embedding_projector_embeddings.tsv'...
Embeddings successfully saved to 'embedding_projector_embeddings.tsv'.
Saving metadata to 'embedding_projector_metadata.tsv' without a header row...
Metadata successfully saved to 'embedding_projector_metadata.tsv' without a header row.

--- FAISS Search Completed Successfully ---



