# In [None]:
import csv
import os

import faiss
import hnswlib
import nltk
import numpy as np
import pandas as pd
import PyPDF2
import torch
from annoy import AnnoyIndex
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModel, GPT2Tokenizer, GPT2Model, RobertaTokenizer, RobertaModel

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases load it
# from the 'punkt_tab' package while older ones use 'punkt'; fetch both so the
# script works with either.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

def load_model_and_tokenizer(model_name):
    """Load the pretrained model and tokenizer for ``model_name``.

    The classes are picked from the prefix of ``model_name``: ``gpt2*`` and
    ``roberta*`` use their dedicated classes, while ``bert*``, ``bart*`` and
    ``xlnet*`` go through the ``Auto*`` classes.

    Returns:
        A ``(model, tokenizer)`` tuple -- note that the model comes first.

    Raises:
        ValueError: if ``model_name`` starts with none of the known prefixes.
    """
    if model_name.startswith('gpt2'):
        tokenizer_cls, model_cls = GPT2Tokenizer, GPT2Model
    elif model_name.startswith('roberta'):
        tokenizer_cls, model_cls = RobertaTokenizer, RobertaModel
    elif model_name.startswith(('bert', 'bart', 'xlnet')):
        tokenizer_cls, model_cls = AutoTokenizer, AutoModel
    else:
        raise ValueError("Unsupported model name")

    # Same loading order as before: tokenizer first, then the model weights.
    tokenizer = tokenizer_cls.from_pretrained(model_name)
    model = model_cls.from_pretrained(model_name)
    return model, tokenizer

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Pages for which nothing is extracted (``None`` or an empty string)
    contribute no text. Page texts are joined without a separator.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Extract each page exactly once (the original called extract_text()
        # twice per page), and do it while the file is still open.
        page_texts = [page.extract_text() for page in pdf_reader.pages]
    return "".join(page_text for page_text in page_texts if page_text)

def preprocess_text(text):
    """Tokenize ``text`` and return its purely alphabetic tokens, lowercased.

    Tokens for which ``str.isalpha()`` is false are dropped; the order of the
    remaining tokens is preserved and duplicates are kept.
    """
    words = []
    for raw_token in word_tokenize(text):
        if raw_token.isalpha():
            words.append(raw_token.lower())
    return words

def tokenize_and_vectorize(tokens, model, tokenizer):
    """Embed each distinct token with ``model``; return vectors and tokens.

    Every unique token is passed through the model on its own and represented
    by the mean of ``last_hidden_state`` over dimension 1.

    Args:
        tokens: iterable of token strings; duplicates are embedded only once.
        model: callable that accepts the tokenizer output as keyword
            arguments and returns an object with a ``last_hidden_state``
            tensor.
        tokenizer: callable invoked as ``tokenizer(token, return_tensors="pt")``.

    Returns:
        ``(vectors, unique_tokens)`` where row ``i`` of the ``vectors`` array
        is the embedding of ``unique_tokens[i]``. Tokens are listed in order
        of first appearance.
    """
    # De-duplicate once and reuse the same list for both the loop and the
    # return value, so vectors and tokens are aligned by construction. Using
    # dict.fromkeys (rather than a set) also keeps the order stable between
    # runs, which string hash randomisation would otherwise change.
    unique_tokens = list(dict.fromkeys(tokens))

    vectors = []
    for token in unique_tokens:
        inputs = tokenizer(token, return_tensors="pt")
        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(**inputs)
        vector = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
        vectors.append(vector)
    return np.array(vectors), unique_tokens

def create_annoy_index(vectors, dimension, n_trees=10):
    """Build an Annoy index using the 'angular' metric over ``vectors``.

    Item ``i`` of the index is ``vectors[i]``. ``dimension`` is the length of
    each vector and ``n_trees`` is passed to ``build``; adjust it to your
    needs.
    """
    index = AnnoyIndex(dimension, 'angular')
    for item_id, embedding in enumerate(vectors):
        index.add_item(item_id, embedding)
    index.build(n_trees)
    return index

def create_faiss_index(vectors):
    """Build a FAISS ``IndexFlatL2`` holding every row of ``vectors``.

    The index dimension is taken from ``vectors.shape[1]``, so ``vectors``
    must be a 2-D array.
    """
    flat_index = faiss.IndexFlatL2(vectors.shape[1])
    flat_index.add(vectors)
    return flat_index

def create_hnsw_index(vectors, dimension, space='l2', M=16, ef_construction=200, ef=50):
    """Build an hnswlib index over ``vectors``.

    Args:
        vectors: 2-D array; its first dimension is used as the index capacity.
        dimension: length of each vector.
        space: hnswlib distance space.
        M: graph connectivity parameter passed to ``init_index``.
        ef_construction: build-time parameter passed to ``init_index``.
        ef: query-time parameter passed to ``set_ef``. hnswlib documents that
            it must not be lower than the number of neighbours requested per
            query, so the default is kept above the 15 neighbours that
            ``find_nearest_neighbors`` asks for by default (the previous
            hard-coded value was 10).

    Returns:
        The populated ``hnswlib.Index``.
    """
    hnsw_index = hnswlib.Index(space=space, dim=dimension)
    num_elements = vectors.shape[0]
    hnsw_index.init_index(max_elements=num_elements, ef_construction=ef_construction, M=M)
    hnsw_index.add_items(vectors)
    hnsw_index.set_ef(ef)
    return hnsw_index

def find_nearest_neighbors(query_vector, index, index_type, n_neighbors=15):
    """Return the ids of the items in ``index`` closest to ``query_vector``.

    Args:
        query_vector: 1-D vector to search for.
        index: an index object matching ``index_type``.
        index_type: one of ``'annoy'``, ``'faiss'`` or ``'hnsw'``.
        n_neighbors: maximum number of neighbours to return.

    Returns:
        A sequence of item ids (a list for Annoy, a NumPy array otherwise).
        It may be shorter than ``n_neighbors`` when the index holds fewer
        items.

    Raises:
        ValueError: if ``index_type`` is not one of the supported names.
    """
    if index_type == 'annoy':
        nearest_ids = index.get_nns_by_vector(query_vector, n_neighbors)
    elif index_type == 'faiss':
        # FAISS expects a 2-D float32 batch of queries.
        query_batch = np.array([query_vector]).astype('float32')
        _, id_batch = index.search(query_batch, n_neighbors)
        ids = id_batch[0]
        # FAISS pads missing results with -1; drop those so callers never
        # index their vocabulary with -1 (which would pick the last entry).
        nearest_ids = ids[ids >= 0]
    elif index_type == 'hnsw':
        # hnswlib raises when asked for more neighbours than it stores.
        k = min(n_neighbors, index.get_current_count())
        if k == 0:
            return np.array([], dtype=int)
        labels, _ = index.knn_query(query_vector, k=k)
        nearest_ids = labels[0]
    else:
        raise ValueError(f"Unsupported index type: {index_type!r}")
    return nearest_ids

def process_files_and_save_results(pdf_filenames, model_names, target_word, pdf_directory, year, n_neighbors=15):
    """Find the nearest neighbours of ``target_word`` in a year's PDFs.

    The text of all PDFs is concatenated and tokenized once. For every model
    in ``model_names`` the unique tokens are embedded, indexed with Annoy,
    FAISS and HNSW, and the words closest to ``target_word`` are looked up in
    each index. Results are written to ``nearest_neighbors_results_{year}.csv``
    and to an ``.xlsx`` file of the same name in the current working
    directory.

    Args:
        pdf_filenames: names of the PDF files, relative to ``pdf_directory``.
        model_names: names accepted by ``load_model_and_tokenizer``.
        target_word: word whose neighbours are searched.
        pdf_directory: directory containing the PDFs; ``''`` means the
            current working directory.
        year: label used in the log output, the result rows and file names.
        n_neighbors: maximum number of neighbours reported per index.

    Returns:
        The name of the Excel file that was written.
    """
    # os.path.join keeps relative names relative when pdf_directory is ''
    # (plain '/'-concatenation would turn them into '/<name>').
    aggregated_text = "".join(
        extract_text_from_pdf(os.path.join(pdf_directory, pdf_filename))
        for pdf_filename in pdf_filenames
    )

    # Tokenization does not depend on the model, so do it once up front.
    tokens = preprocess_text(aggregated_text)

    results = []
    if not tokens:
        # With nothing to index the FAISS/HNSW builders would fail; emit an
        # empty (header-only) result file instead of crashing.
        print(f"No tokens extracted for year {year}; writing empty results")
    else:
        for model_name in model_names:
            model, tokenizer = load_model_and_tokenizer(model_name)

            print(f"Processing year {year} with {model_name}")
            vectors, unique_tokens = tokenize_and_vectorize(tokens, model, tokenizer)

            # Vectorize the target word
            target_vector, _ = tokenize_and_vectorize([target_word], model, tokenizer)

            # Get the vector dimension from the loaded model
            dimension = model.config.hidden_size

            # Create indexes for vectors
            indexes = [
                ('annoy', create_annoy_index(vectors, dimension)),
                ('faiss', create_faiss_index(vectors)),
                ('hnsw', create_hnsw_index(vectors, dimension)),
            ]

            # Never ask for more neighbours than there are indexed words.
            k = min(n_neighbors, len(unique_tokens))

            for index_type, index in indexes:
                nearest_ids = find_nearest_neighbors(target_vector[0], index, index_type, n_neighbors=k)
                nearest_words = [unique_tokens[token_id] for token_id in nearest_ids]
                results.append([year, model_name, index_type, nearest_words])

    # Save results to CSV. Explicit UTF-8 so that non-ASCII words survive the
    # read-back below regardless of the platform's default encoding.
    csv_filename = f'nearest_neighbors_results_{year}.csv'
    with open(csv_filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Year', 'Model Used', 'Index Type', 'Nearest Neighbors'])
        writer.writerows(results)

    # Convert CSV to Excel. Reading the CSV back turns each neighbour list
    # into its string form, which is what ends up in the spreadsheet cell.
    df_results = pd.read_csv(csv_filename, encoding='utf-8')
    excel_filename = csv_filename.replace('.csv', '.xlsx')
    df_results.to_excel(excel_filename, index=False)

    return excel_filename


# Load the Excel file to get PDF filenames
excel_path = 'files.xlsx'  # Update this path. The sheet is expected to have a column named 'Year' and another column named 'Filename'
df = pd.read_excel(excel_path)

# I had a list of PDF files from 2010 to 2023. Modify this according to your needs
start_year = 2010
end_year = 2023

# Define models to use
# Depending on available memory and computational power you can select or deselect these models
model_names = ['xlnet-base-cased', 'gpt2', 'gpt2-medium', 'gpt2-large', 'roberta-base', 'roberta-large', 'bert-base-uncased', 'bert-large-uncased']

# Define the target word
target_word = 'inflation'

# Define the directory containing PDF files
# Update this path. '.' means the current working directory; you can also use
# an absolute path like '/path/to/pdf/files'
pdf_directory = '.'

# Use the documented 'Filename' column when present; otherwise fall back to
# the first column, as earlier versions of this script did.
filename_column = 'Filename' if 'Filename' in df.columns else df.columns[0]

for year in range(start_year, end_year + 1):
    # Filter the DataFrame based on the year
    df_filtered = df[df['Year'] == year]
    pdf_filenames = df_filtered[filename_column].tolist()

    # A year without any listed PDF has nothing to embed or index
    if not pdf_filenames:
        print(f"No PDF files listed for year {year}; skipping")
        continue

    # Process the PDF files and save the results
    excel_results_filename = process_files_and_save_results(pdf_filenames, model_names, target_word, pdf_directory, year)
    print(f"Results for year {year} saved to {excel_results_filename}")