In [6]:
import pickle

import pandas as pd
import spacy
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from transformers import pipeline, BertTokenizer, BertModel

In [12]:
# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

# spaCy English model; `preprocess_text_spacy` below calls it through `nlp`.
nlp = spacy.load("en_core_web_md")

# Input CSV, read from the ../bbdd_rag/ directory.
file_path = "output.csv"
data = pd.read_csv(f'../bbdd_rag/{file_path}')

def preprocess_text_spacy(text):
    """
    Reduce a text to its content words using spaCy.

    Keeps only purely alphabetic tokens whose part of speech is noun,
    proper noun or adjective, preserving their order and surface form.

    Args:
        text: Raw text to process. Any value that is not a non-blank string
            (None, a float NaN from an empty CSV cell, "", whitespace) is
            treated as empty instead of being passed to spaCy.

    Returns:
        str: The retained tokens joined by single spaces, or "" when there
        is nothing to process.
    """
    # pandas yields float NaN for missing cells, which spaCy refuses to parse;
    # blank strings produce no alphabetic tokens anyway, so skip the pipeline.
    if not isinstance(text, str) or not text.strip():
        return ""
    doc = nlp(text)
    kept_pos = ("NOUN", "PROPN", "ADJ")
    tokens = [token.text for token in doc if token.is_alpha and token.pos_ in kept_pos]
    return " ".join(tokens)

# Run the spaCy preprocessing over every abstract (with a tqdm progress bar)
# and store the result as a new column.
data['processed_abstract'] = data['abstract'].progress_apply(preprocess_text_spacy)

# Fit TF-IDF on the preprocessed abstracts; the matrix is built from the
# column in row order, and `tfidf_keywords` holds the term for each column.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(data['processed_abstract'])
tfidf_keywords = tfidf.get_feature_names_out()

def extract_keywords_per_document(row_idx, tfidf_matrix, feature_names, max_keywords=10):
    """
    Return the highest-weighted TF-IDF terms of a single document.

    Args:
        row_idx: Index of the row (document) of `tfidf_matrix` to inspect.
        tfidf_matrix: Sparse document-term matrix that supports row indexing
            and `.toarray()`.
        feature_names: Sequence mapping a column index to its term.
        max_keywords: Upper bound on the number of terms returned.

    Returns:
        list: At most `max_keywords` terms ordered by descending weight.
        Terms with zero weight are never returned, so the list is shorter
        (possibly empty) for documents with fewer non-zero terms.
    """
    row_vector = tfidf_matrix[row_idx].toarray().ravel()
    # Only terms that actually occur in the document are candidates; sorting
    # the whole row would pad short documents with zero-weight vocabulary.
    present = row_vector.nonzero()[0]
    # Stable sort on negated weights: descending weight, ties broken by
    # ascending column index so the result is deterministic.
    ranked = present[(-row_vector[present]).argsort(kind="stable")]
    return [feature_names[idx] for idx in ranked[:max_keywords]]

# Destination of the enriched CSV (written to the current working directory).
output_path = "output_with_keywords_spacy.csv"

# Build one keyword list per TF-IDF row; rows follow the order of `data`,
# so the resulting list can be assigned directly as a column.
data['keywords'] = [
    extract_keywords_per_document(doc_idx, tfidf_matrix, tfidf_keywords, max_keywords=10)
    for doc_idx in tqdm(range(tfidf_matrix.shape[0]), desc="Extracting keywords", unit="doc")
]

# Persist without the index column, then report where the file went.
data.to_csv(output_path, index=False)
print(f"Palabras clave extraídas y guardadas en: {output_path}")


100%|██████████| 50000/50000 [19:48<00:00, 42.06it/s] 
Extracting keywords: 100%|██████████| 50000/50000 [00:44<00:00, 1131.56doc/s]


Palabras clave extraídas y guardadas en: output_with_keywords_spacy.csv


In [13]:
# Re-read the exported CSV to inspect what was written. As the displayed
# output shows, the list-valued `keywords` column comes back as the string
# representation of each list rather than as a list.
pd.read_csv(output_path)

Unnamed: 0,title,abstract,categories,id,summary,processed_abstract,keywords
0,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,hep-ph,704.0001,A fully differential calculation in perturbati...,differential calculation perturbative quantum ...,"['diphoton', 'gluon', 'next', 'pairs', 'hadron..."
1,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",math.CO cs.CG,704.0002,"We describe a new algorithm, the $(k,\ell)$-pe...",new algorithm game colors characterization fam...,"['gabow', 'game', 'graphs', 'sparse', 'colors'..."
2,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,physics.gen-ph,704.0003,The evolution of Earth-Moon system is describe...,evolution Earth Moon system dark matter field ...,"['moon', 'earth', 'system', 'evolution', 'flui..."
3,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,math.CO,704.0004,A determinant of Stirling cycle numbers counts...,determinant Stirling cycle numbers acyclic sin...,"['automata', 'determinant', 'stirling', 'invol..."
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,math.CA math.FA,704.0005,In this paper we show how to compute the $Lamb...,paper norm dyadic grid result consequence desc...,"['dyadic', 'hardy', 'norm', 'grid', 'consequen..."
...,...,...,...,...,...,...,...
49995,Casimir-Polder force between an atom and a die...,The low-temperature behavior of the Casimir-...,quant-ph,802.2698,The low-temperature behavior of the Casimir-Po...,low temperature behavior Casimir Polder free e...,"['dielectrics', 'lifshitz', 'plate', 'screenin..."
49996,Repelling Random Walkers in a Diffusion-Coales...,We have shown that the steady state probabil...,cond-mat.stat-mech,802.2699,Double shock structures perform biased random ...,steady state probability distribution function...,"['boundaries', 'shock', 'system', 'lattice', '..."
49997,The cobordism class of the moduli space of pol...,"For any vector $r=(r_1,..., r_n)$, let $M_r$...",math.SG,802.2700,"For any vector $r=(r_1,..., r_n)$, let $M_r$ d...",vector moduli space rigid motions polygons len...,"['vector', 'polygons', 'rigid', 'lengths', 'mo..."
49998,Authentication over Noisy Channels,"In this work, message authentication over no...",cs.IT cs.CR math.IT,802.2701,Message authentication over noisy channels is ...,work message authentication noisy channels mod...,"['authentication', 'opponent', 'attacks', 'mes..."


In [19]:
# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it comes from a trusted source.
with open('../bbdd_rag/arxiv_data.pkl', 'rb') as f:
    read_df = pickle.load(f)
# Assigning a Series aligns on index labels, not on position; rows of
# `read_df` whose label is missing from `data` would receive NaN.
read_df['keywords'] = data['keywords']
read_df

Unnamed: 0,title,abstract,categories,id,topic_distribution,main_topics,summary,processed_abstract,keywords
0,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,hep-ph,704.0001,"[('6', 0.03385964), ('20', 0.03287259), ('21',...","Stellar Disks and Formation, Scaling and Criti...",A fully differential calculation in perturbati...,differential calculation perturbative quantum ...,"[diphoton, gluon, next, pairs, hadron, lhc, pe..."
1,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",math.CO cs.CG,704.0002,"[('6', 0.015022807), ('8', 0.29834053), ('21',...","Graphs and Computational Complexity, Collision...","We describe a new algorithm, the $(k,\ell)$-pe...",new algorithm game colors use characterization...,"[gabow, game, graphs, sparse, colors, certifie..."
2,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,physics.gen-ph,704.0003,"[('1', 0.01177972), ('4', 0.16045782), ('10', ...","Composite Spheres and Microstructures, Dark Ma...",The evolution of Earth-Moon system is describe...,evolution Earth Moon system dark matter field ...,"[moon, earth, system, evolution, fluid, model,..."
3,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,math.CO,704.0004,"[('1', 0.031633098), ('8', 0.051943827), ('11'...","Error Codes and Signal Modulations, Cross Sect...",A determinant of Stirling cycle numbers counts...,determinant Stirling cycle numbers unlabeled a...,"[automata, determinant, stirling, involution, ..."
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,math.CA math.FA,704.0005,"[('22', 0.14159086), ('28', 0.06358145), ('45'...","Functional Cycles and Interferometers, Collisi...",In this paper we show how to compute the $Lamb...,paper norm dyadic grid result consequence desc...,"[dyadic, hardy, norm, grid, consequence, atoms..."
...,...,...,...,...,...,...,...,...,...
49995,Casimir-Polder force between an atom and a die...,The low-temperature behavior of the Casimir-...,quant-ph,802.2698,"[('1', 0.031941213), ('5', 0.01613487), ('9', ...","Electronic Transport and Band Structures, Data...",The low-temperature behavior of the Casimir-Po...,low temperature behavior Casimir Polder free e...,"[dielectrics, lifshitz, plate, screening, dc, ..."
49996,Repelling Random Walkers in a Diffusion-Coales...,We have shown that the steady state probabil...,cond-mat.stat-mech,802.2699,"[('0', 0.042325787), ('5', 0.050769534), ('7',...","Random Processes and Distributions, Quasi-Peri...",Double shock structures perform biased random ...,steady state probability distribution function...,"[boundaries, shock, system, lattice, coalescen..."
49997,The cobordism class of the moduli space of pol...,"For any vector $r=(r_1,..., r_n)$, let $M_r$...",math.SG,802.2700,"[('1', 0.039701346), ('5', 0.06615013), ('10',...","Hydrogen Reflection and Interference, Informat...","For any vector $r=(r_1,..., r_n)$, let $M_r$ d...",vector moduli space rigid motions polygons len...,"[vector, polygons, rigid, lengths, motions, mo..."
49998,Authentication over Noisy Channels,"In this work, message authentication over no...",cs.IT cs.CR math.IT,802.2701,"[('9', 0.057013277), ('16', 0.016576579), ('20...","Operators and Polynomial Transformations, Rand...",Message authentication over noisy channels is ...,work message authentication noisy channels mod...,"[authentication, opponent, attacks, message, n..."


In [21]:
# Write the updated frame back to the same pickle file it was loaded from.
with open('../bbdd_rag/arxiv_data.pkl', 'wb') as pkl_out:
    pickle.dump(read_df, pkl_out)
