In [10]:
from transformers import pipeline, BertTokenizer, BertModel
import pandas as pd
import torch
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [None]:
# Register tqdm's `progress_apply` on pandas objects so long per-row passes
# can report progress.
tqdm.pandas()

# Input dataset; the rest of the notebook reads its 'abstract' column.
file_path = "output.csv"  
data = pd.read_csv(file_path)

# Only part-of-speech tags are consumed from this pipeline, so the named-entity
# recognizer and the lemmatizer are switched off to avoid paying for them on
# every document. The parser is intentionally left enabled: per the spaCy model
# docs the English tag->POS mapping may make use of dependency parses, so
# disabling it could change which tokens are kept.
nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])

def preprocess_text_spacy(text):
    """
    Reduce a text to its relevant terms using spaCy part-of-speech tags.

    Keeps only purely alphabetic tokens tagged as noun, proper noun or
    adjective, preserving their original order and casing.

    Args:
        text: Raw text to process. Any non-string value (for example the
            float NaN pandas uses for an empty CSV cell) is treated as
            empty text instead of being passed to spaCy.

    Returns:
        The kept tokens joined by single spaces; "" when nothing qualifies
        or when `text` is not a string.
    """
    # spaCy only accepts str input; guard so one missing value does not abort
    # a whole `apply` pass over the column.
    if not isinstance(text, str):
        return ""

    doc = nlp(text)
    kept_pos = ("NOUN", "PROPN", "ADJ")
    tokens = [token.text for token in doc if token.is_alpha and token.pos_ in kept_pos]
    return " ".join(tokens)

# Run the spaCy filter over every abstract (with a progress bar) and keep the
# result next to the original text.
raw_abstracts = data['abstract']
data['processed_abstract'] = raw_abstracts.progress_apply(preprocess_text_spacy)

# Fit TF-IDF on the filtered abstracts: one row per document, one column per
# vocabulary term.
tfidf = TfidfVectorizer()
processed_abstracts = data['processed_abstract']
tfidf_matrix = tfidf.fit_transform(processed_abstracts)

# Vocabulary terms, positioned to match the columns of `tfidf_matrix`.
tfidf_keywords = tfidf.get_feature_names_out()

def extract_keywords_per_document(row_idx, tfidf_matrix, feature_names, max_keywords=10):
    """
    Return the highest-scoring terms of one document according to TF-IDF.

    Only terms with a non-zero score in the selected row are considered, so a
    document with fewer than `max_keywords` distinct terms yields a shorter
    list (possibly empty) rather than being padded with vocabulary entries
    that never occur in it.

    Args:
        row_idx: Row of `tfidf_matrix` to inspect.
        tfidf_matrix: Matrix whose row selection supports `.toarray()`
            (e.g. a SciPy sparse matrix); rows are documents, columns are terms.
        feature_names: Sequence mapping a column index to its term.
        max_keywords: Maximum number of terms to return.

    Returns:
        List of terms ordered by descending score; ties keep ascending
        column order.
    """
    row_vector = tfidf_matrix[row_idx].toarray().flatten()

    # Restrict ranking to terms actually present in this document.
    present = row_vector.nonzero()[0]

    # Stable sort on the negated scores gives a deterministic descending order.
    order = (-row_vector[present]).argsort(kind="stable")
    ranked = present[order]

    return [feature_names[idx] for idx in ranked[:max_keywords]]

# Collect the top TF-IDF terms for each document, one matrix row at a time.
doc_count = tfidf_matrix.shape[0]
keywords_per_doc = []
for doc_idx in tqdm(range(doc_count), desc="Extracting keywords", unit="doc"):
    doc_keywords = extract_keywords_per_document(
        doc_idx, tfidf_matrix, tfidf_keywords, max_keywords=10
    )
    keywords_per_doc.append(doc_keywords)
data['keywords'] = keywords_per_doc

# Persist the enriched table without the pandas index column.
# NOTE(review): each 'keywords' cell is a Python list, so the CSV will hold its
# string representation; readers must parse it back if they need a list.
output_path = "output_with_keywords_spacy.csv"
data.to_csv(output_path, index=False)

print(f"Palabras clave extraídas y guardadas en: {output_path}")


100%|██████████| 50000/50000 [16:46<00:00, 49.68it/s] 
Extracting keywords: 100%|██████████| 50000/50000 [00:35<00:00, 1418.86doc/s]


Palabras clave extraídas y guardadas en: output_with_keywords_spacy.csv


: 