In [2]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
!python -m spacy download es_core_news_md

Collecting es-core-news-md==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.7.0/es_core_news_md-3.7.0-py3-none-any.whl (42.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: es-core-news-md
Successfully installed es-core-news-md-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# 1. Preprocesar Documentos

In [3]:
import pandas as pd
import spacy
import json

# Cargar el modelo de spaCy (usa el modelo adecuado para el idioma de los textos (español == es_core_news_md)
nlp = spacy.load('en_core_web_md')  #                                            (inglés == en_core_web_md)

# Configuración de stopwords y POS válidos
valid_POS = {'VERB', 'NOUN', 'ADJ', 'PROPN'}
stopwords = nlp.Defaults.stop_words

# Función de preprocesamiento
def preprocess_text(text):
    """
    Preprocesa un texto para prepararlo para técnicas como BoW o TF-IDF.
    - Filtra stopwords
    - Lematiza palabras
    - Retiene solo palabras alfabéticas y ciertas POS

    Args:
        text (str): Texto a procesar.

    Returns:
        str: Texto procesado.
    """
    try:
        # Procesar con spaCy (Tokenizar)
        doc = nlp(text)

        # Filtrar y lematizar
        lemmatized = [
            token.lemma_.lower() for token in doc
            if token.is_alpha and token.pos_ in valid_POS # Filtrar POS
            and token.text.lower() not in stopwords # Elimina StopWords
        ]
        return " ".join(lemmatized)
    except Exception as e:
        print(f"Error procesando texto: {e}")
        return ""

# Función para aplicar a todas las filas de un DataFrame
def preprocess_dataframe(df, text_columns):
    """
    Aplica el preprocesamiento a las columnas de texto seleccionadas en un DataFrame.

    Args:
        df (pd.DataFrame): DataFrame con los datos.
        text_columns (list): Lista de nombres de columnas a combinar y procesar.

    Returns:
        pd.DataFrame: DataFrame con una nueva columna 'processed_text'.
    """
    try:
        # Combinar columnas de texto en una sola
        df['combined_text'] = df[text_columns].fillna('').apply(lambda row: ' '.join(row), axis=1)

        # Aplicar el preprocesamiento
        df['processed_text'] = df['combined_text'].apply(preprocess_text)
        return df
    except Exception as e:
        print(f"Error procesando DataFrame: {e}")
        return df

def save_processed_text_to_file(df, column_name, file_name):
    """
    Guarda los textos preprocesados de un DataFrame en un archivo .txt.

    Args:
        df (pd.DataFrame): DataFrame que contiene la columna con textos preprocesados.
        column_name (str): Nombre de la columna que contiene los textos preprocesados.
        file_name (str): Nombre del archivo donde guardar los textos.
    """
    try:
        # Abrir el archivo en modo escritura
        with open(file_name, 'w', encoding='utf-8') as f:
            # Escribir cada texto en una línea separada
            for text in df[column_name]:
                f.write(text + '\n')
        print(f"Textos guardados en el archivo: {file_name}")
    except Exception as e:
        print(f"Error al guardar en archivo: {e}")



## 1.1 Cargar y Preprocesar todos los datos del dataset (Cambiar rutas y archivos para los datos)

In [4]:
from google.colab import drive
import pandas as pd

# Montar Google Drive
drive.mount('/content/drive')

# Ruta al archivo en tu Google Drive
file_path = '/content/drive/MyDrive/MÁSTER/NLP/arxiv_papers.csv'

# Cargar el dataset
df = pd.read_csv(file_path)

# Ver las primeras filas del dataset
print("Primeras filas del dataset:")
print(df.head())
text_columns = ['title', 'abstract']

# Aplicar preprocesamiento a estas columnas
preprocessed_df = preprocess_dataframe(df, text_columns=text_columns)
save_processed_text_to_file(preprocessed_df, column_name='processed_text', file_name='processed_texts.txt')

# Mostrar las primeras filas después del preprocesamiento
print("Primeras filas después del preprocesamiento:")
print(preprocessed_df[['title', 'processed_text']])

Mounted at /content/drive
Primeras filas del dataset:
                                               title  \
0    MetaFormer is Actually What You Need for Vision   
1     Turbo Autoencoder with a Trainable Interleaver   
2  Ab-initio calculation of point defect equilibr...   
3  Divergent electrostriction at ferroelectric ph...   
4  ProxyFL: Decentralized Federated Learning thro...   

                                            abstract  \
0  Transformers have shown great potential in com...   
1  A critical aspect of reliable communication in...   
2  Point defects are responsible for a wide range...   
3  We investigate the electrostrictive response a...   
4  Institutions in highly regulated domains such ...   

                   published  \
0  2021-11-22T18:52:03+00:00   
1  2021-11-22T18:37:03+00:00   
2  2021-11-22T17:11:17+00:00   
3  2021-11-22T17:00:32+00:00   
4  2021-11-22T16:47:39+00:00   

                                             authors  \
0  ['Weihao Yu', 'Mi Lu

## 1.2. Cargando Nuestros Datos

In [None]:

from google.colab import drive
drive.mount('/content/drive')

# Ruta al archivo JSON en tu Drive
file_path = '/content/drive/MyDrive/MÁSTER/NLP/arxiv-metadata-oai-snapshot.json'

# Cargar datos desde el archivo JSON
data = []
with open(file_path, 'r') as file:
    for line in file:
        data.append(json.loads(line))

# Crear un DataFrame con las columnas relevantes
df = pd.DataFrame(data)

# Seleccionar las columnas que nos interesan para análisis
# En este caso, usaremos 'authors', 'title', y 'abstract'
df = df[['authors', 'title', 'abstract']]

# Renombrar las columnas para consistencia con el preprocesamiento
df.rename(columns={'authors': 'author'}, inplace=True)

# Verificar el DataFrame cargado
print(df.head())

# Aplicar la función de preprocesamiento
preprocessed_df = preprocess_dataframe(df, text_columns=['title', 'abstract'])

# Mostrar el resultado
print(preprocessed_df[['author', 'processed_text']])
save_processed_text_to_file(preprocessed_df, column_name='processed_text', file_name='processed_texts.txt')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


JSONDecodeError: Extra data: line 2 column 1 (char 1689)

# 2. Crear un Corpus

In [5]:
mycorpus = open('processed_texts.txt').readlines()
mycorpus = [el.strip().split() for el in mycorpus]

print(('Number of documents in corpus: '+str(len(mycorpus))))
print(('============= First document in corpus ============='))
print(mycorpus[0])
print(('============= Corresponding Python string ============='))
print(' '.join(mycorpus[0]))

Number of documents in corpus: 8000
['metaformer', 'need', 'vision', 'transformers', 'show', 'great', 'potential', 'computer', 'vision', 'task', 'common', 'belief', 'attention', 'base', 'token', 'mixer', 'module', 'contribute', 'competence', 'recent', 'work', 'attention', 'base', 'module', 'transformer', 'replace', 'spatial', 'mlp', 'resulted', 'model', 'perform', 'base', 'observation', 'hypothesize', 'general', 'architecture', 'transformer', 'specific', 'token', 'mixer', 'module', 'essential', 'model', 'performance', 'verify', 'replace', 'attention', 'module', 'transformer', 'simple', 'spatial', 'pooling', 'operator', 'conduct', 'basic', 'token', 'mixing', 'observe', 'derive', 'model', 'term', 'poolformer', 'achieve', 'competitive', 'performance', 'multiple', 'computer', 'vision', 'task', 'example', 'k', 'poolformer', 'achieve', 'accuracy', 'surpass', 'tune', 'vision', 'transformer', 'mlp', 'like', 'baseline', 'deit', 'b', 'resmlp', 'accuracy', 'few', 'parameter', 'few', 'mac', 'effec

# 3. Detectar N-Grams (Opcional, si queremos hacerlo debemos ajustar los thresholds para detectar buenos N-Grams)

In [6]:
from gensim.models.phrases import Phrases

phrase_model = Phrases(mycorpus, min_count=2, threshold=20)
mycorpus = [el for el in phrase_model[mycorpus]] #We populate mycorpus again
print(('============= First document after N-gram replacement ============='))
print(mycorpus[0])

['metaformer', 'need', 'vision_transformers', 'show_great', 'potential', 'computer_vision', 'task', 'common', 'belief', 'attention', 'base', 'token_mixer', 'module', 'contribute', 'competence', 'recent', 'work', 'attention', 'base', 'module', 'transformer', 'replace', 'spatial', 'mlp', 'resulted', 'model', 'perform', 'base', 'observation', 'hypothesize', 'general', 'architecture', 'transformer', 'specific', 'token_mixer', 'module', 'essential', 'model', 'performance', 'verify', 'replace', 'attention_module', 'transformer', 'simple', 'spatial', 'pooling', 'operator', 'conduct', 'basic', 'token_mixing', 'observe', 'derive', 'model', 'term', 'poolformer', 'achieve_competitive', 'performance', 'multiple', 'computer_vision', 'task', 'example', 'k', 'poolformer', 'achieve', 'accuracy', 'surpass', 'tune', 'vision_transformer', 'mlp', 'like', 'baseline', 'deit', 'b', 'resmlp', 'accuracy', 'few_parameter', 'few', 'mac', 'effectiveness', 'poolformer', 'verify', 'hypothesis', 'urge', 'initiate', 

# 4. Creamos un Diccionario

In [7]:
from gensim.corpora import Dictionary

no_below = 4 #Minimum number of documents to keep a term in the dictionary
no_above = .80 #Maximum proportion of documents in which a term can appear to be kept in the dictionary

# Create dictionary of tokens
D = Dictionary(mycorpus)
D.filter_extremes(no_below=no_below,no_above=no_above)

n_tokens = len(D)

print('The dictionary contains', n_tokens, 'terms')
print('First terms in the dictionary:')
for n in range(10):
    print(str(n), ':', D[n])

The dictionary contains 11654 terms
First terms in the dictionary:
0 : abstract
1 : accuracy
2 : achieve
3 : achieve_competitive
4 : achieve_superior
5 : architecture
6 : argue
7 : attention
8 : attention_module
9 : b


# 5. Hacemos BoW

In [8]:
mycorpus_bow = [D.doc2bow(doc) for doc in mycorpus]

n_project = 1000
print(('============= Project abstract (lemmas) ============='))
print(' '.join(mycorpus[n_project]))

print(('============= Sparse vector representation ============='))
print(mycorpus_bow[n_project])

print(('============= Word counts for the project ============='))
print(list(map(lambda x: (D[x[0]], x[1]), mycorpus_bow[n_project])))

real_time simulation level level level electric_vehicle charging systems charge system require convert ac electricity grid dc electricity charge electric_vehicle ev_battery accord society automatic engineers sae standard ev charger divide level base power rating level level level paper investigate circuit topology control principle ev_charge system level high_fidelity testbed ev_charge system kwh battery design implement real_time digital simulator rt lab testbed include model detail switch semiconductor minute real_time simulation conduct testbed detailed dynamic performance circuit control stage present demonstrate charge process level ev_charge system employ high frequency transformer embed dual active bridge dab dc_dc converter regulate battery dc_voltage current average model base linear system analysis give configure parameter phase_shift control adopt dab dc_dc converter addition power factor control pfc employ level level single phase ac charge system phase voltage source conve

# 6. Crear LDA

In [9]:
from gensim.models.ldamodel import LdaModel
num_topics = 20

ldag = LdaModel(corpus=mycorpus_bow, id2word=D, num_topics=num_topics)



# 7. Encontrar los documentos más relevantes de un tópic

In [10]:
def most_relevant_documents(ldag, topicid, corpus_bow, ndocs=10):
    """This function returns the most relevant documents in corpus_bow

    : ldag: The trained topic model object provided by gensim
    : topicid: The topic for which we want to find the most relevant documents
    : corpus_bow: The BoW representation of documents in Gensim format
    : ndocs: Number of most relevant documents to return

    : Returns: A list with the identifiers of the most relevant documents
    """
    print('Computing most relevant documents for Topic', topicid)
    print('Topic composition is:')
    print(ldag.show_topic(topicid))

    # Compute relevance of each document for the given topic
    doc_topic_probs = [
        (doc_id, ldag.get_document_topics(bow, minimum_probability=0)[topicid][1])
        for doc_id, bow in enumerate(corpus_bow)
    ]

    # Sort documents by their probability for the topic in descending order
    sorted_docs = sorted(doc_topic_probs, key=lambda x: x[1], reverse=True)

    # Extract the identifiers of the most relevant documents
    most_relevant_doc_ids = [doc_id for doc_id, _ in sorted_docs[:ndocs]]

    return most_relevant_doc_ids

# To test the function we will find the most relevant projects for one of the topics
project_id = most_relevant_documents(ldag, 8, mycorpus_bow, ndocs=3)

# Print titles of selected projects
for idproject in project_id:
    print('\n', ' '.join(mycorpus[idproject]))

Computing most relevant documents for Topic 8
Topic composition is:
[('algorithm', 0.031518213), ('problem', 0.0163214), ('propose', 0.014928381), ('base', 0.0105038015), ('method', 0.01007682), ('learning', 0.008250329), ('network', 0.007867106), ('result', 0.0076906467), ('approach', 0.0063222665), ('paper', 0.0062838783)]

 distribute plug n play algorithm multi robot application priori non computable objective_function paper_present distribute algorithm applicable_wide range practical multi robot application multi robot application user define objective mission cast general optimization_problem explicit guideline subtask different robot owe unknown environment unknown robot dynamic sensor nonlinearitie analytic form optimization cost function available standard gradient_descent like algorithm applicable problem tackle introduce new algorithm design robot subcost function optimization accomplish overall team objective transformation propose distribute methodology base base adaptive 