# Scopus

## MAA

_Jesús Morales_

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from pybliometrics.scopus import AbstractRetrieval
from pybliometrics.scopus import ScopusSearch
import spacy
import pandas as pd
import glob
import numpy as np


### Modelos

In [11]:
# Loaded spaCy pipelines keyed by model name, so the model is read from disk
# once per process instead of once per document.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name='en_core_web_sm'):
    """Return the spaCy pipeline *model_name*, loading it on first use only."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def preprocess_text(text):
    """Lemmatize *text*, dropping punctuation and stop-word tokens.

    Args:
        text: Raw document string. ``None`` or an empty string is accepted.

    Returns:
        The lemmas of the remaining tokens joined by single spaces, or an
        empty string when *text* is empty or ``None``.
    """
    if not text:
        # Nothing to analyse; also avoids loading the model needlessly.
        return ''
    doc = _get_spacy_pipeline()(text)
    return ' '.join(
        token.lemma_
        for token in doc
        if not token.is_punct and not token.is_stop
    )


def create_query(texts, n_terms=9):
    """Build a Scopus abstract-search query from a collection of texts.

    The texts are run through ``preprocess_text``, vectorized with TF-IDF and
    clustered with k-means. The number of clusters is the one that gives the
    highest silhouette score. Each cluster contributes one group made of its
    highest-weighted centroid terms joined with ``AND``; the groups are
    joined with ``OR`` and wrapped in a single ``ABS(...)`` so that every
    group is restricted to the abstract field.

    Args:
        texts: Iterable of raw document strings.
        n_terms: Maximum number of terms taken from each cluster centroid.
            Defaults to 9, the number the previous hard-coded slice selected.

    Returns:
        The query string, e.g. ``ABS(("a" AND "b") OR ("c" AND "d"))``.

    Raises:
        ValueError: If ``texts`` is empty or ``n_terms`` is less than 1.
    """
    if n_terms < 1:
        raise ValueError('n_terms must be at least 1')

    # Step 1: text preprocessing.
    preprocessed_texts = [preprocess_text(text) for text in texts]
    if not preprocessed_texts:
        raise ValueError('texts must contain at least one document')

    # Step 2: build the feature matrix.
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(preprocessed_texts)
    n_samples = X.shape[0]

    # Step 3: pick the cluster count with the best silhouette score.
    # silhouette_score needs between 2 and n_samples - 1 distinct labels,
    # which is why the candidates stop before n_samples.
    best_score = None
    best_model = None
    for n_clusters in range(2, n_samples):
        candidate = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
        labels = candidate.labels_
        if len(set(labels)) < 2:
            # Duplicate documents can collapse into one cluster; the
            # silhouette score is undefined in that case.
            continue
        score = silhouette_score(X, labels)
        if best_score is None or score > best_score:
            best_score = score
            best_model = candidate

    if best_model is None:
        # Too few documents to compare clusterings: use a single topic.
        best_model = KMeans(n_clusters=1, random_state=0).fit(X)

    # Steps 4 and 5: turn each centroid into an AND-group of its top terms.
    feature_names = vectorizer.get_feature_names_out()
    topics = []
    for centroid in best_model.cluster_centers_:
        ranked = centroid.argsort()[::-1][:n_terms]
        # A zero weight means no document of the cluster contains the term;
        # AND-ing it in would only make the group unsatisfiable.
        terms = [feature_names[j] for j in ranked if centroid[j] > 0]
        if not terms:
            continue
        quoted = ' AND '.join('"{}"'.format(term) for term in terms)
        topics.append('({})'.format(quoted))

    return 'ABS({})'.format(' OR '.join(topics))

def get_abstract(doi):
    """Return the ``description`` of the Scopus record identified by *doi*.

    Any exception raised during the lookup is caught and its message is
    returned as a string instead.
    """
    # NOTE(review): on failure the caller receives the error text in place of
    # an abstract and cannot tell the two apart - confirm this is intended.
    try:
        return AbstractRetrieval(doi).description
    except Exception as error:
        return str(error)

def find_similar_documents(consultados, query, titles, n):
    """Rank Scopus search results by similarity to the consulted texts.

    Runs *query* through ``ScopusSearch``, keeps the results that have a
    description, and scores them against the consulted texts with TF-IDF
    cosine similarity (the vocabulary is fitted on the consulted texts).

    Args:
        consultados: Raw texts of the documents already consulted.
        query: Scopus search query string.
        titles: Keys for the returned mapping, paired positionally with the
            recommended titles.
        n: Maximum number of recommendations to take.

    Returns:
        A dict pairing each entry of *titles* with one recommended document
        title, in decreasing order of similarity. Empty when there are no
        consulted texts or no search results with a description.
    """
    consultados = [preprocess_text(text) for text in consultados]
    if not consultados:
        # Nothing to compare against; skip the remote search entirely.
        return {}

    search = ScopusSearch(query)
    # `results` may be None when the search matches nothing. Keep the record
    # objects themselves: their titles are needed after ranking.
    records = [
        registro
        for registro in (search.results or [])
        if registro.description is not None
    ]
    if not records:
        return {}

    descriptions_recommend = [
        preprocess_text(registro.description) for registro in records
    ]

    vectorizer = TfidfVectorizer(stop_words='english')
    df_vectors = vectorizer.fit_transform(consultados)
    abstract_vectors = vectorizer.transform(descriptions_recommend)

    similarity_scores = cosine_similarity(df_vectors, abstract_vectors)

    # NOTE(review): only the ranking for the first consulted text is used and
    # it is zipped positionally with `titles`, as in the previous version -
    # confirm whether a per-title list of recommendations was intended.
    top_indices = similarity_scores[0].argsort()[::-1][:n]
    recommended_results = [records[i].title for i in top_indices]

    return dict(zip(titles, recommended_results))