# Scopus

## MAA

_Jesús Morales_

In [1]:
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from pybliometrics.scopus import AbstractRetrieval
from pybliometrics.scopus import ScopusSearch
import spacy
import pandas as pd
import glob
import numpy as np


### Modelos

In [11]:
_SPACY_MODEL = 'en_core_web_sm'
_NLP_CACHE = {}


def _get_nlp():
    """Return the shared spaCy pipeline, loading it only on first use."""
    if _SPACY_MODEL not in _NLP_CACHE:
        _NLP_CACHE[_SPACY_MODEL] = spacy.load(_SPACY_MODEL)
    return _NLP_CACHE[_SPACY_MODEL]


def preprocess_text(text):
    """Lemmatize *text*, dropping punctuation and stop-word tokens.

    The spaCy model is loaded once and reused across calls; loading it on
    every call (as was done before) repeats the same expensive work for each
    abstract that is processed.

    Args:
        text: Raw text to normalize.

    Returns:
        The lemmas of the remaining tokens joined by single spaces. An empty
        string is returned unchanged without touching the model.
    """
    if text == '':
        return ''
    doc = _get_nlp()(text)
    return ' '.join(
        token.lemma_
        for token in doc
        if not token.is_punct and not token.is_stop
    )


def create_query(texts, n_terms=9):
    """Build a Scopus ``ABS(...)`` search string from a set of texts.

    The texts are preprocessed, vectorized with TF-IDF and clustered with
    K-means. The number of clusters is the one with the highest silhouette
    score among 2 .. n_texts - 1. Each cluster contributes its ``n_terms``
    highest-weighted centroid terms joined with AND, and the clusters are
    joined with OR.

    Args:
        texts: Iterable of raw text strings (abstracts).
        n_terms: Number of terms taken from each cluster centroid. The
            default of 9 is what the previously hard-coded slice
            ``[:-10:-1]`` selected.

    Returns:
        The search string, or ``''`` when ``texts`` is empty.

    Raises:
        ValueError: If ``n_terms`` is smaller than 1.
    """
    if n_terms < 1:
        raise ValueError('n_terms must be at least 1')

    texts = list(texts)
    if not texts:
        return ''

    # Step 1: text preprocessing
    preprocessed_texts = [preprocess_text(text) for text in texts]

    # Step 2: build the feature matrix
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(preprocessed_texts)
    n_samples = X.shape[0]

    # Step 3: pick the number of clusters with the best silhouette score.
    # The fitted model is kept so it does not have to be refitted afterwards
    # (random_state is fixed, so a refit would give the same model).
    best_model = None
    best_score = -1
    for n_clusters in range(2, n_samples):
        candidate = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
        # silhouette_score needs at least two distinct labels; duplicated
        # texts can make K-means collapse clusters.
        if len(set(candidate.labels_)) < 2:
            continue
        score = silhouette_score(X, candidate.labels_)
        if score > best_score:
            best_score = score
            best_model = candidate

    # With fewer than 3 texts (or no scorable candidate) there is nothing to
    # compare, so treat everything as a single topic instead of failing.
    if best_model is None:
        best_model = KMeans(n_clusters=1, random_state=0).fit(X)

    # Step 4/5: turn each centroid's strongest terms into one AND group
    feature_names = vectorizer.get_feature_names_out()
    topics = []
    for centroid in best_model.cluster_centers_:
        top_positions = centroid.argsort()[::-1][:n_terms]
        quoted_terms = ['"' + feature_names[pos] + '"' for pos in top_positions]
        topics.append('({})'.format(' AND '.join(quoted_terms)))

    # Wrap the whole OR expression so the ABS field code applies to every
    # cluster group rather than only to the first parenthesised one.
    return 'ABS({})'.format(' OR '.join(topics))

def get_abstract(doi):
    """Fetch the abstract text for *doi* through ``AbstractRetrieval``.

    Returns the ``description`` attribute of the retrieved record. Any
    exception raised along the way is not propagated: its message is returned
    as a string instead. Code later in this file recognizes failed lookups by
    comparing the returned value with
    'The resource specified cannot be found.'.
    """
    try:
        return AbstractRetrieval(doi).description
    except Exception as error:
        return str(error)

def find_similar_documents(consultados, query, titles, n):
    """Recommend Scopus search hits similar to the consulted abstracts.

    Runs ``query`` through ``ScopusSearch``, keeps the hits that have an
    abstract, and ranks them by TF-IDF cosine similarity.

    Args:
        consultados: Abstract texts of the documents that were consulted.
        query: Scopus search string.
        titles: Titles used, in order, as keys of the returned mapping.
        n: Maximum number of recommendations.

    Returns:
        A dict pairing each entry of ``titles`` (in order) with the title of
        a recommended hit, best match first; the dict is as long as the
        shorter of the two sequences. ``{}`` is returned when ``query`` or
        ``consultados`` is empty or the search has no hit with an abstract.
    """
    consultados = list(consultados)
    if not query or not consultados:
        return {}

    # `results` can be None when nothing matches (see pybliometrics docs).
    # Keep the result records themselves: the title of each hit is needed
    # later, so they must not be replaced by their description strings.
    hits = [
        hit for hit in (ScopusSearch(query).results or [])
        if hit.description is not None
    ]
    if not hits:
        return {}

    consulted_texts = [preprocess_text(text) for text in consultados]
    hit_texts = [preprocess_text(hit.description) for hit in hits]

    # The vocabulary comes from the consulted abstracts only.
    vectorizer = TfidfVectorizer(stop_words='english')
    consulted_vectors = vectorizer.fit_transform(consulted_texts)
    hit_vectors = vectorizer.transform(hit_texts)

    similarity_scores = cosine_similarity(consulted_vectors, hit_vectors)

    # NOTE(review): as before, only the first consulted abstract drives the
    # ranking, and its top hits are paired positionally with `titles`.
    # Confirm whether a per-document recommendation list was intended.
    best_first = similarity_scores[0].argsort()[::-1][:n]
    recommended_titles = [hits[position].title for position in best_first]

    return dict(zip(titles, recommended_titles))

### Datos

In [3]:
# Locate every CSV file inside the Ezpaarse/ folder.
csv_files = glob.glob('Ezpaarse/*.csv')

# Read each file into its own DataFrame.
list_data = []
for filename in csv_files:
    data = pd.read_csv(filename)
    list_data.append(data)

# Stack all files into a single frame with a fresh index.
ezp_data = pd.concat(list_data, ignore_index=True)

# Variable selection
selected_columns = [
    'date', 'login', 'platform', 'platform_name',
    'publisher_name', 'rtype', 'mime', 'doi', 'publication_title',
    'publication_date', 'title', 'type', 'subject', 'identd',
]
ezp_data = ezp_data.loc[:, selected_columns]

# Lower-case the login and DOI columns.
for column in ('login', 'doi'):
    ezp_data[column] = ezp_data[column].str.lower()

# `df` starts as another name for `ezp_data`, so the date conversion is
# applied to both; the filter below then rebinds `df` to a new frame holding
# only rows dated before 2023-01-01 that have a DOI.
df = ezp_data
df['date'] = pd.to_datetime(df['date'])
df = df[(df['date'] < '2023-01-01') & df['doi'].notnull()]

### Set de datos de test

In [5]:
# Collapse the rows into one per (date, login, identd, platform) group,
# collecting the distinct DOIs and titles seen in each group.
group_keys = ['date', 'login', 'identd', 'platform']
df_grouped = (
    df.groupby(group_keys)
    .agg({'doi': 'unique', 'title': 'unique'})
    .reset_index()
)

# Draw 100 random groups (fixed seed) and drop any without a DOI value.
df_sample = (
    df_grouped
    .sample(n=100, random_state=452)
    .dropna(subset=['doi'])
)

In [6]:
# Text get_abstract returns (as the stringified exception) when Scopus does
# not know a DOI.
NOT_FOUND_MESSAGE = 'The resource specified cannot be found.'


def _is_not_found(response):
    """Return True when *response* carries the not-found message."""
    return response is not None and NOT_FOUND_MESSAGE in response


def _is_retrieved(response):
    """Return True when *response* is present and is not a not-found message."""
    return response is not None and NOT_FOUND_MESSAGE not in response


def _build_row_query(row):
    """Build the search query for one sampled row, or '' when not applicable.

    A query is built only when more than two DOIs were retrieved and none of
    the responses is a not-found message. The same substring test is used
    for counting, for filtering and for this condition, so a response that
    merely contains the message can no longer be counted as missing and
    still be passed to create_query as if it were an abstract.
    """
    responses = row['response_scopus']
    if row['cantidad_doi_scopus'] <= 2 or any(_is_not_found(r) for r in responses):
        return ''
    return create_query([r for r in responses if _is_retrieved(r)])


# Apply the lookup to every row and store the results in new columns.
df_sample['response_scopus'] = df_sample['doi'].apply(
    lambda dois: [get_abstract(doi) for doi in dois]
)
df_sample['cantidad_doi'] = df_sample['doi'].apply(len)
df_sample['cantidad_doi_scopus'] = df_sample['response_scopus'].apply(
    lambda responses: sum(1 for r in responses if _is_retrieved(r))
)
df_sample['porcentaje_en_scopus'] = round(
    df_sample['cantidad_doi_scopus'] / df_sample['cantidad_doi'], 2
)

df_sample['query_result'] = df_sample.apply(_build_row_query, axis=1)

In [12]:
# Text get_abstract returns (as the stringified exception) when Scopus does
# not know a DOI.
NOT_FOUND_MESSAGE = 'The resource specified cannot be found.'


def _recommend_for_row(row, n=3):
    """Return the recommendations for one sampled row, or {} when skipped.

    Recommendations are requested only when every response is a retrieved
    abstract, more than two DOIs were retrieved, and a query was built.
    Not-found responses are detected with a substring test, matching how
    `cantidad_doi_scopus` is counted, so a response that merely contains the
    message is not treated as an abstract.
    """
    responses = row['response_scopus']
    all_retrieved = all(
        r is not None and NOT_FOUND_MESSAGE not in r for r in responses
    )
    if not (all_retrieved and row['cantidad_doi_scopus'] > 2 and row['query_result']):
        return {}
    return find_similar_documents(responses, row['query_result'], row['title'], n)


df_sample['Recommendation'] = df_sample.apply(_recommend_for_row, axis=1)

In [14]:
# Write the enriched sample to an Excel workbook in the working directory.
output_path = 'test_final.xlsx'
df_sample.to_excel(output_path)