# Recommendation system for arXiv manuscripts by Peter Boross

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import timedelta
import json
import arxiv
import urllib.request as libreq
import re
from collections import Counter
import unidecode
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import ShuffleSplit

### Helper functions

In [2]:
def get_authors_LastF(authors):
    """Return a space-separated string of ASCII ``LastF`` author tokens.

    Each element of ``authors`` is indexed as ``[last_name, first_names, ...]``.
    A token is the last name followed by the first character of the
    first-name field; when that field is empty the token is the last name
    alone. Every token is transliterated with ``unidecode``.
    """
    def to_token(entry):
        last_name, first_names = entry[0], entry[1]
        if len(first_names) == 0:
            raw = last_name
        else:
            raw = last_name + first_names[0]
        return unidecode.unidecode(raw)

    return ' '.join(to_token(entry) for entry in authors)

def get_authors_LastF_2(authors):
    """Return a space-separated string of ASCII ``LastF`` author tokens.

    Each element of ``authors`` is a mapping with a ``'name'`` key holding a
    full name written as ``"First [Middle ...] Last"``. A token is the last
    word of the name followed by the first character of the first word.

    Edge cases:
    - names are split on arbitrary whitespace, so leading, trailing or
      repeated spaces do not produce empty words;
    - a single-word name yields that word alone (no initial), matching what
      ``get_authors_LastF`` emits when the first-name field is empty;
    - an empty or whitespace-only name is skipped.

    Every token is transliterated with ``unidecode``.
    """
    tokens = []
    for entry in authors:
        # split() with no argument collapses whitespace runs and strips the ends.
        words = entry['name'].split()
        if not words:
            continue
        if len(words) == 1:
            raw = words[0]
        else:
            raw = words[-1] + words[0][0]
        tokens.append(unidecode.unidecode(raw))
    return ' '.join(tokens)

### Load manuscripts from the arXiv JSON snapshot provided by Kaggle

In [3]:
# arXiv categories of interest; a paper is kept if it is listed in at least one of them.
categories = {'cond-mat', 'cond-mat.mes-hall', 'quant-ph', 'cond-mat.supr-con', 'cond-mat.mtrl-sci', 'cond-mat.str-el', 'cond-mat.other'}

articles = []

# The snapshot is read as JSON Lines (one JSON object per line). The encoding is
# given explicitly so that non-ASCII metadata is decoded the same way on every
# platform instead of depending on the locale default.
with open("data/arxiv-metadata-oai-snapshot.json", "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        # 'categories' is a space-separated string of category codes.
        if not categories.isdisjoint(record['categories'].split(' ')):
            # Pre-compute the "LastF" author tokens used later for matching and as a feature.
            record['authors_LastF'] = get_authors_LastF(record['authors_parsed'])
            articles.append(record)

articles_df = pd.DataFrame.from_records(articles)

print('number of papers =',len(articles))

number of papers = 315071


### Find manuscripts of the authors

In [4]:
# "LastF" tokens of the authors whose own papers seed the recommendation model.
authors = ['BorossP','OroszlanyL','PalyiA','AsbothJ','SzechenyiG']

# The tokens are joined into one regex alternation; a paper matches when its
# author string contains any of them.
author_pattern = '|'.join(authors)
is_by_author = articles_df["authors_LastF"].str.contains(author_pattern)
ids = list(articles_df[is_by_author]['id'])

print('number of papers of the authors =',len(ids))

number of papers of the authors = 90


### Find cited papers via prophy.science

In [5]:
# Collect the arXiv ids of every reference of every paper in `ids`.
refs = []

# The loop variable is named `arxiv_id` rather than `id` so that the builtin
# `id` is not shadowed for the rest of the session.
for arxiv_id in ids:
    with libreq.urlopen('https://www.prophy.science/api/arxiv/' + arxiv_id) as url:
        refs1paper = json.loads(url.read())
    # References whose 'arxivId' is None are skipped.
    refs.extend(ref['arxivId'] for ref in refs1paper['references'] if ref['arxivId'] is not None)

# (arxiv id, citation count) pairs, most cited first.
refscounted = Counter(refs).most_common()
refs = [entry[0] for entry in refscounted]
counts = [entry[1] for entry in refscounted]
refscounteddict = dict(refscounted)

print('number of cited papers of the authors =',len(refscounted))

number of cited papers of the authors = 1501


### Make training dataset

In [36]:
# Number of non-cited (negative) papers sampled per cited (positive) paper.
NEGATIVES_PER_POSITIVE = 10
# Fixed seed so the negative sample, and everything trained on it, is reproducible.
SAMPLE_SEED = 42

is_cited = articles_df['id'].isin(refs)

# Positive class: papers cited by the authors. The citation count is looked up
# from the id column only, so no other text column is touched by the mapping.
cited_df = (
    articles_df.loc[is_cited, ['abstract','title','authors_LastF','id']]
    .rename(columns = {'id': 'citation', 'authors_LastF': 'authors'})
)
cited_df['citation'] = cited_df['citation'].map(refscounteddict)
cited_df['cited'] = True

print('number of cited papers which in the selected categories =',len(cited_df))

# Negative class: a random sample of papers that were not cited, capped at the
# number of such papers actually available.
notcited_pool = articles_df.loc[~is_cited, ['abstract','title','authors_LastF']]
n_notcited = min(NEGATIVES_PER_POSITIVE*len(cited_df), len(notcited_pool))
notcited_df = (
    notcited_pool
    .sample(n = n_notcited, random_state = SAMPLE_SEED)
    .rename(columns = {'authors_LastF': 'authors'})
)
notcited_df['citation'] = 0
notcited_df['cited'] = False

print('number of non-cited papers which in the selected categories =',len(notcited_df))

all_df = pd.concat([cited_df, notcited_df])

# Features are the three text columns; the target is the boolean 'cited' flag.
X = all_df[['authors','title','abstract']]
y = list(all_df['cited'])

number of cited papers which in the selected categories = 1458
number of non-cited papers which in the selected categories = 14580


In [38]:
# Persist the training set to CSV; the DataFrame index is written as the first, unnamed column.
all_df.to_csv('data/all_df.csv')

In [39]:
# Reload the saved training set. The file's first column is the index written
# by to_csv, so it is restored as the index instead of becoming an extra
# "Unnamed: 0" data column.
all_df_alt = pd.read_csv('data/all_df.csv', index_col=0)

### Build the model and run cross-validation

In [None]:
# Author tokens are case-sensitive ("BorossP"), so lowercasing is disabled and raw counts are used.
authors_feature = 'authors'
authors_transformer = CountVectorizer(lowercase=False, max_features = 1000)

# Titles and abstracts: TF-IDF over uni-, bi- and tri-grams with English stop words removed.
title_feature = 'title'
title_transformer = TfidfVectorizer(stop_words='english', ngram_range=(1,3), max_features = 2000)

abstract_feature = 'abstract'
abstract_transformer = TfidfVectorizer(stop_words='english', ngram_range=(1,3), max_features = 5000)

# Each column is given as a plain string (not a list) so that every text
# vectorizer receives a 1-D sequence of documents.
preprocessor = ColumnTransformer(
    transformers=[
        ('authors', authors_transformer, authors_feature),
        ('title', title_transformer, title_feature),
        ('abstract', abstract_transformer, abstract_feature)
    ])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', MultinomialNB())])

# The splitter is seeded so the reported scores are reproducible across runs.
cv_metrics = ('accuracy', 'precision', 'recall' , 'roc_auc')
scores = cross_validate(pipeline, X, y, cv=ShuffleSplit(n_splits=5, random_state=0),
                        scoring=cv_metrics,
                        return_train_score=True)

# One line per metric: mean train and test score over the splits, as percentages.
for metric in cv_metrics:
    train_pct = "{:.1f}%".format(100*np.mean(scores['train_' + metric]))
    test_pct = "{:.1f}%".format(100*np.mean(scores['test_' + metric]))
    print('train ' + metric + ' = ', train_pct, 'test ' + metric + ' =', test_pct)

### Fit the model

In [27]:
# Fit the pipeline on the full data set (X, y); the trailing semicolon suppresses the estimator repr.
pipeline.fit(X, y);

### Make a query and predict

In [28]:
# Keep only entries published on the same calendar day as the newest entry.
delta = timedelta(days=1)
catstr = '+OR+'.join(['cat:'+x for x in categories])
client = arxiv.Client()
latest = None        # publication date of the newest entry, set from the first entry seen
n_query = 50         # page size of each API request
start_query = 0
last_query = n_query
records = []

# Keep paging while every entry of the previous page was recent enough.
while last_query == n_query:
    # NOTE(review): _parse_feed is a private method of arxiv.Client and may
    # change between releases of the arxiv package - confirm against the installed version.
    feedparser = client._parse_feed(url='http://export.arxiv.org/api/query?search_query='+catstr+'&start='+str(start_query)+'&max_results='+str(n_query)+'&sortBy=submittedDate')
    last_query = 0
    for entry in feedparser.entries:
        # Only the date part (YYYY-MM-DD) of the timestamp is used.
        published = datetime.strptime(entry.published[0:10],'%Y-%m-%d')
        if latest is None:
            latest = published
        if latest - published < delta:
            last_query += 1
            records.append({
                'id' : entry.id,
                'authorsFull' : [author['name'] for author in entry.authors],
                'authors' : get_authors_LastF_2(entry.authors),
                'title' : entry.title,
                'abstract' : entry.summary,
                'published': published
            })
    start_query += n_query

# Build the frame once from the collected rows. DataFrame.append was removed in
# pandas 2.0 and copied the whole frame on every call.
df = pd.DataFrame(records, columns = ['id','authors', 'title', 'abstract', 'authorsFull', 'published'])

Xnew = df[['authors','title','abstract']]

# Column 1 of predict_proba is the probability of the positive ("cited") class.
df['pred'] = pipeline.predict_proba(Xnew)[:, 1]

In [29]:
# Ten new manuscripts with the highest predicted probability, best first.
shown_columns = ['id','authorsFull','title','abstract','pred']
ranked_df = df[shown_columns].sort_values(by=['pred'],ascending=False)
ranked_df.head(10)

Unnamed: 0,id,authorsFull,title,abstract,pred
1,http://arxiv.org/abs/2105.12725v1,"[Chiara Devescovi, Mikel García-Díez, Iñigo Ro...",Cubic 3D Chern photonic insulators with orient...,Time Reversal Symmetry (TRS) broken topologica...,0.515815
48,http://arxiv.org/abs/2105.12403v1,"[Balázs Dóra, Doru Sticlet, Cătălin Paşcu Moca]",Non-Hermitian Lindhard function and Friedel os...,The Lindhard function represents the basic bui...,0.19946
75,http://arxiv.org/abs/2105.12269v1,"[Mahmoud M. Asmar, Wang-Kong Tse]",Impurity Screening and Friedel Oscillations in...,We develop a theory for the non-equilibrium sc...,0.182955
12,http://arxiv.org/abs/2105.12641v1,"[Zachary A. H. Goodwin, Lennart Klebl, Valerio...","Flat bands, electron interactions and magnetic...","Starting with twisted bilayer graphene, graphe...",0.174965
36,http://arxiv.org/abs/2105.12470v1,"[Carlos Vega, Miguel Bello, Diego Porras, Alej...",Qubit-photon bound states in topological waveg...,Quantum emitters interacting with photonic ban...,0.146146
40,http://arxiv.org/abs/2105.12461v1,"[H. Geng, G. Y. Qi, L. Sheng, W. Chen, D. Y. X...",Proposal for engineering 3D quantum Hall effec...,The three-dimensional (3D) quantum Hall effect...,0.106699
74,http://arxiv.org/abs/2105.12275v1,"[Mahmoud M. Asmar, Gaurav Gupta, Wang-Kong Tse]",Particle-hole asymmetry and quantum confinemen...,Intrinsically broken symmetries in the bulk of...,0.09814
42,http://arxiv.org/abs/2105.12442v1,"[Olli Siltanen, Tom Kuusela, Jyrki Piilo]",Engineering of Hong-Ou-Mandel interference wit...,Hong-Ou-Mandel effect lies in the heart of qua...,0.07244
31,http://arxiv.org/abs/2105.12501v1,"[Maximilian Paleschke, Cheng-Tien Chiang, Lian...",Plasmonic Spin-Hall Effect of propagating Surf...,Photoexcitation and shaping of a propagating s...,0.064124
56,http://arxiv.org/abs/2105.12373v1,[B. Szafran],Annular confinement for electrons on liquid he...,We discuss annular confinement for electrons o...,0.063004
