# Recommendation system for arXiv manuscripts by Peter Boross

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import timedelta
import json
import arxiv
import urllib.request as libreq
import re
from collections import Counter
import unidecode
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import ShuffleSplit

### Helper functions

In [2]:
def get_authors_LastF(authors):
    """Build a space-separated string of ASCII author tokens.

    For every entry, element 0 is concatenated with the first character of
    element 1 (element 0 is used alone when element 1 is empty). Each token is
    transliterated with ``unidecode`` and the tokens are joined by one space.

    Parameters
    ----------
    authors : iterable of indexable
        Presumably ``[last_name, first_names, ...]`` entries, as in the
        ``authors_parsed`` field this is called with -- confirm against caller.

    Returns
    -------
    str
        The joined tokens; an empty string for an empty input.
    """
    tokens = [
        unidecode.unidecode(entry[0] + entry[1][0] if len(entry[1]) != 0 else entry[0])
        for entry in authors
    ]
    return ' '.join(tokens)

def get_authors_LastF_2(authors):
    """Build a space-separated string of ASCII author tokens from full names.

    Each entry's ``'name'`` value is split on whitespace; the token is the last
    word followed by the first character of the first word. Tokens are
    transliterated with ``unidecode`` and joined by one space.

    Parameters
    ----------
    authors : iterable of mapping
        Entries exposing the full name under the ``'name'`` key.

    Returns
    -------
    str
        The joined tokens; an empty string when there is nothing to emit.
    """
    tokens = []
    for entry in authors:
        # split() without a separator ignores leading, trailing and repeated
        # whitespace, so no empty "word" can reach the [0] index below.
        words = entry['name'].split()
        if not words:
            # Blank name: nothing meaningful to emit.
            continue
        if len(words) == 1:
            # A single-word name has no separate first name; emit it bare,
            # as get_authors_LastF does when the first name is empty.
            token = words[0]
        else:
            token = words[-1] + words[0][0]
        tokens.append(unidecode.unidecode(token))
    return ' '.join(tokens)

### Define categories

In [7]:
# arXiv category codes of interest; a paper is kept when it carries at least
# one of them, and the same codes are used to build the API query.
categories = {
    'cond-mat',
    'cond-mat.mes-hall',
    'cond-mat.mtrl-sci',
    'cond-mat.other',
    'cond-mat.str-el',
    'cond-mat.supr-con',
    'quant-ph',
}

### Load manuscripts from the arXiv metadata JSON snapshot (Kaggle)

In [None]:
# Stream the snapshot one JSON document per line and keep only the records
# that carry at least one of the selected categories.
articles = []

# The encoding is stated explicitly so that reading the JSON file does not
# depend on the platform's default text encoding.
with open("data/arxiv-metadata-oai-snapshot.json", "r", encoding="utf-8") as snapshot:
    for line in snapshot:
        record = json.loads(line)
        # 'categories' is a space-separated string of category codes.
        if categories & set(record['categories'].split(' ')):
            # Pre-compute the author tokens used later as a model feature.
            record['authors_LastF'] = get_authors_LastF(record['authors_parsed'])
            articles.append(record)

# from_records is a classmethod; no throwaway DataFrame instance is needed.
articles_df = pd.DataFrame.from_records(articles)

print('number of papers =',len(articles))

### Find the manuscripts of the authors

In [None]:
# Author tokens (last name + first initial, ASCII) whose papers seed the search.
authors = ['BorossP','OroszlanyL','PalyiA','AsbothJ','SzechenyiG']

# Match whole space-delimited tokens only: a plain substring search would also
# hit longer tokens that merely end with one of the names (e.g. "Kiss-PalyiA").
# re.escape keeps any regex metacharacter in a name literal.
author_pattern = r'(?<!\S)(?:' + '|'.join(re.escape(name) for name in authors) + r')(?!\S)'

is_by_author = articles_df["authors_LastF"].str.contains(author_pattern, regex=True)
ids = list(articles_df.loc[is_by_author, 'id'])

print('number of papers of the authors =',len(ids))

### Find the papers cited by the authors (via prophy.science)

In [None]:
# Collect the arXiv ids of every reference of every paper of the authors.
refs = []

# The loop variable is deliberately not called `id`: at notebook scope that
# would shadow the builtin for every later cell.
for paper_id in ids:
    with libreq.urlopen('https://www.prophy.science/api/arxiv/' + paper_id) as response:
        refs1paper = json.loads(response.read())
    # References without an arXiv id (None) cannot be matched to the snapshot.
    refs.extend(ref['arxivId'] for ref in refs1paper['references'] if ref['arxivId'] is not None)

# (arxiv id, number of citations) pairs, most cited first.
refscounted = Counter(refs).most_common()
refs = [entry[0] for entry in refscounted]
counts = [entry[1] for entry in refscounted]
refscounteddict = dict(refscounted)

print('number of cited papers of the authors =',len(refscounted))

### Make training dataset

In [None]:
# Positive class: papers of the selected categories that the authors cited.
is_cited = articles_df['id'].isin(refs)

cited_df = (
    articles_df.loc[is_cited, ['abstract', 'title', 'authors_LastF', 'id']]
    # Map only the id column to its citation count; a frame-wide replace()
    # would also scan the text columns and is slow with a large dict.
    .assign(id=lambda frame: frame['id'].map(refscounteddict))
    .rename(columns = {'id': 'citation', 'authors_LastF': 'authors'})
)
cited_df['cited'] = True

print('number of cited papers which in the selected categories =',len(cited_df))

# Negative class: a random sample of non-cited papers, ten per cited paper.
notcited_pool = articles_df.loc[~is_cited, ['abstract', 'title', 'authors_LastF']]
# sample() raises when asked for more rows than exist, so cap the request.
n_notcited = min(10*len(cited_df), len(notcited_pool))
notcited_df = (
    notcited_pool
    # Fixed seed so that re-running the notebook rebuilds the same dataset.
    .sample(n = n_notcited, random_state = 42)
    .rename(columns = {'authors_LastF': 'authors'})
)
notcited_df['citation'] = 0
notcited_df['cited'] = False

print('number of non-cited papers which in the selected categories =',len(notcited_df))

all_df = pd.concat([cited_df, notcited_df])

# Persist the dataset so the modelling cells can start from the CSV.
all_df.to_csv('data/all_df.csv')

### Split into X and y

In [4]:
# Reload the saved dataset; the first CSV column holds the original index.
all_df = pd.read_csv('data/all_df.csv', index_col=0)

# Features: the three text columns. Target: the boolean 'cited' flag.
X = all_df.loc[:, ['authors','title','abstract']]
y = all_df['cited'].tolist()

### Build the model and run cross-validation

In [5]:
# One vectorizer per text column.
# Authors: raw token counts, case preserved (tokens look like "BorossP").
authors_feature = 'authors'
authors_transformer = CountVectorizer(lowercase=False, max_features = 1000)

# Title and abstract: TF-IDF over 1- to 3-grams with English stop words removed.
title_feature = 'title'
title_transformer = TfidfVectorizer(stop_words='english', ngram_range=(1,3), max_features = 2000)

abstract_feature = 'abstract'
abstract_transformer = TfidfVectorizer(stop_words='english', ngram_range=(1,3), max_features = 5000)

preprocessor = ColumnTransformer(
    transformers=[
        ('authors', authors_transformer, authors_feature),
        ('title', title_transformer, title_feature),
        ('abstract', abstract_transformer, abstract_feature)
    ])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', MultinomialNB())])

# Seed the random splits so the reported metrics are reproducible.
cv = ShuffleSplit(n_splits=5, random_state=0)

scores = cross_validate(pipeline, X, y, cv=cv,
                        scoring=('accuracy', 'precision', 'recall' , 'roc_auc'),
                        return_train_score=True)

# Report the mean train/test value of every metric over the splits.
for metric in ('accuracy', 'precision', 'recall', 'roc_auc'):
    train_mean = 100*np.mean(scores['train_' + metric])
    test_mean = 100*np.mean(scores['test_' + metric])
    print('train ' + metric + ' = ',"{:.1f}%".format(train_mean),'test ' + metric + ' =',"{:.1f}%".format(test_mean))

train accuracy =  94.0% test accuracy = 93.1%
train precision =  65.3% test precision = 60.5%
train recall =  74.1% test recall = 61.1%
train roc_auc =  95.6% test roc_auc = 91.5%


### Fit the model

In [6]:
# Train the pipeline on the full labelled set (X, y).
# The trailing semicolon suppresses the cell's output repr.
pipeline.fit(X, y);

### Make a query and predict

In [10]:
# Query window: keep entries published less than `delta` before the newest one.
delta = timedelta(days=10)
catstr = '+OR+'.join(['cat:'+x for x in categories])
client = arxiv.Client()
latest = None          # publication date of the newest entry, set on first hit
n_query = 50           # page size of each API request
start_query = 0
last_query = n_query   # number of in-window entries found on the last page
records = []

# Keep paging while every entry of the previous page was inside the window.
while last_query == n_query:
    # NOTE(review): _parse_feed is a private method of arxiv.Client; its
    # signature may differ between versions of the arxiv package -- confirm.
    feed = client._parse_feed(url='http://export.arxiv.org/api/query?search_query='+catstr+'&start='+str(start_query)+'&max_results='+str(n_query)+'&sortBy=submittedDate')
    last_query = 0
    for entry in feed.entries:
        # Only the date part (YYYY-MM-DD) of the timestamp is used.
        published = datetime.strptime(entry.published[0:10],'%Y-%m-%d')
        if latest is None:
            latest = published
        if latest - published < delta:
            last_query += 1
            records.append({
                'id' : entry.id,
                'authorsFull' : [author['name'] for author in entry.authors],
                'authors' : get_authors_LastF_2(entry.authors),
                'title' : entry.title,
                'abstract' : entry.summary,
                'published': published
            })
    start_query += n_query

# Build the frame once from the collected records: DataFrame.append was
# removed in pandas 2.0 and growing a frame row by row is quadratic.
df = pd.DataFrame(records, columns = ['id','authors', 'title', 'abstract', 'authorsFull', 'published'])

Xnew = df[['authors','title','abstract']]

# Probability of the positive ("cited") class; skip the model when the query
# returned nothing, since it cannot score zero samples.
if df.empty:
    df['pred'] = np.array([], dtype=float)
else:
    df['pred'] = pipeline.predict_proba(Xnew)[:, 1]

In [11]:
# Show the ten new manuscripts with the highest predicted probability.
(
    df[['id','authorsFull','title','abstract','pred']]
    .sort_values(by=['pred'], ascending=False)
    .head(10)
)

Unnamed: 0,id,authorsFull,title,abstract,pred
127,http://arxiv.org/abs/2105.11850v1,"[Wenjun Kuang, Guillermo Lopez-Polin, Hyungjun...",Magnetization signature of topological surface...,Superconductors with nontrivial band structure...,0.989659
100,http://arxiv.org/abs/2105.12088v1,"[Arnau Sala, Jeroen Danon]",Line Shapes of Electric Dipole Spin Resonance ...,Electric dipole spin resonance (EDSR) is a com...,0.982567
167,http://arxiv.org/abs/2105.11489v1,"[Nguyen Minh Nguyen, Wojciech Brzezicki, Timo ...","Corner states, hinge states and Majorana modes...",SnTe materials are one of the most flexible ma...,0.749926
140,http://arxiv.org/abs/2105.11729v1,"[Jan David Brehm, Paul Pöpperl, Alexander D. M...",Tunable Anderson Localization of Dark States,Random scattering of photons in disordered one...,0.622407
159,http://arxiv.org/abs/2105.11560v1,"[Jorge Cayao, Pablo Burset]",Confinement-induced zero-bias peaks in convent...,Majorana bound states in topological supercond...,0.612309
1,http://arxiv.org/abs/2105.12725v1,"[Chiara Devescovi, Mikel García-Díez, Iñigo Ro...",Cubic 3D Chern photonic insulators with orient...,Time Reversal Symmetry (TRS) broken topologica...,0.599044
102,http://arxiv.org/abs/2105.12074v1,"[Jeffrey A. Ivie, Quinn Campbell, Justin C. Ko...",The impact of stochastic incorporation on atom...,Scanning tunneling microscope lithography can ...,0.514333
79,http://arxiv.org/abs/2105.12209v1,"[Guoqing Wang, Changhao Li, Paola Cappellaro]",Observation of symmetry-protected selection ru...,"Periodically driven quantum systems, known as ...",0.349097
99,http://arxiv.org/abs/2105.12090v1,"[M. Naumann, F. Arnold, Z. Medvecka, S. -C. Wu...",Weyl nodes close to the Fermi energy in NbAs,The noncentrosymmetric transition metal monopn...,0.315583
198,http://arxiv.org/abs/2105.11244v1,"[Huiying Huang, Diana Csontosová, Santanu Mann...",Electric-field induced tuning of electronic co...,We conduct a combined experimental and theoret...,0.309488
