# Recommendation system for arXiv manuscripts by Peter Boross

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import timedelta
import warnings
import time
import json
import arxiv
import sqlite3
import urllib.request as libreq
import re
from collections import Counter
import unidecode
import itertools
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import ShuffleSplit

### Helper functions

In [2]:
def get_authors_FLast(authors):
    """Return a space-separated string of ASCII "FLast" author tokens.

    Each element of ``authors`` is indexed as ``author[0]`` and ``author[1]``
    (presumably last name and first name(s), as in the ``authors_parsed``
    field this helper is fed with -- confirm against the data). The token is
    the first character of ``author[1]`` followed by ``author[0]``; when
    ``author[1]`` is empty, ``author[0]`` is used alone. Every token is
    transliterated with ``unidecode.unidecode``.
    """
    tokens = []
    for entry in authors:
        first = entry[1]
        last = entry[0]
        raw = last if len(first) == 0 else first[0] + last
        tokens.append(unidecode.unidecode(raw))
    return ' '.join(tokens)

def get_authors_FLast_arxivapi(authors):
    """Return a space-separated string of ASCII "FLast" author tokens.

    ``authors`` is an iterable of full-name strings. Each name is split on
    whitespace; the token is the first character of the first word followed
    by the last word (e.g. ``"Peter Boross"`` -> ``"PBoross"``).

    Edge cases:
    * A one-word name is used as-is (no initial is prepended), which matches
      what ``get_authors_FLast`` produces when the first name is empty.
    * Empty or whitespace-only names are skipped instead of raising
      ``IndexError``.

    Every token is transliterated with ``unidecode.unidecode``.
    """
    tokens = []
    for full_name in authors:
        # split() without an argument collapses runs of whitespace, so
        # leading/trailing/double spaces never yield empty words.
        parts = full_name.split()
        if not parts:
            continue
        if len(parts) == 1:
            raw = parts[0]
        else:
            raw = parts[0][0] + parts[-1]
        tokens.append(unidecode.unidecode(raw))
    return ' '.join(tokens)

def get_authors_FdotLastcomma(authors):
    """Return a comma-separated, human-readable author list ("F. M. Last").

    ``authors`` is an iterable of full-name strings. Each name is split on
    whitespace; every word except the last is abbreviated to its first
    character plus a dot, and the last word is kept in full
    (e.g. ``"Peter Boross"`` -> ``"P. Boross"``).

    Edge cases:
    * A one-word name is emitted without a leading space.
    * Empty or whitespace-only names are skipped instead of raising
      ``IndexError``.

    Every formatted name is transliterated with ``unidecode.unidecode``.
    """
    formatted = []
    for full_name in authors:
        # split() without an argument never produces empty words, so part[0]
        # below is always valid.
        parts = full_name.split()
        if not parts:
            continue
        pieces = [part[0] + '.' for part in parts[:-1]]
        pieces.append(parts[-1])
        formatted.append(unidecode.unidecode(' '.join(pieces)))
    return ', '.join(formatted)

### Define categories

In [3]:
# arXiv category tags of interest: a paper is kept when it carries at least
# one of them, and the same tags are used to build the arXiv API query.
categories = {
    'cond-mat',
    'cond-mat.mes-hall',
    'quant-ph',
    'cond-mat.supr-con',
    'cond-mat.mtrl-sci',
    'cond-mat.str-el',
    'cond-mat.other',
}

### Load manuscripts from the arXiv metadata JSON snapshot provided by Kaggle

In [4]:
# Collect every record of the snapshot that carries at least one of the
# selected categories, adding an 'authors_FLast' field to each kept record.
articles = []

# The file is parsed one JSON document per line. The encoding is given
# explicitly so decoding does not depend on the platform's default locale.
with open("data/arxiv-metadata-oai-snapshot.json", "r", encoding="utf-8") as snapshot:
    for line in snapshot:
        record = json.loads(line)
        # 'categories' is a space-separated string of category tags.
        if categories & set(record['categories'].split(' ')):
            record['authors_FLast'] = get_authors_FLast(record['authors_parsed'])
            articles.append(record)

# from_records is a classmethod; call it on the class rather than on a
# throwaway empty DataFrame instance.
articles_df = pd.DataFrame.from_records(articles)

print('number of papers =',len(articles))

number of papers = 315071


### Find manuscripts of the authors

In [5]:
# "FLast" tokens (first initial + last name) of the authors whose papers
# seed the recommender.
authors = ['PBoross','LOroszlany','APalyi','JAsboth','GSzechenyi']

# Match whole whitespace-delimited tokens only. A plain substring search
# would also select look-alike names (e.g. 'LOroszlany' inside 'LOroszlanyi').
# re.escape keeps any regex metacharacter in a name literal, and the group is
# non-capturing so pandas does not warn about unused match groups.
author_pattern = r'(?<!\S)(?:' + '|'.join(re.escape(name) for name in authors) + r')(?!\S)'
is_by_author = articles_df["authors_FLast"].str.contains(author_pattern)

ids = list(articles_df.loc[is_by_author, 'id'])

print('number of papers of the authors =',len(ids))

number of papers of the authors = 90


### Find the papers cited by the authors using prophy.science

In [6]:
# Gather the arXiv ids of all references of the authors' papers, one
# prophy.science API request per paper.
refs = []

# 'arxiv_id' rather than 'id', so the builtin id() is not shadowed for the
# rest of the session.
for arxiv_id in ids:
    with libreq.urlopen('https://www.prophy.science/api/arxiv/' + arxiv_id) as response:
        refs1paper = json.loads(response.read())
    # References whose 'arxivId' is None are left out.
    refs.extend(ref['arxivId'] for ref in refs1paper['references'] if ref['arxivId'] is not None)

# (arXiv id, number of citations) pairs, most cited first; ties keep their
# first-seen order.
refscounted = Counter(refs).most_common()
refs = [cited_id for cited_id, _ in refscounted]
counts = [count for _, count in refscounted]
refscounteddict = dict(refscounted)

print('number of cited papers of the authors =',len(refscounted))

number of cited papers of the authors = 1501


### Make training dataset and write it out

In [7]:
# Boolean mask over articles_df: True where the paper is one of the cited
# references. Computed once and reused for both classes.
is_cited = articles_df['id'].isin(refs)

# Positive class: cited papers. The 'id' column is turned into the citation
# count by mapping that column only; a frame-wide replace() would also rewrite
# any other cell that happened to equal an arXiv id.
cited_df = articles_df.loc[is_cited, ['abstract','title','authors_FLast','id']].copy()
cited_df['id'] = cited_df['id'].map(refscounteddict)
cited_df = cited_df.rename(columns = {'id': 'citation'})
cited_df['cited'] = True

print('number of cited papers which in the selected categories =',len(cited_df))

# Negative class: a random sample of non-cited papers, ten times as many as
# the cited ones.
notcited_df = articles_df.loc[~is_cited, ['abstract','title','authors_FLast']].sample(n = 10*len(cited_df))
notcited_df['citation'] = 0
notcited_df['cited'] = False

print('number of non-cited papers which in the selected categories =',len(notcited_df))

all_df = pd.concat([cited_df, notcited_df])

all_df.to_csv('data/traindataset.csv')

number of cited papers which in the selected categories = 1458
number of non-cited papers which in the selected categories = 14580


### Split into X and y

In [8]:
# Uncomment to start from the saved training set instead of the in-memory one:
#all_df = pd.read_csv('data/traindataset.csv', index_col=0)

# Features: the three text columns. Target: the boolean 'cited' flag.
feature_columns = ['authors_FLast','title','abstract']
X = all_df[feature_columns]
y = all_df['cited'].tolist()

### Build the model and run cross-validation

In [9]:
# One vectorizer per text column: raw token counts for the author tokens
# (case preserved), TF-IDF over 1- to 3-grams for title and abstract.
authors_feature = 'authors_FLast'
authors_transformer = CountVectorizer(max_features=1000, lowercase=False)

title_feature = 'title'
title_transformer = TfidfVectorizer(max_features=2000, ngram_range=(1,3), stop_words='english')

abstract_feature = 'abstract'
abstract_transformer = TfidfVectorizer(max_features=5000, ngram_range=(1,3), stop_words='english')

column_steps = [
    ('authors_FLast', authors_transformer, authors_feature),
    ('title', title_transformer, title_feature),
    ('abstract', abstract_transformer, abstract_feature),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Vectorized columns feed a multinomial naive Bayes classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MultinomialNB()),
])

# Five random shuffle splits; train scores are kept next to the test scores.
scores = cross_validate(
    pipeline, X, y,
    cv=ShuffleSplit(n_splits=5),
    scoring=('accuracy', 'precision', 'recall' , 'roc_auc'),
    return_train_score=True,
)

# Report the mean of each metric over the splits, as a percentage.
for metric in ('accuracy', 'precision', 'recall', 'roc_auc'):
    train_pct = 100*np.mean(scores['train_' + metric])
    test_pct = 100*np.mean(scores['test_' + metric])
    print('train ' + metric + ' = ', f"{train_pct:.1f}%", 'test ' + metric + ' =', f"{test_pct:.1f}%")

train accuracy =  94.1% test accuracy = 92.5%
train precision =  65.4% test precision = 58.6%
train recall =  73.7% test recall = 60.4%
train roc_auc =  95.6% test roc_auc = 91.7%


### Fit the model and save it

In [10]:
# Fit on the full dataset (the trailing ';' suppresses the notebook's echo
# of the fitted pipeline).
pipeline.fit(X, y);

filename = 'data/model.sav'
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way.
with open(filename, 'wb') as model_file:
    pickle.dump(pipeline, model_file)

### Make a query and predict

In [17]:
filename = 'data/model.sav'
# NOTE: unpickling executes code stored in the file; only load model files
# produced by this notebook.
with open(filename, 'rb') as model_file:
    pipeline = pickle.load(model_file)

days = 3
delta = timedelta(days = days)
catstr = '+OR+'.join(['cat:'+x for x in categories])
client = arxiv.Client()
nquery = 500            # page size requested from the arXiv API
startquery = 0          # offset of the current page
lastquery = nquery      # number of entries of the last page inside the window
latestdate = None       # publication date of the first entry returned
predicted_columns = ['id','published','authors_FdotLastcomma','authors_FLast', 'title', 'abstract']
rows = []

# Page through the results, keeping the entries published less than `delta`
# before the first entry returned. Paging stops as soon as a page is not
# entirely inside that window.
while lastquery == nquery:
    feedparser = client._parse_feed(url='http://export.arxiv.org/api/query?search_query='+catstr+'&start='+str(startquery)+'&max_results='+str(nquery)+'&sortBy=submittedDate')
    if len(feedparser.entries) == 0:
        warnings.warn("Warning...........arXiv api provides 0 entry")
    lastquery = 0
    for entry in feedparser.entries:
        # Only the date part (YYYY-MM-DD) of the timestamp is used.
        published = datetime.strptime(entry.published[0:10],'%Y-%m-%d')
        if latestdate is None:
            latestdate = published
        if latestdate - published < delta:
            lastquery += 1
            names = [author['name'] for author in entry.authors]
            rows.append({
                'id' : entry.id,
                'authors_FdotLastcomma' : get_authors_FdotLastcomma(names),
                'authors_FLast' : get_authors_FLast_arxivapi(names),
                'title' : entry.title.rstrip(),
                'abstract' : entry.summary.rstrip(),
                'published': published,
            })
    startquery += nquery
    time.sleep(5)

# Build the frame once from the collected rows: DataFrame.append copied the
# whole frame on every call and no longer exists in pandas >= 2.0.
predicted_df = pd.DataFrame(rows, columns=predicted_columns)

Xnew = predicted_df[['authors_FLast','title','abstract']]

# Relevance is column 1 of predict_proba, i.e. the probability of the second
# class in the classifier's classes_ (the True / "cited" label when the model
# was trained on a boolean target). With no entries, skip the prediction
# instead of calling predict_proba on empty input.
predicted_df['relevance'] = pipeline.predict_proba(Xnew)[:, 1] if len(Xnew) else []
print('number of the requested papers = ',predicted_df.shape[0])

predicted_df[['id','published','authors_FdotLastcomma','title','abstract','relevance']].sort_values(by=['relevance'],ascending=False).head(5)

number of the requested papers =  161


Unnamed: 0,id,published,authors_FdotLastcomma,title,abstract,relevance
106,http://arxiv.org/abs/2105.14864v1,2021-05-31,"A. Rossi, N. W. Hendrickx, A. Sammak, M. Veldh...",Single-Hole Pump in Germanium,Single-charge pumps are the main candidates fo...,0.855058
119,http://arxiv.org/abs/2105.14763v1,2021-05-31,"N. John, A. D. Maestro, B. Rosenow",Robustness of Helical Edge States Under Edge R...,The helical edge states of time-reversal invar...,0.7614
14,http://arxiv.org/abs/2106.00495v1,2021-06-01,"S. R. Mudi, S. M. Frolov",Model for missing Shapiro steps due to bias-de...,Majorana zero modes are predicted in several s...,0.753026
32,http://arxiv.org/abs/2106.00338v1,2021-06-01,"M. Noro, J. Tanaka, T. Yokoyama, S. Murakami",Theory of Chiral Transport in Carbon Nanotubes,Based on the similarity between the chiral nan...,0.723917
70,http://arxiv.org/abs/2105.15201v1,2021-05-31,"M. Carroll, S. Rosenblatt, P. Jurcevic, I. Lau...",Dynamics of superconducting qubit relaxation t...,Superconducting qubits are a leading candidate...,0.652492


### Write predictions into 'manuscripts.db'

In [18]:
# Store the scored manuscripts in the 'manuscripts' table of the SQLite file.
conn = sqlite3.connect('data/manuscripts.db')
try:
    c = conn.cursor()

    c.execute('CREATE TABLE IF NOT EXISTS manuscripts (id, published, authors, title, abstract, relevance)')
    conn.commit()

    tosql_df = predicted_df[['id','published','authors_FdotLastcomma','title','abstract','relevance']]

    # NOTE(review): with if_exists='replace' pandas is documented to drop and
    # recreate the table, so the stored columns would be those of tosql_df
    # ('authors_FdotLastcomma', not 'authors') -- confirm which column name
    # the readers of this database expect.
    tosql_df.to_sql('manuscripts', conn, if_exists='replace', index = False)
    conn.commit()
finally:
    # Always release the database file, even if one of the writes fails.
    conn.close()