# Recommendation system for arXiv manuscripts by Peter Boross

In [147]:
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import timedelta
import warnings
import time
import json
import arxiv
import sqlite3
import urllib.request as libreq
import re
from collections import Counter
import unidecode
import itertools
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_curve, auc
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

### Helper functions

In [131]:
def get_authors_FLast(authors):
    """Return the authors as one space-separated string of "FLast" tokens.

    Each element of *authors* is indexed as ``author[0]`` (used as the last
    name) and ``author[1]`` (used as the first name). A token is the first
    character of ``author[1]`` glued to ``author[0]``; when ``author[1]`` is
    empty the token is ``author[0]`` alone. Every token is transliterated
    with ``unidecode`` before joining.
    """
    tokens = [
        unidecode.unidecode(entry[1][0] + entry[0] if len(entry[1]) > 0 else entry[0])
        for entry in authors
    ]
    return ' '.join(tokens)

def get_authors_FLast_arxivapi(authors):
    """Return arXiv-API author names as one space-separated "FLast" string.

    Each element of *authors* is a full name in one string
    (e.g. ``"Peter Boross"``). The token is the first character of the first
    word glued to the last word, transliterated with ``unidecode``.

    Names consisting of a single word are emitted unchanged (no initial is
    prefixed), which matches what ``get_authors_FLast`` produces when the
    first name is empty. Names are split on any whitespace run, and blank
    names are skipped instead of raising ``IndexError``.
    """
    tokens = []
    for full_name in authors:
        parts = full_name.split()
        if not parts:
            # Blank or whitespace-only name: nothing to tokenize.
            continue
        last = parts[-1]
        if len(parts) == 1:
            # Single-word name: no first name available, keep it as is.
            tokens.append(unidecode.unidecode(last))
        else:
            tokens.append(unidecode.unidecode(parts[0][0] + last))
    return ' '.join(tokens)

def get_authors_FdotLastcomma(authors):
    """Return author names formatted as ``"F. M. Last, F. Last, ..."``.

    Each element of *authors* is a full name in one string. Every word except
    the last is reduced to its first character followed by a dot; the last
    word is kept whole. The result is transliterated with ``unidecode`` and
    the authors are joined with ``", "``.

    Names are split on any whitespace run, so doubled spaces no longer raise
    ``IndexError``; single-word names are emitted without a leading space;
    blank names are skipped.
    """
    formatted = []
    for full_name in authors:
        parts = full_name.split()
        if not parts:
            # Blank or whitespace-only name: nothing to format.
            continue
        *given, last = parts
        pieces = [name[0] + '.' for name in given] + [last]
        formatted.append(unidecode.unidecode(' '.join(pieces)))
    return ', '.join(formatted)

def progress_bar(relevance):
    """Render *relevance* as a 20-cell text bar wrapped in an HTML font tag.

    The number of filled cells is ``100 - int(100 - 100*relevance/5)``, i.e.
    effectively ``20 * relevance`` rounded up. The filled part is coloured
    red above 17 cells, orange above 14 cells, and yellow otherwise; the
    remainder of the 20 cells is padded with ``-``.
    """
    filled = 100 - int(100 - 100 * relevance / 5)

    if filled > 17:
        shade = 'red'
    elif filled > 14:
        shade = 'orange'
    else:
        shade = 'yellow'

    blocks = '█' * filled
    padding = '-' * (20 - filled)
    return f'|<font color="{shade}">{blocks}</font>{padding}|'

def print_score(scores, type):
    """Print the mean train and test value of one metric as percentages.

    *scores* is indexed with the keys ``'train_' + type`` and
    ``'test_' + type``; the mean of each entry is printed with one decimal.
    The metric name is padded to 9 characters so that consecutive calls line
    up in columns.
    """
    extra_space = 9
    label = type.ljust(extra_space)
    train_pct = "{:.1f}%".format(100 * np.mean(scores['train_' + type]))
    test_pct = "{:.1f}%".format(100 * np.mean(scores['test_' + type]))
    print('train ' + label + ' = ', train_pct, '\ttest ' + label + ' =', test_pct)

### Define categories

In [69]:
# arXiv category tags of interest. A manuscript is kept when it carries at
# least one of them; the same set builds the arXiv API query further down.
categories = {
    'cond-mat',
    'cond-mat.mes-hall',
    'quant-ph',
    'cond-mat.supr-con',
    'cond-mat.mtrl-sci',
    'cond-mat.str-el',
    'cond-mat.other',
}

### Load manuscripts from the arXiv metadata JSON snapshot from Kaggle

In [70]:
manuscripts = []

# The snapshot is read line by line, one JSON record per line. The encoding
# is given explicitly so the result does not depend on the platform default.
with open("data/arxiv-metadata-oai-snapshot.json", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        record = json.loads(line)
        # 'categories' is a space-separated list of tags; keep the record if
        # it shares at least one tag with the selected categories.
        if categories & set(record['categories'].split(' ')):
            record['authors_FLast'] = get_authors_FLast(record['authors_parsed'])
            manuscripts.append(record)

manuscripts_df = pd.DataFrame.from_records(manuscripts)

print('number of papers =',len(manuscripts))

number of papers = 315071


### Find manuscripts of the authors

In [71]:
# "FLast" tokens (first initial + last name) of the authors whose papers
# seed the training set.
authors = ['PBoross','LOroszlany','APalyi','JAsboth','GSzechenyi']

# 'authors_FLast' is a space-separated list of tokens. Match whole tokens
# only, so that e.g. 'LOroszlany' does not also select a longer token that
# merely contains it. Tokens are escaped because they are plain text, not
# regular expressions.
author_pattern = r'(?<!\S)(?:' + '|'.join(re.escape(author) for author in authors) + r')(?!\S)'

ownids = list(manuscripts_df[manuscripts_df["authors_FLast"].str.contains(author_pattern)]['id'])

print('number of papers of the authors =',len(ownids))

number of papers of the authors = 90


### Find cited papers using prophy.science

In [72]:
# Start from the authors' own papers, then add every arXiv id referenced by
# each of them according to the prophy.science API.
citedids = ownids.copy()

for ownid in ownids:
    with libreq.urlopen('https://www.prophy.science/api/arxiv/' + ownid) as response:
        refs1manuscript = json.loads(response.read())
    # References without an arXiv id carry None and are skipped.
    citedids.extend(ref['arxivId'] for ref in refs1manuscript['references'] if ref['arxivId'] is not None)

# (id, count) pairs ordered from most to least frequent; ties keep their
# first-seen order.
citedidscounted = Counter(citedids).most_common()
citedids = [entry[0] for entry in citedidscounted]
counts = [entry[1] for entry in citedidscounted]
citedidscounteddict = dict(citedidscounted)

print('number of cited papers of the authors =',len(citedidscounted))

number of cited papers of the authors = 1541


### Make training dataset and write it out

In [160]:
# Boolean mask over manuscripts_df: True where the paper's id is among the
# cited ids. Computed once and reused for both classes.
is_cited = manuscripts_df['id'].isin(citedids)

# Positive class. The citation count is looked up from the id column only,
# so no other column can be rewritten by accident.
cited_df = manuscripts_df.loc[is_cited, ['abstract','title','authors_FLast','id']].copy()
cited_df['citation'] = cited_df['id'].map(citedidscounteddict)
cited_df = cited_df.drop(columns='id')
cited_df['cited'] = True

print('number of cited papers which in the selected categories =',len(cited_df))

# Negative class: a random sample of non-cited papers, ten times the size of
# the positive class, capped at the number of papers actually available.
n_notcited = min(10*len(cited_df), int((~is_cited).sum()))
notcited_df = manuscripts_df.loc[~is_cited, ['abstract','title','authors_FLast']].sample(n = n_notcited)
notcited_df['citation'] = 0
notcited_df['cited'] = False

print('number of non-cited papers which in the selected categories =',len(notcited_df))

all_df = pd.concat([cited_df, notcited_df])

all_df.to_csv('data/traindataset.csv')

number of cited papers which in the selected categories = 1498
number of non-cited papers which in the selected categories = 14980


### Split into X and y

In [161]:
# To resume from the saved training set instead of rebuilding it, uncomment:
#all_df = pd.read_csv('data/traindataset.csv', index_col=0)

# Features are the three text columns; the target is the boolean 'cited' flag.
feature_columns = ['authors_FLast','title','abstract']
X = all_df[feature_columns]
y = list(all_df['cited'])

### Build the model and run cross-validation

In [162]:
# Column of X consumed by each vectorizer.
authors_feature = 'authors_FLast'
title_feature = 'title'
abstract_feature = 'abstract'

# Author tokens are counted with their case preserved; title and abstract
# are TF-IDF weighted over word n-grams with English stop words removed.
authors_transformer = CountVectorizer(max_features=1000, lowercase=False)
title_transformer = TfidfVectorizer(max_features=2000, ngram_range=(1,3), stop_words='english')
abstract_transformer = TfidfVectorizer(max_features=5000, ngram_range=(1,4), stop_words='english')

preprocessor = ColumnTransformer(transformers=[
    ('authors_FLast', authors_transformer, authors_feature),
    ('title', title_transformer, title_feature),
    ('abstract', abstract_transformer, abstract_feature),
])

# A resampling step such as ('balancer', SMOTE()) can be inserted between
# the two steps below; it is currently disabled.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', MultinomialNB()),
]
classifier = Pipeline(steps=pipeline_steps)

# Five stratified shuffle splits, reporting train and test scores.
scoring_metrics = ('accuracy', 'precision', 'recall' , 'f1', 'roc_auc')
scores = cross_validate(
    classifier, X, y,
    cv=StratifiedShuffleSplit(n_splits=5),
    scoring=scoring_metrics,
    return_train_score=True,
)

for metric in scoring_metrics:
    print_score(scores, metric)

train accuracy  =  94.1% 	test accuracy  = 92.7%
train precision =  65.7% 	test precision = 59.1%
train recall    =  74.5% 	test recall    = 64.3%
train f1        =  69.8% 	test f1        = 61.5%
train roc_auc   =  95.7% 	test roc_auc   = 91.9%


### GridSearch

In [142]:
# Hyper-parameter search over the size of the author vocabulary.
#
# This cell deliberately uses its own names (grid_preprocessor,
# grid_classifier) and does not rebind `classifier` or `preprocessor`:
# `classifier` is fitted and pickled further down the file, and it must stay
# the full-size pipeline rather than the small one built here for the search.
grid_preprocessor = ColumnTransformer(
    transformers=[
        ('authors_FLast', CountVectorizer(lowercase=False), 'authors_FLast'),
        ('title', TfidfVectorizer(stop_words='english', ngram_range=(1,3), max_features=20), 'title'),
        ('abstract', TfidfVectorizer(stop_words='english', ngram_range=(1,3), max_features=50), 'abstract')
    ])

# Keys address nested parameters as <pipeline step>__<transformer name>__<parameter>.
param_grids = {'preprocessor__authors_FLast__max_features':[10,20,30]}

# A resampling step such as ('balancer', RandomOverSampler()) can be inserted
# between the two steps below; it is currently disabled.
grid_classifier = Pipeline(steps=[('preprocessor', grid_preprocessor),
                                  ('classifier', MultinomialNB())])

gridsearch = GridSearchCV(grid_classifier, param_grids, cv=ShuffleSplit(n_splits=5),
                          scoring='f1',
                          return_train_score=True)

gridsearch.fit(X,y)

GridSearchCV(cv=ShuffleSplit(n_splits=5, random_state=None, test_size=None, train_size=None),
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('authors_FLast',
                                                                         CountVectorizer(lowercase=False),
                                                                         'authors_FLast'),
                                                                        ('title',
                                                                         TfidfVectorizer(max_features=20,
                                                                                         ngram_range=(1,
                                                                                                      3),
                                                                                         stop_words='english'),
                                                   

In [163]:
# Fit the pipeline on the full training set and persist it to disk.
classifier.fit(X, y)

filename = 'data/model.sav'
# The context manager guarantees the file is flushed and closed.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

### Make a query and predict

In [164]:
filename = 'data/model.sav'
# NOTE: unpickling executes code stored in the file; only load a model file
# that was produced locally by this notebook.
with open(filename, 'rb') as model_file:
    classifier = pickle.load(model_file)

days = 7
delta = timedelta(days = days)
catstr = '+OR+'.join(['cat:'+x for x in categories])
client = arxiv.Client()
nquery = 500       # page size requested from the API
startquery = 0     # offset of the next page
lastquery = nquery # number of entries kept from the last page
latestdate = None  # publication date of the first (newest) entry seen
predicted_columns = ['id','published','authors_FdotLastcomma','authors_FLast', 'title', 'abstract']
rows = []

# Page through the results (sorted by submission date) and keep entries
# published less than `delta` before the newest one. A page from which not
# every entry was kept means the window has been left, so paging stops.
while lastquery == nquery:
    # NOTE(review): _parse_feed is a private method of arxiv.Client and may
    # differ between versions of the arxiv package - confirm when upgrading.
    feedparser = client._parse_feed(url='http://export.arxiv.org/api/query?search_query='+catstr+'&start='+str(startquery)+'&max_results='+str(nquery)+'&sortBy=submittedDate')
    if len(feedparser.entries) == 0:
        warnings.warn("Warning...........arXiv api provides 0 entry")
    lastquery = 0
    for entry in feedparser.entries:
        # Only the date part (YYYY-MM-DD) of the timestamp is used.
        published = datetime.strptime(entry.published[0:10],'%Y-%m-%d')
        if latestdate is None:
            latestdate = published
        if latestdate - published < delta:
            lastquery += 1
            names = [author['name'] for author in entry.authors]
            rows.append({
                'id' : entry.id,
                'authors_FdotLastcomma' : get_authors_FdotLastcomma(names),
                'authors_FLast' : get_authors_FLast_arxivapi(names),
                'title' : entry.title.replace('\n', ' '),
                'abstract' : entry.summary.replace('\n', ' '),
                'published': published
            })
    startquery += nquery
    if lastquery == nquery:
        # Pause between consecutive API requests; no need after the last one.
        time.sleep(5)

# Build the frame once from the collected rows (DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call).
predicted_df = pd.DataFrame(rows, columns = predicted_columns)

Xnew = predicted_df[['authors_FLast','title','abstract']]

# Column 1 of predict_proba is the probability of the positive class
# (cited == True). Skip the model when nothing was retrieved.
if predicted_df.empty:
    predicted_df['relevance'] = pd.Series(dtype=float)
else:
    predicted_df['relevance'] = classifier.predict_proba(Xnew)[:, 1]
print('number of the requested papers = ',predicted_df.shape[0])

predicted_df[['id','published','authors_FdotLastcomma','title','abstract','relevance']].sort_values(by=['relevance'],ascending=False).head(5)

number of the requested papers =  390


Unnamed: 0,id,published,authors_FdotLastcomma,title,abstract,relevance
81,http://arxiv.org/abs/2106.03082v1,2021-06-06,"M. T. Madzik, S. Asaad, A. Youssry, B. Joecker...",Precision tomography of a three-qubit electron...,Nuclear spins were among the first physical pl...,1.0
260,http://arxiv.org/abs/2106.01391v1,2021-06-02,"D. Buterakos, S. D. Sarma",Spin-Valley Qubit Dynamics In Exchange Coupled...,The presence of valley states is a significant...,0.996651
42,http://arxiv.org/abs/2106.03435v1,2021-06-07,"V. L. Muller, Y. Yan, O. Kashuba, B. Trauzette...",Electron-hole scattering limited transport of ...,We experimentally investigate the effect of el...,0.988116
246,http://arxiv.org/abs/2106.01576v1,2021-06-03,"K. Kuroyama, S. Matsuo, J. Muramoto, S. Yabuna...",Real-time observation of charge-spin cooperati...,Quantum dots are recognized as a suitable plat...,0.980173
324,http://arxiv.org/abs/2106.00800v2,2021-06-01,"B. Mera, N. Goldman",Relating the topology of Dirac Hamiltonians to...,Quantum geometry has emerged as a central and ...,0.936935


### Write predictions into 'manuscripts.db'

In [84]:
tosql_df = predicted_df[['id','published','authors_FdotLastcomma','title','abstract','relevance']].rename(columns = {"authors_FdotLastcomma": "authors"})

# if_exists='replace' drops and recreates the 'manuscripts' table from the
# frame's columns, so no separate CREATE TABLE statement is needed.
conn = sqlite3.connect('data/manuscripts.db')
try:
    tosql_df.to_sql('manuscripts', conn, if_exists = 'replace', index = False)
    conn.commit()
finally:
    # Always release the database file, even if the write fails.
    conn.close()

### Write predictions into 'manuscripts.html'

In [165]:
# Imported here because this cell is the only user; the name `html` itself
# is used below for the generated markup.
from html import escape

tohtml_df = predicted_df[['id','published','authors_FdotLastcomma','title','abstract','relevance']].rename(columns = {"authors_FdotLastcomma": "authors"})

# Only papers with relevance above 0.5 are listed, newest day first and,
# within a day, most relevant first.
relevant_df = tohtml_df[tohtml_df['relevance']>0.5].sort_values(by=['published','relevance'],ascending=False)

day = None  # date of the group currently being written
parts = ['<ul>\n', '<hr>\n']
for _, row in relevant_df.iterrows():
    published = row['published']
    if day != published:
        # New day: emit a date heading. The day number is formatted by hand
        # because the '%-d' strftime code is not available on every platform.
        parts.append('<div class="date">'+str(published.day)+published.strftime(' %B, %Y')+'</div>\n')
        parts.append('<hr>\n')
        day = published
    # Bare arXiv identifier: the part after '/abs/' without the trailing
    # version suffix ('v1', 'v12', ...).
    arxiv_id = re.sub(r'v\d+$', '', row['id'].split('/abs/')[-1])
    # Text coming from the arXiv API is escaped so that characters such as
    # '<' or '&' in titles and abstracts cannot break the page.
    parts.append('<li>\n')
    parts.append('<a href="'+escape(row['id'])+'">arXiv:'+escape(arxiv_id)+'</a>\n')
    parts.append('<div class="relevance"><b>Relevance:</b>')
    parts.append('<font style="font-family:courier, monospace">'+progress_bar(row['relevance'])+'</font>')
    parts.append(str(round(100*row['relevance'],1))+'%</div>\n')
    parts.append('<div class="title"><b>Title:</b> '+escape(row['title'])+'</div>\n')
    parts.append('<div class="authors_head"><b>Authors:</b></div> ')
    parts.append('<div class="authors"><i>'+escape(row['authors'])+'</i></div>\n')
    parts.append('<div class="abstract_head"><b>Abstract:</b></div>\n')
    parts.append('<div class="abstract">'+escape(row['abstract'])+'</div>\n')
    parts.append('<br>\n')
    parts.append('</li>\n')
    parts.append('<hr>\n')
parts.append('</ul>')
html = ''.join(parts)

# The generated list replaces the '***' placeholder of the template.
# UTF-8 is set explicitly: the progress bar contains non-ASCII characters.
with open("data/template.html", "r", encoding="utf-8") as file:
    template = file.read()

html = template.replace("***", html)

with open("data/manuscripts.html", "w", encoding="utf-8") as file:
    file.write(html)