In [4]:
from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
import json
from tqdm import tqdm
import hashlib
from bs4 import BeautifulSoup
import requests
from time import sleep
import pandas as pd

In [5]:
con_file = open("config.json")
config = json.load(con_file)
con_file.close()

## Initialize client
client = ElsClient(config['apikey'])
client.inst_token = config['insttoken']

In [6]:
my_auth = ElsAuthor(uri = 'https://api.elsevier.com/content/author/author_id/7004367821')

### Поиск статей

In [12]:
doc_srch = ElsSearch("(KEY quantum) AND (PUBYEAR = 2023) AND SUBJAREA(COMP)",'scopus')
doc_srch.execute(client, get_all = True)
print ("doc_srch has", len(doc_srch.results), "results.")

doc_srch has 324 results.


In [18]:
refs = [r['prism:url'] for r in doc_srch.results]

In [38]:
doc_srch.results[0]

{'@_fa': 'true',
 'link': [{'@_fa': 'true',
   '@ref': 'self',
   '@href': 'https://api.elsevier.com/content/abstract/scopus_id/85145769887'},
  {'@_fa': 'true',
   '@ref': 'author-affiliation',
   '@href': 'https://api.elsevier.com/content/abstract/scopus_id/85145769887?field=author,affiliation'},
  {'@_fa': 'true',
   '@ref': 'scopus',
   '@href': 'https://www.scopus.com/inward/record.uri?partnerID=HzOxMe3b&scp=85145769887&origin=inward'},
  {'@_fa': 'true',
   '@ref': 'scopus-citedby',
   '@href': 'https://www.scopus.com/inward/citedby.uri?partnerID=HzOxMe3b&scp=85145769887&origin=inward'}],
 'prism:url': 'https://api.elsevier.com/content/abstract/scopus_id/85145769887',
 'dc:identifier': 'SCOPUS_ID:85145769887',
 'eid': '2-s2.0-85145769887',
 'dc:title': 'Machine-learning atomic simulation for heterogeneous catalysis',
 'dc:creator': 'Chen D.',
 'prism:publicationName': 'npj Computational Materials',
 'prism:eIssn': '20573960',
 'prism:volume': '9',
 'prism:issueIdentifier': '1',
 

In [14]:
def get_doi_list(results, http=True, scihub=False):
    if scihub:
        pre = "https://sci-hub.do/"
    else:
        pre = "https://doi.org/"
    new_results = list(filter(lambda x: 'prism:doi' in x, results))
    return [f"{pre}{el['prism:doi']}" for el in new_results]

In [31]:
def get_full_text(doi):
    doi_doc = FullDoc(doi = doi)
    if doi_doc.read(client):
        print ("doi_doc.title: ", doi_doc.title)
        doi_doc.write()
    return doi_doc.data


def get_open_texts(results):
    open_docs = []
    open_texts = []
    for res in tqdm(results):
        if res['openaccess'] == '1':
            open_docs.append(res)
            open_texts.append(get_full_text(res['prism:doi']))
    return open_docs, open_texts

def filter_results(results, andwords, orwords, notwords):
    res = []
    for result in results:
        flag = True
        for word in andwords:
            if word not in result['dc:title'].lower():
                flag = False
                break
        orflag = False
        for word in orwords:
            if word in result['dc:title'].lower():
                orflag = True
                break
        notflag = True
        for word in notwords:
            if word in result['dc:title'].lower():
                notflag = False
                break
        if flag and orflag and notflag:
            res.append(result)
    return res

In [16]:
def download_from_scihub(link):
    with open('tmp.html', 'wb') as f:
        f.write(requests.get(link).content)
        
    with open("tmp.html", "r") as f:
        contents = f.read()
        soup = BeautifulSoup(contents, 'lxml')
        src = soup.find("iframe", id="pdf")['src']
        hash_object = hashlib.md5(link.encode())
        with open(f'scihub_downloads/{hash_object.hexdigest()}.pdf', 'wb') as ff:
            if not src.startswith('http'):
                src = 'http:'+src
            ff.write(requests.get(src).content)

In [29]:
doc_srch.results[0]['doi']

KeyError: 'doi'

In [32]:
get_full_text(doc_srch.results[0]['prism:doi'])

In [33]:
doi_doc = FullDoc(doi = doc_srch.results[0]['prism:doi'])
if doi_doc.read(client):
    print ("doi_doc.title: ", doi_doc.title)
    doi_doc.write()

In [17]:
for link in tqdm(refs):
    try:
        download_from_scihub(link)
        sleep(3)
    except Exception as e:
        print(e)

NameError: name 'refs' is not defined

In [139]:
refs

['https://sci-hub.do/10.1016/j.childyouth.2020.105514',
 'https://sci-hub.do/10.3390/su12208519',
 'https://sci-hub.do/10.1007/s10964-020-01245-7',
 'https://sci-hub.do/10.1111/bjep.12292',
 'https://sci-hub.do/10.1080/02643944.2020.1713870',
 'https://sci-hub.do/10.1080/00131881.2020.1750305',
 'https://sci-hub.do/10.1080/08856257.2019.1643145',
 'https://sci-hub.do/10.1080/08856257.2019.1628338',
 'https://sci-hub.do/10.1111/bjep.12276',
 'https://sci-hub.do/10.17583/ijep.2020.4463',
 'https://sci-hub.do/10.1007/s12187-019-09674-y',
 'https://sci-hub.do/10.1007/s12187-019-09646-2',
 'https://sci-hub.do/10.1080/02643944.2019.1700546',
 'https://sci-hub.do/10.1080/19411243.2019.1647814',
 'https://sci-hub.do/10.1080/03004279.2019.1573264',
 'https://sci-hub.do/10.1080/03057925.2018.1510304',
 'https://sci-hub.do/10.1007/s10803-020-04655-5',
 'https://sci-hub.do/10.18848/2327-7963/CGP/V27I02/17-37',
 'https://sci-hub.do/10.1002/pits.22319',
 'https://sci-hub.do/10.1080/00313831.2020.179

In [19]:
df = pd.DataFrame()

In [20]:
df['link'] = refs
df['doi'] = list(map(lambda x: x.strip('https://sci-hub.do/'), refs))
df['filename'] = list(map(lambda x: hashlib.md5(x.encode()).hexdigest(), df['link']))

In [144]:
df.to_csv('links.csv', index=False)

In [7]:
from surprise import SVD
from surprise import Dataset, Reader
import pandas as pd

raw_data = pd.DataFrame([
    ['user1', 'item1', 5],
    ['user1', 'item2', 3],
    ['user2', 'item1', 4],
    ['user2', 'item2', 2]
], columns=['user', 'item', 'rating'])


algo = SVD()
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(raw_data, reader)
trainset = data.build_full_trainset()

algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f94bf2d7100>

In [15]:
algo.predict('user34', 'item2')

Prediction(uid='user34', iid='item2', r_ui=None, est=3.3176996441538043, details={'was_impossible': False})