Skip to content

Commit

Permalink
Calculating Metrics for Software Records (#52)
Browse files Browse the repository at this point in the history
Changes to CitationCapture that allow it to produce the flat files necessary for ADSDataPipeline to generate metrics for software records:
- Canonical Bibcodes
- Citations
- References
- Author Names
Additional changes allow CitationCapture to import and include reader data in the nonbib record.
  • Loading branch information
tjacovich committed Sep 20, 2022
1 parent a24da27 commit 5cda8d2
Show file tree
Hide file tree
Showing 11 changed files with 977 additions and 80 deletions.
151 changes: 149 additions & 2 deletions ADSCitationCapture/db.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import os
from typing import OrderedDict
from psycopg2 import IntegrityError
from dateutil.tz import tzutc
from ADSCitationCapture.models import Citation, CitationTarget, Event
from ADSCitationCapture.models import Citation, CitationTarget, Event, Reader
from ADSCitationCapture import doi
from adsmsg import CitationChange
import datetime
from adsputils import setup_logging

# ============================= INITIALIZATION ==================================== #
Expand All @@ -18,6 +20,13 @@
level=config.get('LOGGING_LEVEL', 'INFO'),
attach_stdout=config.get('LOG_STDOUT', False))

# Output flat files consumed by ADSDataPipeline to compute metrics for
# software records: canonical bibcodes, the two network files, and the
# facet author list.
file_names = OrderedDict(
    [
        ('bibcode', proj_home + '/logs/output/bibcodes_CC.can.list'),
        ('citations', proj_home + '/logs/output/citations_CC.list'),
        ('references', proj_home + '/logs/output/references_CC.list'),
        ('authors', proj_home + '/logs/output/facet_authors_CC.list'),
    ]
)


# =============================== FUNCTIONS ======================================= #
def store_event(app, data):
Expand Down Expand Up @@ -101,6 +110,69 @@ def update_citation_target_metadata(app, content, raw_metadata, parsed_metadata,
metadata_updated = _update_citation_target_metadata_session(session, content, raw_metadata, parsed_metadata, curated_metadata, status=status, bibcode=bibcode, associated=associated)
return metadata_updated

def write_citation_target_data(app, only_status=None):
    """
    Write the flat files that ADSDataPipeline needs to generate metrics for
    software records:
      - canonical bibcodes file
      - citation network file
      - reference network file
      - facet authors file

    Each file is written to '<path>.tmp' first and then copied over the final
    path, so consumers never observe a partially written file.

    :param app: ADSCitationCapture celery app (provides session_scope)
    :param only_status: if given, restrict to citation targets with this status
    """
    with app.session_scope() as session:
        if only_status:
            records_db = session.query(CitationTarget).filter_by(status=only_status).all()
            disable_filter = only_status in ['DISCARDED', 'EMITTABLE']
        else:
            records_db = session.query(CitationTarget).all()
            disable_filter = True
        bibcodes = [r.bibcode for r in records_db]
        records = _extract_key_citation_target_data(records_db, disable_filter=disable_filter)
        # Write canonical bibcodes to file.
        with open(file_names['bibcode'] + ".tmp", 'w') as f:
            f.write("\n".join(bibcodes))
        logger.info("Writing Citation/Reference Network Files.")
        _write_key_citation_reference_data(app, bibcodes)
        logger.info("Writing author data for {} records".format(len(records)))
        _write_key_citation_target_authors(app, records)
        # Promote each .tmp file to its final location.
        # Bug fix: iterate the file *paths* (dict values); the old code used the
        # dict keys, so it tried to copy e.g. 'bibcode.tmp' instead of the real
        # output path (and logged the wrong names too).
        for path in file_names.values():
            os.system('cp {} {}'.format(path + ".tmp", path))
            logger.debug("Copied {}.tmp to {}".format(path, path))

def _write_key_citation_target_authors(app, records):
    """
    Write the facet-author flat file: one line per record containing the
    bibcode followed by its tab-separated author list.

    :param app: ADSCitationCapture celery app
    :param records: list of dicts as produced by _extract_key_citation_target_data
    :raises Exception: if the file cannot be written (original error is chained)
    """
    out_path = file_names['authors'] + ".tmp"
    try:
        with open(out_path, 'w') as f:
            for rec in records:
                parsed_metadata = get_citation_target_metadata(app, rec['content']).get('parsed', {})
                if parsed_metadata:
                    f.write(str(rec['bibcode']) + "\t" + "\t".join(parsed_metadata.get('authors', '')) + "\n")

        # Log the actual path (the old code formatted the literal 'authors').
        logger.info("Wrote file {} to disk.".format(out_path))
    except Exception as e:
        logger.exception("Failed to write file {}.".format(out_path))
        # Chain the original exception so the root cause is preserved.
        raise Exception("Failed to write file {}.".format(out_path)) from e

def _write_key_citation_reference_data(app, bibcodes):
    """
    Write the two network flat files:
      - Citation Network File: X cites software record
      - Reference Network File: software record is cited by X
    Both are needed to integrate software records into classic record metrics.

    :param app: ADSCitationCapture celery app
    :param bibcodes: canonical bibcodes of the software records
    :raises Exception: if either file cannot be written (original error is chained)
    """
    cit_path = file_names['citations'] + ".tmp"
    ref_path = file_names['references'] + ".tmp"
    try:
        with open(cit_path, 'w') as f, open(ref_path, 'w') as g:
            for bib in bibcodes:
                for cite in get_citations_by_bibcode(app, bib):
                    # Reference network: citing bibcode -> software record.
                    g.write(str(cite) + "\t" + str(bib) + "\n")
                    # Citation network: software record -> citing bibcode.
                    f.write(str(bib) + "\t" + str(cite) + "\n")
        logger.info("Wrote files {} and {} to disk.".format(file_names['citations'], file_names['references']))
    except Exception as e:
        logger.exception("Failed to write files {} and {}.".format(cit_path, ref_path))
        # Chain the original exception so the root cause is preserved.
        raise Exception("Failed to write files {} and {}.".format(cit_path, ref_path)) from e

def _update_citation_target_curator_message_session(session, content, msg):
"""
Actual calls to database session for update_citation_target_metadata
Expand Down Expand Up @@ -146,6 +218,28 @@ def store_citation(app, citation_change, content_type, raw_metadata, parsed_meta
stored = True
return stored

def store_reader_data(app, reader_change, status):
    """
    Store a new reader entry in the DB.

    :param app: ADSCitationCapture celery app (provides session_scope)
    :param reader_change: dict with 'bibcode', 'reader' and 'timestamp' keys
    :param status: reader status value (e.g. 'REGISTERED')
    :return: True if the reader was stored, False if it already existed
    """
    stored = False
    with app.session_scope() as session:
        reads = Reader()
        reads.bibcode = reader_change['bibcode']
        reads.reader = reader_change['reader']
        # NOTE(review): assumed to already be a comparable timestamp value; the
        # .ToDatetime().replace(tzinfo=tzutc()) conversion was commented out -- confirm.
        reads.timestamp = reader_change['timestamp']#.ToDatetime().replace(tzinfo=tzutc())
        reads.status = status
        session.add(reads)
        try:
            session.commit()
        except IntegrityError as e:
            # Duplicate row already present (race condition?). Bug fix: use the
            # 'reader' key here -- the old code read 'readers', which does not
            # exist in reader_change and raised KeyError inside this handler.
            logger.error("Ignoring new reader information (bibcode '%s', reader '%s') because it already exists in the database when it is not supposed to (race condition?): '%s'", reader_change['bibcode'], reader_change['reader'], str(e))
        else:
            logger.info("Stored new reader (bibcode: '%s', reader '%s', timestamp '%s')", reader_change['bibcode'], reader_change['reader'], reader_change['timestamp'])
            stored = True
    return stored

def get_citation_target_count(app):
"""
Return the number of citation targets registered in the database
Expand Down Expand Up @@ -204,6 +298,27 @@ def get_citation_targets_by_bibcode(app, bibcodes, only_status='REGISTERED'):
records = _extract_key_citation_target_data(records_db, disable_filter=disable_filter)
return records

def get_citation_targets_by_alt_bibcode(app, alt_bibcodes, only_status='REGISTERED'):
    """
    Return a list of dicts with the requested citation targets, looked up by
    their *alternate* bibcodes (stored in parsed_cited_metadata['alternate_bibcode']).

    :param app: ADSCitationCapture celery app (provides session_scope)
    :param alt_bibcodes: iterable of alternate bibcodes to look up
    :param only_status: restrict to targets with this status; None disables the filter
    """
    with app.session_scope() as session:
        records_db = []
        for alt_bibcode in alt_bibcodes:
            # Build the JSONB containment lookup once, then apply the optional
            # status filter (the old code duplicated the whole query expression).
            query = session.query(CitationTarget).filter(CitationTarget.parsed_cited_metadata['alternate_bibcode'].contains([alt_bibcode]))
            if only_status:
                query = query.filter_by(status=only_status)
            record_db = query.first()
            if record_db:
                records_db.append(record_db)

        if only_status:
            disable_filter = only_status == 'DISCARDED'
        else:
            disable_filter = True
        records = _extract_key_citation_target_data(records_db, disable_filter=disable_filter)
    return records

def get_citation_targets_by_doi(app, dois, only_status='REGISTERED'):
"""
Return a list of dict with the requested citation targets based on their DOI
Expand All @@ -216,7 +331,6 @@ def get_citation_targets_by_doi(app, dois, only_status='REGISTERED'):
else:
records_db = session.query(CitationTarget).filter(CitationTarget.content.in_(dois)).all()
disable_filter = True

records = _extract_key_citation_target_data(records_db, disable_filter=disable_filter)
return records

Expand Down Expand Up @@ -331,6 +445,18 @@ def get_citations(app, citation_change):
citation_bibcodes = [r.citing for r in session.query(Citation).filter_by(content=citation_change.content, status="REGISTERED").all()]
return citation_bibcodes

def get_citation_target_readers(app, bibcode, alt_bibcodes):
    """
    Return all the Reader hashes for a given content.
    It will ignore DELETED and DISCARDED hashes.
    """
    with app.session_scope() as session:
        reader_hashes = []
        # Collect REGISTERED readers for the canonical bibcode first, then for
        # each alternate bibcode, preserving the original lookup order.
        for bib in [bibcode] + list(alt_bibcodes):
            rows = session.query(Reader).filter_by(bibcode=bib, status="REGISTERED").all()
            reader_hashes.extend(row.reader for row in rows)

        return reader_hashes

def generate_modified_metadata(parsed_metadata, curated_entry):
"""
modify parsed_metadata with any curated metadata. return results.
Expand Down Expand Up @@ -406,6 +532,27 @@ def mark_citation_as_deleted(app, citation_change):
logger.info("Ignoring citation deletion (citing '%s', content '%s' and timestamp '%s') because received timestamp is equal/older than timestamp in database", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString())
return marked_as_deleted, previous_status

def mark_reader_as_deleted(app, reader_change):
    """
    Update status to DELETED for a given reader.

    :param app: ADSCitationCapture celery app (provides session_scope)
    :param reader_change: dict with 'bibcode', 'reader' and 'timestamp' keys
    :return: tuple (marked_as_deleted, previous_status)
    """
    marked_as_deleted = False
    previous_status = None
    with app.session_scope() as session:
        reader = session.query(Reader).with_for_update().filter_by(bibcode=reader_change['bibcode'], reader=reader_change['reader']).first()
        if reader is None:
            # Robustness: no matching row -- the old code crashed here with
            # AttributeError when accessing reader.status on None.
            logger.error("Ignoring reader deletion (bibcode '%s', reader '%s') because no matching reader exists in the database", reader_change['bibcode'], reader_change['reader'])
            return marked_as_deleted, previous_status
        previous_status = reader.status
        # NOTE(review): timestamps are compared as strings; this assumes a
        # lexicographically sortable representation (e.g. ISO-8601) -- confirm.
        if str(reader.timestamp) < reader_change['timestamp']:
            reader.status = "DELETED"
            reader.timestamp = reader_change['timestamp']
            session.add(reader)
            session.commit()
            marked_as_deleted = True
            logger.info("Marked reader as deleted (bibcode '%s', reader '%s')", reader_change['bibcode'], reader_change['reader'])
        else:
            logger.info("Ignoring reader deletion (bibcode '%s', reader '%s' and timestamp '%s') because received timestamp is equal/older than timestamp in database", reader_change['bibcode'], reader_change['reader'], reader_change['timestamp'])
    return marked_as_deleted, previous_status

def mark_all_discarded_citations_as_registered(app, content):
"""
Update status to REGISTERED for all discarded citations of a given content
Expand Down
1 change: 1 addition & 0 deletions ADSCitationCapture/delta_computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ def _setup_schemas(self):
# Determine previous schema name if any
if len(filtered_existing_schema_names) > 0:
filtered_existing_schema_names.sort(reverse=True)
filtered_existing_schema_names = [schema_name for schema_name in filtered_existing_schema_names if "reader" not in schema_name]
self.previous_schema_name = filtered_existing_schema_names[0]

# Verify the data that is going to be imported is newer than the data already imported
Expand Down
14 changes: 7 additions & 7 deletions ADSCitationCapture/forward.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@


# =============================== FUNCTIONS ======================================= #
def build_record(app, citation_change, parsed_metadata, citations, db_versions, entry_date=None):
def build_record(app, citation_change, parsed_metadata, citations, db_versions, readers=[], entry_date=None):
if citation_change.content_type != CitationChangeContentType.doi:
raise Exception("Only DOI records can be forwarded to master")
raise ValueError("Only DOI records can be forwarded to master")
# Extract required values
bibcode = parsed_metadata.get('bibcode')
if bibcode is None:
raise Exception("Only records with a bibcode can be forwarded to master")
raise ValueError("Only records with a valid bibcode can be forwarded to master")
if entry_date is None:
entry_date = citation_change.timestamp.ToDatetime()
#Check if doi points to a concept record or to a specific version
Expand Down Expand Up @@ -122,7 +122,7 @@ def build_record(app, citation_change, parsed_metadata, citations, db_versions,
'pub_raw': source,
'pubdate': pubdate,
'pubnote': [],
'read_count': 0,
'read_count': len(readers),
'title': [title],
'publisher': source,
'version': version
Expand All @@ -148,11 +148,11 @@ def build_record(app, citation_change, parsed_metadata, citations, db_versions,
record_dict['property'].append('RELEASE')

record = DenormalizedRecord(**record_dict)
nonbib_record = _build_nonbib_record(app, citation_change, record, db_versions, status)
nonbib_record = _build_nonbib_record(app, citation_change, record, db_versions, status, readers=readers)
return record, nonbib_record


def _build_nonbib_record(app, citation_change, record, db_versions, status):
def _build_nonbib_record(app, citation_change, record, db_versions, status, readers=[]):
doi = citation_change.content
nonbib_record_dict = {
'status': status,
Expand All @@ -169,7 +169,7 @@ def _build_nonbib_record(app, citation_change, record, db_versions, status):
'ned_objects': [],
'norm_cites': 0, # log10-normalized count of citations computed on the classic site but not currently used
'read_count': record.read_count,
'readers': [],
'readers': readers,
'simbad_objects': [],
'total_link_counts': 0 # Only used for DATA and not for ESOURCES
}
Expand Down
30 changes: 30 additions & 0 deletions ADSCitationCapture/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

citation_content_type = ENUM('DOI', 'PID', 'URL', name='citation_content_type')
citation_change_type = ENUM('NEW', 'DELETED', 'UPDATED', name='citation_change_type')
# Reader rows are only ever created or deleted (no 'UPDATED' state).
reader_change_type = ENUM('NEW', 'DELETED', name='reader_change_type')
# Lifecycle status for Reader rows; mirrors the citation/target statuses
# but without 'EMITTABLE'.
reader_status_type = ENUM('REGISTERED', 'DELETED', 'DISCARDED', name='reader_status_type')
citation_status_type = ENUM('EMITTABLE','REGISTERED', 'DELETED', 'DISCARDED', name='citation_status_type')
target_status_type = ENUM('EMITTABLE','REGISTERED', 'DELETED', 'DISCARDED', name='target_status_type')

Expand All @@ -24,6 +26,34 @@ class RawCitation(Base):
bibcode = Column(String(19))
payload = Column(JSONB) # Binary, faster than JSON (requires postgres >9.4)

class ReaderData(Base):
    # Raw (bibcode, reader) pairs; presumably the staging table for imported
    # reader data before changes are computed -- confirm against delta_computation.
    __tablename__ = 'reader_data'
    __table_args__ = ({"schema": "public"})
    id = Column(Integer, primary_key=True)
    bibcode = Column(String())  # bibcode of the record that was read
    reader = Column(Text())  # opaque reader identifier (hash)

class Reader(Base):
    # Current reader state with lifecycle status; rows are written by
    # store_reader_data and updated by mark_reader_as_deleted.
    __tablename__ = 'readers'
    __table_args__ = ({"schema": "public"})
    id = Column(Integer, primary_key=True)
    bibcode = Column(String())  # bibcode of the record that was read
    reader = Column(Text())  # opaque reader identifier (hash)
    timestamp = Column(UTCDateTime)  # event timestamp used for ordering updates
    status = Column(reader_status_type)  # REGISTERED / DELETED / DISCARDED
    created = Column(UTCDateTime, default=get_date)
    updated = Column(UTCDateTime, onupdate=get_date)

class ReaderChanges(Base):
    # New-vs-previous reader pairs with a NEW/DELETED change type; presumably
    # populated by the delta computation between imports -- confirm.
    __tablename__ = 'reader_changes'
    __table_args__ = ({"schema": "public"})
    id = Column(Integer, primary_key=True)
    new_bibcode = Column(String())
    new_reader = Column(Text())
    previous_bibcode = Column(String())
    previous_reader = Column(Text())
    status = Column(reader_change_type)  # NEW or DELETED

class CitationChanges(Base):
__tablename__ = 'citation_changes'
__table_args__ = ({"schema": "public"})
Expand Down

0 comments on commit 5cda8d2

Please sign in to comment.