Skip to content

Commit

Permalink
Calculating Metrics for Software Records (#52)
Browse files Browse the repository at this point in the history
Changes to CitationCapture that allow it to produce the flat files necessary for ADSDataPipeline to generate metrics for software records:
- Canonical Bibcodes
- Citations
- References
- Author Names
Additional changes allow CitationCapture to import and include reader data in the nonbib record.
  • Loading branch information
tjacovich committed Sep 20, 2022
1 parent a24da27 commit 5cda8d2
Show file tree
Hide file tree
Showing 11 changed files with 977 additions and 80 deletions.
151 changes: 149 additions & 2 deletions ADSCitationCapture/db.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import os
from typing import OrderedDict
from psycopg2 import IntegrityError
from dateutil.tz import tzutc
from ADSCitationCapture.models import Citation, CitationTarget, Event
from ADSCitationCapture.models import Citation, CitationTarget, Event, Reader
from ADSCitationCapture import doi
from adsmsg import CitationChange
import datetime
from adsputils import setup_logging

# ============================= INITIALIZATION ==================================== #
Expand All @@ -18,6 +20,13 @@
level=config.get('LOGGING_LEVEL', 'INFO'),
attach_stdout=config.get('LOG_STDOUT', False))

# Output flat files consumed by ADSDataPipeline to compute metrics for
# software records: canonical bibcodes, the two network files, and the
# facet author list.
file_names = OrderedDict(
    [
        ('bibcode', proj_home + '/logs/output/bibcodes_CC.can.list'),
        ('citations', proj_home + '/logs/output/citations_CC.list'),
        ('references', proj_home + '/logs/output/references_CC.list'),
        ('authors', proj_home + '/logs/output/facet_authors_CC.list'),
    ]
)


# =============================== FUNCTIONS ======================================= #
def store_event(app, data):
Expand Down Expand Up @@ -101,6 +110,69 @@ def update_citation_target_metadata(app, content, raw_metadata, parsed_metadata,
metadata_updated = _update_citation_target_metadata_session(session, content, raw_metadata, parsed_metadata, curated_metadata, status=status, bibcode=bibcode, associated=associated)
return metadata_updated

def write_citation_target_data(app, only_status=None):
    """
    Write the flat files that ADSDataPipeline needs to generate metrics for
    software records:
      - canonical bibcodes file
      - citation network file
      - reference network file
      - facet authors file

    Each file is written to '<path>.tmp' first and then copied over the final
    path, so consumers never observe a partially written file.

    :param app: ADSCitationCapture celery app (provides session_scope)
    :param only_status: if given, restrict to citation targets with this status
    """
    with app.session_scope() as session:
        if only_status:
            records_db = session.query(CitationTarget).filter_by(status=only_status).all()
            disable_filter = only_status in ['DISCARDED', 'EMITTABLE']
        else:
            records_db = session.query(CitationTarget).all()
            disable_filter = True
        bibcodes = [r.bibcode for r in records_db]
        records = _extract_key_citation_target_data(records_db, disable_filter=disable_filter)
        # Write canonical bibcodes to file.
        with open(file_names['bibcode'] + ".tmp", 'w') as f:
            f.write("\n".join(bibcodes))
        logger.info("Writing Citation/Reference Network Files.")
        _write_key_citation_reference_data(app, bibcodes)
        logger.info("Writing author data for {} records".format(len(records)))
        _write_key_citation_target_authors(app, records)
        # Promote each .tmp file to its final location.
        # Bug fix: iterate the file *paths* (dict values); the old code used the
        # dict keys, so it tried to copy e.g. 'bibcode.tmp' instead of the real
        # output path (and logged the wrong names too).
        for path in file_names.values():
            os.system('cp {} {}'.format(path + ".tmp", path))
            logger.debug("Copied {}.tmp to {}".format(path, path))

def _write_key_citation_target_authors(app, records):
    """
    Write the facet-author flat file: one line per record containing the
    bibcode followed by its tab-separated author list.

    :param app: ADSCitationCapture celery app
    :param records: list of dicts as produced by _extract_key_citation_target_data
    :raises Exception: if the file cannot be written (original error is chained)
    """
    out_path = file_names['authors'] + ".tmp"
    try:
        with open(out_path, 'w') as f:
            for rec in records:
                parsed_metadata = get_citation_target_metadata(app, rec['content']).get('parsed', {})
                if parsed_metadata:
                    f.write(str(rec['bibcode']) + "\t" + "\t".join(parsed_metadata.get('authors', '')) + "\n")

        # Log the actual path (the old code formatted the literal 'authors').
        logger.info("Wrote file {} to disk.".format(out_path))
    except Exception as e:
        logger.exception("Failed to write file {}.".format(out_path))
        # Chain the original exception so the root cause is preserved.
        raise Exception("Failed to write file {}.".format(out_path)) from e

def _write_key_citation_reference_data(app, bibcodes):
    """
    Write the two network flat files:
      - Citation Network File: X cites software record
      - Reference Network File: software record is cited by X
    Both are needed to integrate software records into classic record metrics.

    :param app: ADSCitationCapture celery app
    :param bibcodes: canonical bibcodes of the software records
    :raises Exception: if either file cannot be written (original error is chained)
    """
    cit_path = file_names['citations'] + ".tmp"
    ref_path = file_names['references'] + ".tmp"
    try:
        with open(cit_path, 'w') as f, open(ref_path, 'w') as g:
            for bib in bibcodes:
                for cite in get_citations_by_bibcode(app, bib):
                    # Reference network: citing bibcode -> software record.
                    g.write(str(cite) + "\t" + str(bib) + "\n")
                    # Citation network: software record -> citing bibcode.
                    f.write(str(bib) + "\t" + str(cite) + "\n")
        logger.info("Wrote files {} and {} to disk.".format(file_names['citations'], file_names['references']))
    except Exception as e:
        logger.exception("Failed to write files {} and {}.".format(cit_path, ref_path))
        # Chain the original exception so the root cause is preserved.
        raise Exception("Failed to write files {} and {}.".format(cit_path, ref_path)) from e

def _update_citation_target_curator_message_session(session, content, msg):
"""
Actual calls to database session for update_citation_target_metadata
Expand Down Expand Up @@ -146,6 +218,28 @@ def store_citation(app, citation_change, content_type, raw_metadata, parsed_meta
stored = True
return stored

def store_reader_data(app, reader_change, status):
    """
    Store a new reader entry in the DB.

    :param app: ADSCitationCapture celery app (provides session_scope)
    :param reader_change: dict with 'bibcode', 'reader' and 'timestamp' keys
    :param status: reader status value (e.g. 'REGISTERED')
    :return: True if the reader was stored, False if it already existed
    """
    stored = False
    with app.session_scope() as session:
        reads = Reader()
        reads.bibcode = reader_change['bibcode']
        reads.reader = reader_change['reader']
        # NOTE(review): assumed to already be a comparable timestamp value; the
        # .ToDatetime().replace(tzinfo=tzutc()) conversion was commented out -- confirm.
        reads.timestamp = reader_change['timestamp']#.ToDatetime().replace(tzinfo=tzutc())
        reads.status = status
        session.add(reads)
        try:
            session.commit()
        except IntegrityError as e:
            # Duplicate row already present (race condition?). Bug fix: use the
            # 'reader' key here -- the old code read 'readers', which does not
            # exist in reader_change and raised KeyError inside this handler.
            logger.error("Ignoring new reader information (bibcode '%s', reader '%s') because it already exists in the database when it is not supposed to (race condition?): '%s'", reader_change['bibcode'], reader_change['reader'], str(e))
        else:
            logger.info("Stored new reader (bibcode: '%s', reader '%s', timestamp '%s')", reader_change['bibcode'], reader_change['reader'], reader_change['timestamp'])
            stored = True
    return stored

def get_citation_target_count(app):
"""
Return the number of citation targets registered in the database
Expand Down Expand Up @@ -204,6 +298,27 @@ def get_citation_targets_by_bibcode(app, bibcodes, only_status='REGISTERED'):
records = _extract_key_citation_target_data(records_db, disable_filter=disable_filter)
return records

def get_citation_targets_by_alt_bibcode(app, alt_bibcodes, only_status='REGISTERED'):
    """
    Return a list of dicts with the requested citation targets, looked up by
    their *alternate* bibcodes (stored in parsed_cited_metadata['alternate_bibcode']).

    :param app: ADSCitationCapture celery app (provides session_scope)
    :param alt_bibcodes: iterable of alternate bibcodes to look up
    :param only_status: restrict to targets with this status; None disables the filter
    """
    with app.session_scope() as session:
        records_db = []
        for alt_bibcode in alt_bibcodes:
            # Build the JSONB containment lookup once, then apply the optional
            # status filter (the old code duplicated the whole query expression).
            query = session.query(CitationTarget).filter(CitationTarget.parsed_cited_metadata['alternate_bibcode'].contains([alt_bibcode]))
            if only_status:
                query = query.filter_by(status=only_status)
            record_db = query.first()
            if record_db:
                records_db.append(record_db)

        if only_status:
            disable_filter = only_status == 'DISCARDED'
        else:
            disable_filter = True
        records = _extract_key_citation_target_data(records_db, disable_filter=disable_filter)
    return records

def get_citation_targets_by_doi(app, dois, only_status='REGISTERED'):
"""
Return a list of dict with the requested citation targets based on their DOI
Expand All @@ -216,7 +331,6 @@ def get_citation_targets_by_doi(app, dois, only_status='REGISTERED'):
else:
records_db = session.query(CitationTarget).filter(CitationTarget.content.in_(dois)).all()
disable_filter = True

records = _extract_key_citation_target_data(records_db, disable_filter=disable_filter)
return records

Expand Down Expand Up @@ -331,6 +445,18 @@ def get_citations(app, citation_change):
citation_bibcodes = [r.citing for r in session.query(Citation).filter_by(content=citation_change.content, status="REGISTERED").all()]
return citation_bibcodes

def get_citation_target_readers(app, bibcode, alt_bibcodes):
    """
    Return all the Reader hashes for a given content.
    It will ignore DELETED and DISCARDED hashes.
    """
    with app.session_scope() as session:
        reader_hashes = []
        # Collect REGISTERED readers for the canonical bibcode first, then for
        # each alternate bibcode, preserving the original lookup order.
        for bib in [bibcode] + list(alt_bibcodes):
            rows = session.query(Reader).filter_by(bibcode=bib, status="REGISTERED").all()
            reader_hashes.extend(row.reader for row in rows)

        return reader_hashes

def generate_modified_metadata(parsed_metadata, curated_entry):
"""
modify parsed_metadata with any curated metadata. return results.
Expand Down Expand Up @@ -406,6 +532,27 @@ def mark_citation_as_deleted(app, citation_change):
logger.info("Ignoring citation deletion (citing '%s', content '%s' and timestamp '%s') because received timestamp is equal/older than timestamp in database", citation_change.citing, citation_change.content, citation_change.timestamp.ToJsonString())
return marked_as_deleted, previous_status

def mark_reader_as_deleted(app, reader_change):
    """
    Update status to DELETED for a given reader.

    :param app: ADSCitationCapture celery app (provides session_scope)
    :param reader_change: dict with 'bibcode', 'reader' and 'timestamp' keys
    :return: tuple (marked_as_deleted, previous_status)
    """
    marked_as_deleted = False
    previous_status = None
    with app.session_scope() as session:
        reader = session.query(Reader).with_for_update().filter_by(bibcode=reader_change['bibcode'], reader=reader_change['reader']).first()
        if reader is None:
            # Robustness: no matching row -- the old code crashed here with
            # AttributeError when accessing reader.status on None.
            logger.error("Ignoring reader deletion (bibcode '%s', reader '%s') because no matching reader exists in the database", reader_change['bibcode'], reader_change['reader'])
            return marked_as_deleted, previous_status
        previous_status = reader.status
        # NOTE(review): timestamps are compared as strings; this assumes a
        # lexicographically sortable representation (e.g. ISO-8601) -- confirm.
        if str(reader.timestamp) < reader_change['timestamp']:
            reader.status = "DELETED"
            reader.timestamp = reader_change['timestamp']
            session.add(reader)
            session.commit()
            marked_as_deleted = True
            logger.info("Marked reader as deleted (bibcode '%s', reader '%s')", reader_change['bibcode'], reader_change['reader'])
        else:
            logger.info("Ignoring reader deletion (bibcode '%s', reader '%s' and timestamp '%s') because received timestamp is equal/older than timestamp in database", reader_change['bibcode'], reader_change['reader'], reader_change['timestamp'])
    return marked_as_deleted, previous_status

def mark_all_discarded_citations_as_registered(app, content):
"""
Update status to REGISTERED for all discarded citations of a given content
Expand Down
1 change: 1 addition & 0 deletions ADSCitationCapture/delta_computation.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ def _setup_schemas(self):
# Determine previous schema name if any
if len(filtered_existing_schema_names) > 0:
filtered_existing_schema_names.sort(reverse=True)
filtered_existing_schema_names = [schema_name for schema_name in filtered_existing_schema_names if "reader" not in schema_name]
self.previous_schema_name = filtered_existing_schema_names[0]

# Verify the data that is going to be imported is newer than the data already imported
Expand Down
14 changes: 7 additions & 7 deletions ADSCitationCapture/forward.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@


# =============================== FUNCTIONS ======================================= #
def build_record(app, citation_change, parsed_metadata, citations, db_versions, entry_date=None):
def build_record(app, citation_change, parsed_metadata, citations, db_versions, readers=[], entry_date=None):
if citation_change.content_type != CitationChangeContentType.doi:
raise Exception("Only DOI records can be forwarded to master")
raise ValueError("Only DOI records can be forwarded to master")
# Extract required values
bibcode = parsed_metadata.get('bibcode')
if bibcode is None:
raise Exception("Only records with a bibcode can be forwarded to master")
raise ValueError("Only records with a valid bibcode can be forwarded to master")
if entry_date is None:
entry_date = citation_change.timestamp.ToDatetime()
#Check if doi points to a concept record or to a specific version
Expand Down Expand Up @@ -122,7 +122,7 @@ def build_record(app, citation_change, parsed_metadata, citations, db_versions,
'pub_raw': source,
'pubdate': pubdate,
'pubnote': [],
'read_count': 0,
'read_count': len(readers),
'title': [title],
'publisher': source,
'version': version
Expand All @@ -148,11 +148,11 @@ def build_record(app, citation_change, parsed_metadata, citations, db_versions,
record_dict['property'].append('RELEASE')

record = DenormalizedRecord(**record_dict)
nonbib_record = _build_nonbib_record(app, citation_change, record, db_versions, status)
nonbib_record = _build_nonbib_record(app, citation_change, record, db_versions, status, readers=readers)
return record, nonbib_record


def _build_nonbib_record(app, citation_change, record, db_versions, status):
def _build_nonbib_record(app, citation_change, record, db_versions, status, readers=[]):
doi = citation_change.content
nonbib_record_dict = {
'status': status,
Expand All @@ -169,7 +169,7 @@ def _build_nonbib_record(app, citation_change, record, db_versions, status):
'ned_objects': [],
'norm_cites': 0, # log10-normalized count of citations computed on the classic site but not currently used
'read_count': record.read_count,
'readers': [],
'readers': readers,
'simbad_objects': [],
'total_link_counts': 0 # Only used for DATA and not for ESOURCES
}
Expand Down
30 changes: 30 additions & 0 deletions ADSCitationCapture/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

citation_content_type = ENUM('DOI', 'PID', 'URL', name='citation_content_type')
citation_change_type = ENUM('NEW', 'DELETED', 'UPDATED', name='citation_change_type')
# Reader rows are only ever created or deleted (no 'UPDATED' state).
reader_change_type = ENUM('NEW', 'DELETED', name='reader_change_type')
# Lifecycle status for Reader rows; mirrors the citation/target statuses
# but without 'EMITTABLE'.
reader_status_type = ENUM('REGISTERED', 'DELETED', 'DISCARDED', name='reader_status_type')
citation_status_type = ENUM('EMITTABLE','REGISTERED', 'DELETED', 'DISCARDED', name='citation_status_type')
target_status_type = ENUM('EMITTABLE','REGISTERED', 'DELETED', 'DISCARDED', name='target_status_type')

Expand All @@ -24,6 +26,34 @@ class RawCitation(Base):
bibcode = Column(String(19))
payload = Column(JSONB) # Binary, faster than JSON (requires postgres >9.4)

class ReaderData(Base):
    # Raw (bibcode, reader) pairs; presumably the staging table for imported
    # reader data before changes are computed -- confirm against delta_computation.
    __tablename__ = 'reader_data'
    __table_args__ = ({"schema": "public"})
    id = Column(Integer, primary_key=True)
    bibcode = Column(String())  # bibcode of the record that was read
    reader = Column(Text())  # opaque reader identifier (hash)

class Reader(Base):
    # Current reader state with lifecycle status; rows are written by
    # store_reader_data and updated by mark_reader_as_deleted.
    __tablename__ = 'readers'
    __table_args__ = ({"schema": "public"})
    id = Column(Integer, primary_key=True)
    bibcode = Column(String())  # bibcode of the record that was read
    reader = Column(Text())  # opaque reader identifier (hash)
    timestamp = Column(UTCDateTime)  # event timestamp used for ordering updates
    status = Column(reader_status_type)  # REGISTERED / DELETED / DISCARDED
    created = Column(UTCDateTime, default=get_date)
    updated = Column(UTCDateTime, onupdate=get_date)

class ReaderChanges(Base):
    # New-vs-previous reader pairs with a NEW/DELETED change type; presumably
    # populated by the delta computation between imports -- confirm.
    __tablename__ = 'reader_changes'
    __table_args__ = ({"schema": "public"})
    id = Column(Integer, primary_key=True)
    new_bibcode = Column(String())
    new_reader = Column(Text())
    previous_bibcode = Column(String())
    previous_reader = Column(Text())
    status = Column(reader_change_type)  # NEW or DELETED

class CitationChanges(Base):
__tablename__ = 'citation_changes'
__table_args__ = ({"schema": "public"})
Expand Down

0 comments on commit 5cda8d2

Please sign in to comment.