Get research output data from Zotero and prepare the data for FileMaker import.

This procedure is typically run 2 weeks after the 
finalisation of mid-year and annual KPI reporting.

In [79]:
import json
import sqlite3
import csv
import pandas as pd
import importlib
from lib import kpi

In [13]:
# Run this cell if kpi.py has been edited since it was imported. Reloading
# ensures that the changes are available in the current session.
import importlib
importlib.reload(kpi)

<module 'kpi' from '/Users/nxo/Workspace/saef_api/kpi.py'>

In [100]:
# One-off full ("ALL") load of the library export, used to fix a historical error.
with open('data/all/saef_library_All_postman_20240110.json', mode='r', encoding='utf8') as library_file:
    saef_library = json.load(library_file)

# Project keys: output of the FileMaker report "ProjectKey_Report".
with open('data/all/fm_projectKey.csv', newline='') as key_file:
    projectKeys = list(csv.reader(key_file))
projectKey_df = pd.DataFrame(projectKeys, columns=['id_project', 'alias', 'project'])
# Same rows with the columns reordered to id_project, project, alias.
projectKey = projectKey_df.loc[:, ['id_project', 'project', 'alias']]

# Build the people / project collections and the matched library buckets via
# the kpi helper module.
data = kpi.load_data()
rpt_config = kpi.get_rpt_args()
responses_json = kpi.split_response(data[0])
ppl_collection = kpi.person_construct(responses_json, data[2], rpt_config[0])
ppl_saef = ppl_collection[0]
ppl_hash = ppl_collection[1]
proj_saef = kpi.project_construct(responses_json, ppl_saef)
buckets = kpi.matched_library(saef_library, ppl_hash)
meta_bucket = buckets[0]
bucket = buckets[1]


In [81]:
# Replace the raw itemType codes in meta_bucket with their display labels.
# The mapping is applied one entry at a time, in the order listed.
ITEM_TYPE_LABELS = {
    'book': 'Book',
    'dataset': 'Dataset',
    'journalArticle': 'Journal Article',
    'presentation': 'Presentation',
    'plenary': 'Plenary',
    'report': 'Report',
    'radioBroadcast': 'Radio',
    'artwork': 'Artwork',
    'tvBroadcast': 'TV',
    'bookSection': 'Book Chapter',
    'conferencePaper': 'Conference Paper',
    'newspaperArticle': 'Newspaper',
    'videoRecording': 'Video',
    'blogPost': 'Blog',
    'magazineArticle': 'Magazine',
    'film': 'Film',
}
for raw_type, label in ITEM_TYPE_LABELS.items():
    meta_bucket.loc[meta_bucket['itemType'] == raw_type, 'itemType'] = label

In [133]:
# Outputs linked to exactly one project: the project field is non-empty and
# holds no ';' separator.
b1 = bucket.loc[bucket['project'] != '']
lists_several_projects = b1['project'].str.contains(';', na=True)
b2 = b1.loc[
    ~lists_several_projects,
    ['key', 'itemType', 'title', 'rights', 'pubyr', 'project'],
].drop_duplicates()

In [134]:
# Outputs linked to more than one project: the project field holds a
# ';'-separated list. Produce one row per (output, project) pair.
df = bucket.loc[
    bucket['project'].str.contains(';'),
    ['key', 'itemType', 'title', 'rights', 'pubyr', 'publicationTitle', 'project'],
].drop_duplicates()  # n=31

# Split each row's own project list and expand it to one row per project.
# (Looking the list up again by key would pick the first matching row's
# projects whenever a key occurs more than once, and rescans df for every row.)
b3 = df[['key', 'itemType', 'title', 'rights', 'pubyr', 'project']].reset_index(drop=True)
b3['project'] = b3['project'].str.split(';')
b3 = b3.explode('project').reset_index(drop=True)
# Remove the whitespace left around each project code by the split.
b3['project'] = b3['project'].str.strip()

In [142]:
# Build proj_outputs: stack the single- and multi-project rows, attach the
# project id by project code, and keep the proj_outputs table columns.
proj_basket = pd.concat([b2, b3])
proj_outputs = (
    proj_basket
    .merge(projectKey_df, how='left', on='project')
    .rename(columns={"key": "id_zotero"}, errors="raise")
    .loc[:, ['id_zotero', 'id_project', 'project', 'title']]
)

In [85]:
# Select the meta_bucket columns needed for the outputs table and rename them
# to the outputs table headings, ready to be appended to outputs.
outputs = (
    meta_bucket
    .loc[:, ['key', 'itemType', 'title', 'rights', 'pubyr', 'publicationTitle']]
    .rename(
        columns={"key": "id_zotero", "itemType": "item_type", "publicationTitle": "pub_title"},
        errors="raise",
    )
)

In [111]:
# Build the ppl_outputs rows from bucket: skip attachment and note items, then
# rename the key column to the ppl_outputs table heading.
is_attachment_or_note = bucket['itemType'].isin(['attachment', 'note'])
ppl_outputs = (
    bucket
    .loc[~is_attachment_or_note, ['key', 'id_person', 'name', 'title']]
    .rename(columns={"key": "id_zotero"}, errors="raise")
)

In [119]:
# Open the SQLite database used as the staging store for the FileMaker tables
# (sqlite3 creates the file if it does not exist yet).
con = sqlite3.connect("data/all/saef_library.db")
# Cursor used for the table-creation statements.
cur = con.cursor()

In [144]:
# These are the origin tables. They should only need to be re-created if the
# data is blown away in FM; running this against a database that already has
# them raises an error.
table_ddl = (
    "CREATE TABLE outputs(id_zotero, item_type, title, rights, pubyr, pub_title)",
    "CREATE TABLE ppl_outputs(id_zotero, id_person, name, title)",
    "CREATE TABLE proj_outputs(id_zotero, id_project, project, title)",
    "CREATE TABLE projectKey(id_project, project, alias)",
)
for ddl in table_ddl:
    cur.execute(ddl)
con.commit()

In [121]:
# projectKey is an output from FM. Loading it should be a one-off unless the
# data is blown away in FM; rows are appended, so re-running adds them again.
projectKey.to_sql('projectKey', con, if_exists='append', index=False)

27

In [145]:
# Populate the three staging tables from the frames built above:
#   outputs
#   ppl_outputs
#   proj_outputs
# Rows are appended to whatever the tables already hold, so running this cell
# twice writes every row twice.
outputs.to_sql('outputs', con, if_exists='append', index=False)
ppl_outputs.to_sql('ppl_outputs', con, if_exists='append', index=False)
proj_outputs.to_sql('proj_outputs', con, if_exists='append', index=False)


95

In [123]:
# Export the FM ready(ish) outputs table as a csv for ingest.
# The table is read back into its own name so that the in-memory `outputs`
# frame is not replaced by the full table contents; otherwise re-running the
# populate cell would append the whole table to itself.
outputs_fm = pd.read_sql_query("SELECT * FROM outputs", con)
outputs_fm.to_csv('data/output/outputs_20240110.csv', index=False)

In [126]:
# Export the FM ready(ish) ppl_outputs table (id columns only) as a csv for ingest.
# The table is read back into its own name so that the in-memory `ppl_outputs`
# frame is not replaced by the two-column table contents; otherwise re-running
# the populate cell would append the wrong frame.
ppl_outputs_fm = pd.read_sql_query("SELECT id_zotero, id_person FROM ppl_outputs", con)
ppl_outputs_fm.to_csv('data/output/ppl_outputs_20240110.csv', index=False)

In [146]:
# Export the FM ready(ish) proj_outputs table (id columns only) as a csv for ingest.
# The table is read back into its own name so that the in-memory `proj_outputs`
# frame keeps its id_zotero / id_project / project / title columns for later
# cells instead of being replaced by the two-column table contents.
proj_outputs_fm = pd.read_sql_query("SELECT id_zotero, id_project FROM proj_outputs", con)
proj_outputs_fm.to_csv('data/output/proj_outputs_20240110.csv', index=False)

In [143]:
# Future buckets should perform an update into outputs, proj_outputs & ppl_outputs of new rows only
# TODO: write update SQL logic
#
# Notes kept for reference. NOTE(review): these look like sqlite3 command-line
# dot-commands and queries against an fm_zotero table - confirm before reuse.
# Outputs list
# .header on
# .mode csv
# .once saef_api/data/outputs_20231005.csv
# SELECT id_zotero, item_type, title, LOWER(rights) AS embargo, pubyr, pub_title FROM fm_zotero;
# .once saef_api/data/ppl_outputs_20231005.csv
# SELECT id_person, id_zotero FROM fm_zotero;
# .once saef_api/data/projects_Outputs.csv
# SELECT DISTINCT project, id_project, id_zotero from fm_zotero LEFT JOIN fm_projectKey ON fm_zotero.project = fm_projectKey.project_code WHERE project != '';

# Manual follow-up: tidy the data in Excel, save as .xlsx, then import into FM.
# Display proj_outputs for a visual check.
proj_outputs

Unnamed: 0,id_zotero,id_project,project,title
0,RZKQ34ZR,785265CB-72B3-8C4B-9668-823F1552C299,T2_P015,#GlobalCollembola - full sample-level database
1,74GEEA3L,785265CB-72B3-8C4B-9668-823F1552C299,T2_P015,Global fine-resolution data on springtail abundance and community structure
2,NL9JCFTG,785265CB-72B3-8C4B-9668-823F1552C299,T2_P015,Globally invariant metabolism but density-diversity mismatch in springtails
3,HRYIP6ZP,CB59459A-9C9C-454E-A679-198B8392F952,T2_P023,"A Green Fingerprint of Antarctica: Drones, Hyperspectral Imaging, and Machine Learning for Moss and Lichen Classification"
4,DRGVXIXN,FFA7D2E5-8D4A-FB43-99E4-9187862B9BB8,T2_P022,"Environmental effects of stratospheric ozone depletion, UV radiation, and interactions with climate change: UNEP Environmental Effects Assessment Panel, Update 2022"
...,...,...,...,...
90,CBGS97PZ,6AF09846-2EEB-9C4A-BD43-917BD313518E,T3_P031,"Antarctic Climate Change and the Environment: A Decadal Synopsis and Recommendations for Action. Scientific Committee on Antarctic Research, Cambridge, United Kingdom."
91,TRTV5IVA,ACEC6F1C-FC21-2E4A-ADEF-DC4C89AD77CB,T2_P016,Cross-Chapter Paper 6: Polar Regions. IPCC WGII Sixth Assessment Report
92,TRTV5IVA,FFA7D2E5-8D4A-FB43-99E4-9187862B9BB8,T2_P022,Cross-Chapter Paper 6: Polar Regions. IPCC WGII Sixth Assessment Report
93,A2F6H38Q,ACEC6F1C-FC21-2E4A-ADEF-DC4C89AD77CB,T2_P016,Basking in the sun: how mosses photosynthesise and survive in Antarctica


In [147]:
# Close the SQLite connection opened earlier in the notebook.
con.close()