In [2]:
import importlib
import sqlite3
import pandas as pd
from lib import kpi

In [3]:
# Re-import lib.kpi so that edits made to the module on disk are picked up
# without restarting the kernel.
importlib.reload(kpi)

<module 'lib.kpi' from 'c:\\workspace\\GitHub\\arcsaef\\lib\\kpi.py'>

In [4]:
# --- Load the raw inputs through the project-local kpi helpers ------------------
# NOTE(review): what each helper returns is not visible in this notebook; the names
# below simply mirror how the positional results are used further down.
data         = kpi.load_data()
rpt_config   = kpi.get_rpt_args()
saef_library = kpi.get_saef_library()

# Give the positional pieces of `data` and `rpt_config` readable names.
bulk_responses, biblio, scopus = data[0], data[1], data[2]
rpt_year, organisations = rpt_config[0], rpt_config[1]
org_shortnames = list(organisations.keys())

# --- Build the people / project / library structures ----------------------------
responses_json = kpi.split_response(bulk_responses)
ppl_collection = kpi.person_construct(responses_json, scopus, rpt_year)
ppl_hash       = ppl_collection[1]
proj_saef      = kpi.project_construct(responses_json, ppl_collection[0])
buckets        = kpi.matched_library(saef_library, ppl_hash)
templates      = kpi.load_templates()

meta_bucket, bucket = buckets[0], buckets[1]

# People keyed by id, re-ordered by each person's 'LastName' value.
ppl_saef = dict(
    sorted(ppl_collection[0].items(), key=lambda item: item[1].get('LastName'))
)

# Projects whose Status is anything other than 'On hold'.
proj_saef_nohold = proj_saef[proj_saef.Status != 'On hold']

In [197]:
# 1. Remove rows that have no person id.
bucket = bucket[bucket['id_person'].notna()]

# 2. Make parsing more straightforward: fold the listed gender responses into one
#    reporting category.
for person in ppl_saef.values():
    if person['Gender'] in ('Non-binary/Gender diverse', 'Prefer not to say', 'Non-binary'):
        person['Gender'] = 'Unspecified diversity'

# 3. Remove leading/trailing whitespace from each bibliographic entry.
biblio['Biblio'] = biblio['Biblio'].str.strip()

# 4. Anyone whose Position is not one of the three student positions gets
#    StudentProjectTitle set to 'Not applicable'.
for person in ppl_saef.values():
    if person['Position'] not in ('PhD Student', 'Masters Student', 'Honours Student'):
        person['StudentProjectTitle'] = 'Not applicable'

In [198]:
# Keep only people whose State is 'Active' and whose Gender is not the empty string.
# The positions listed below are reported under the single label 'Researcher';
# every other position is carried over unchanged.
active_ppl_saef = {}
for person_id, person in ppl_saef.items():
    if person['State'] != 'Active' or person['Gender'] == '':
        continue

    is_researcher = person['Position'] in [
        'Research Professional', 'PhD Student', 'Chief Investigator',
        'Partner Investigator', 'Post Doc',
    ]
    active_ppl_saef[person_id] = {
        'FirstName': person['FirstName'],
        'LastName':  person['LastName'],
        'Gender':    person['Gender'],
        'Position':  'Researcher' if is_researcher else person['Position'],
    }


In [199]:
# One [FirstName, LastName, Gender, Position] row for every active person whose
# (already collapsed) position is 'Program Staff' or 'Researcher'.
positions = [
    [person['FirstName'], person['LastName'], person['Gender'], person['Position']]
    for person in active_ppl_saef.values()
    if person['Position'] in ['Program Staff', 'Researcher']
]

In [200]:
# Add one row per membership of an active person in one of the reported groups.
# The group name is stored in the 'Position' slot of the row.
#
# The People_Detail records are indexed by person id once, instead of rescanning the
# whole record list for every active person. Rows are still appended in the order of
# active_ppl_saef, then in record order, then in membership order.
#
# NOTE: this cell appends to the existing `positions` list, so running it twice without
# rebuilding `positions` first duplicates the group rows.
people_detail_by_id = {}
for record in responses_json['People_Detail']['data']:
    people_detail_by_id.setdefault(record['fieldData']['ID_Person'], []).append(record)

reported_groups = (
    'Governance Advisory Board',
    'International Science Advisory Panel',
    'Program Executive Group',
)

for person_id, person in active_ppl_saef.items():
    for record in people_detail_by_id.get(person_id, []):
        # An empty people_Groups portal simply yields no iterations.
        for membership in record['portalData']['people_Groups']:
            group = membership.get('people_Groups::Group')
            if group in reported_groups:
                positions.append(
                    [person['FirstName'], person['LastName'], person['Gender'], group]
                )


In [201]:
# Tabulate the collected rows; the column order matches the order used when each row was built.
df = pd.DataFrame(
    data=positions,
    columns=['FirstName', 'LastName', 'Gender', 'Position'],
)



In [202]:
# Head-count for every (Position, Gender) combination present in df.
(
    df
    .groupby(['Position', 'Gender'])['Gender']
    .count()
)

Position                              Gender               
Governance Advisory Board             Man                       4
                                      Woman                     4
International Science Advisory Panel  Man                       6
                                      Woman                     8
Program Executive Group               Man                       5
                                      Woman                     9
Program Staff                         Man                       2
                                      Unspecified diversity     1
                                      Woman                     7
Researcher                            Man                      54
                                      Unspecified diversity     3
                                      Woman                    63
Name: Gender, dtype: int64

In [4]:
# Scratch SQLite database held in memory, with one untyped table per logistics data set.
con = sqlite3.connect(":memory:")
cur = con.cursor()

logistics_ddl = "create table logistics (ID_Logistics, FieldSeason, PlanningStatus, Location, ReqPersonTotal, FinalPersonTotal, LeaveDate, ReturnDate, FinalPersonDays, FieldStatus)"
people_logistics_ddl = "create table people_logistics (IDf_Logistics, IDf_Person, FullName, LeaveDate, ReturnDate, PersonDays, CareerStage, Gender, Org)"
logistics_projects_ddl = "create table logistics_projects (IDf_Logistics, IDf_Project, IDf_Person, ProjectCode, ProjectPercentage)"

cur.execute(logistics_ddl)
cur.execute(people_logistics_ddl)
cur.execute(logistics_projects_ddl)

<sqlite3.Cursor at 0x17e318a95e0>

In [5]:
# Populate the logistics table: one row per Logistics_Detail record.
# The field names are listed in the same order as the table's columns.
logistics_fields = (
    'ID_Logistics', 'FieldSeason', 'PlanningStatus', 'Location',
    'ReqPersonTotal', 'FinalPersonTotal', 'LeaveDate', 'ReturnDate',
    'FinalPersonDays', 'FieldStatus',
)
for record in responses_json['Logistics_Detail']['data']:
    field_data = record['fieldData']
    cur.execute(
        "insert into logistics values (?,?,?,?,?,?,?,?,?,?)",
        tuple(field_data[name] for name in logistics_fields),
    )

In [8]:
# Populate the people_logistics table: one row per people_Logistics record.
# Source field names differ from the column names (e.g. DateFrom/DateTo fill
# LeaveDate/ReturnDate), so they are listed here in table-column order.
people_logistics_fields = (
    'IDf_logistics', 'IDf_Person', 'People::FullName', 'DateFrom', 'DateTo',
    'PersonDays', 'People::CareerStage', 'People::Gender',
    'Organisations 2::ShortName',
)
for record in responses_json['people_Logistics']['data']:
    field_data = record['fieldData']
    cur.execute(
        "insert into people_logistics values (?,?,?,?,?,?,?,?,?)",
        tuple(field_data[name] for name in people_logistics_fields),
    )

In [151]:
# An incorrect linkage in a layout meant that some researchers didn't have an Org affiliation.
# This has been fixed in FM, so this code section will not be required after the next FM data export.
# update_statement = "UPDATE people_Logistics SET Org=? WHERE IDf_Person=?"
# for person in ppl_saef:
#     cur.execute(update_statement, (ppl_saef.get(person)['Org'], person))
#     con.commit()


In [9]:
# Populate the logistics_projects table: one row per logistics_Projects record.
# The target columns are named explicitly so every value lands in the column it belongs to.
# The table's first two columns are IDf_Logistics then IDf_Project; the values were previously
# supplied as IDf_Project then IDf_Logistics, which stored the two ids in each other's column.
for record in responses_json['logistics_Projects']['data']:
    field_data = record['fieldData']
    cur.execute(
        "insert into logistics_projects "
        "(IDf_Logistics, IDf_Project, IDf_Person, ProjectCode, ProjectPercentage) "
        "values (?,?,?,?,?)",
        (field_data['IDf_Logistics'],
         field_data['IDf_Project'],
         field_data['IDf_Person'],
         field_data['Projects 3::ProjectCode'],
         field_data['ProjectPercentage']),
    )

In [12]:
# Gender breakdown of completed expeditions.
# Counts people_logistics rows joined to logistics entries whose FieldStatus is 'Completed',
# leaving out career stages that start with 'Field'. The second half of the UNION adds a '-'
# row holding the overall count, and "order by 2" sorts all rows by the Total column.
# Gender values are the ones inserted from the export's People::Gender field, so the
# 'Unspecified diversity' relabelling applied to ppl_saef does not apply here.
pd.read_sql_query("select a.Gender, count(a.Gender) as Total from people_logistics a \
                  join logistics b on a.IDf_Logistics = b.ID_Logistics \
                  where b.FieldStatus ='Completed' and a.CareerStage not like 'Field%'\
                  group by a.Gender \
                  union \
                  select '-', count(a.Gender) as Total from people_logistics a \
                  join logistics b on a.IDf_Logistics = b.ID_Logistics \
                  where b.FieldStatus ='Completed' and a.CareerStage not like 'Field%' \
                  order by 2" , con )


Unnamed: 0,Gender,Total
0,Non-binary,1
1,,4
2,Man,13
3,Woman,16
4,-,34


In [11]:
# Career stage breakdown of participants on completed logistics entries for the
# '2021/22' field season. Career stages that start with 'Field' are left out.
# The second half of the UNION adds a '-' row holding the overall count, and
# "order by 2" sorts all rows by the Total column.
# The season is hard-coded in both halves of the UNION; change both when reporting
# on a different season.
pd.read_sql_query("select a.CareerStage, count(a.CareerStage) as Total from people_logistics a \
            join logistics b on a.IDf_Logistics = b.ID_Logistics where b.FieldStatus ='Completed' \
            and  b.FieldSeason = '2021/22' and a.CareerStage not like 'Field%'\
            group by a.CareerStage \
            union \
            select '-', count(a.CareerStage) as Total from people_logistics a \
            join logistics b on a.IDf_Logistics = b.ID_Logistics where b.FieldStatus ='Completed' \
            and  FieldSeason = '2021/22' and a.CareerStage not like 'Field%' \
            order by 2", con )


Unnamed: 0,CareerStage,Total
0,Early Career (<5yrs post PhD),1
1,Senior Researcher,1
2,Student,1
3,Mid Career (5-15yrs post PhD),2
4,Professional,2
5,-,7


In [21]:
# Organisation breakdown of participants on completed logistics entries.
# Career stages that start with 'Field' are left out. Percent is each organisation's share
# of the filtered participants (the sub-select uses the same join and filter as the main
# query), not of every row in people_logistics.
# The '-' row carries the overall total. Its Percent is the numeric literal 100.0 rather
# than the text '100.0', so the Percent column contains numbers only.
pd.read_sql_query("select a.Org as Organisation, count(a.Org) as Total, \
                  round(100.0 * count(a.Org) / (select count(a.Org) from people_logistics a \
                  join logistics b on a.IDf_Logistics = b.ID_Logistics \
                  where b.FieldStatus ='Completed' and a.CareerStage not like 'Field%' ), 0) as Percent \
                  from people_logistics a \
                  join logistics b on a.IDf_Logistics = b.ID_Logistics \
                  where b.FieldStatus ='Completed' and a.CareerStage not like 'Field%' \
                  group by a.Org \
                  union \
                  select '-', count(a.Org), 100.0 from people_logistics a \
                  join logistics b on a.IDf_Logistics = b.ID_Logistics \
                  where b.FieldStatus ='Completed' and a.CareerStage not like 'Field%' \
                  order by 2", con )


Unnamed: 0,Organisation,Total,Percent
0,GA,1,3.0
1,La Trobe,2,6.0
2,AUT,3,9.0
3,UOW,6,18.0
4,QUT,9,26.0
5,Monash,13,38.0
6,-,34,100.0


In [185]:
# Close the SQLite connection. The database was opened with ":memory:", so its
# tables are discarded once the connection is closed.
con.close()