# Completeness (incl. new/del)

This shows the month-on-month change in the number of records, per collecting department, falling into each percentage band of completeness for each conceptual grouping of record fields. This 
includes new or deleted records, which may obscure field-level changes made to existing records.

In [1]:
import glob
import pandas as pd

# Load the previous month's snapshot (field.h5) from HDF5; the current month
# is loaded in the next cell.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
last_month_df = pd.read_hdf(
    path_or_buf='/home/richard/Git/bbk-mqaf/data/prev/field.h5',
)

In [3]:
# Load the latest month's snapshot (field.h5) from HDF5.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
cur_month_df = pd.read_hdf(
    path_or_buf='/home/richard/Git/bbk-mqaf/data/latest/field.h5',
)

In [None]:
# Calculate completeness for each department, grouped into 10% ranges

In [4]:
# Narrow both monthly frames to the department code column plus the
# completeness columns. The pattern is a regular expression searched for
# anywhere in the column name, so the trailing `*` only makes the final "s"
# optional; it is not a glob wildcard.
cur_month_df = cur_month_df.filter(
    axis='columns',
    regex='collectionCode|completeness*',
)
last_month_df = last_month_df.filter(
    axis='columns',
    regex='collectionCode|completeness*',
)

In [5]:
import numpy 

# Distinct, non-null department codes seen in the current month, in order of
# first appearance (the leading space in the column name is deliberate).
# TODO should ensure same for both in case of department changes
depts = pd.unique(cur_month_df[' collectionCode'].dropna())

In [6]:
import re
def _drop_completeness_prefix(column_name):
    """Return the column name with every 'completeness:' occurrence removed."""
    return re.sub('completeness:', '', column_name)


# Shorter column names make for more readable chart axis labels later on.
tidied_last_month_df = last_month_df.rename(columns=_drop_completeness_prefix)
tidied_cur_month_df = cur_month_df.rename(columns=_drop_completeness_prefix)

In [None]:
## Field Completeness Changes

In [2]:
# Lookup from collection (department) code to the display name used as the
# chart title for that department.
deptNames = {
    "APS": "Apsley House",
    "CER": "Ceramics Collection",
    "CIRC": "Circulation Department (1909 – 1977)",
    "DAD": "Design, Architecture and Digital Department",
    "EAS": "East Asia Collection",
    "EXH": "Exhibitions Department",
    "FoB": "Fabric of the Building",
    "FWK": "Furniture and Woodwork Collection",
    "MES": "Middle East Section",
    "MET": "Metalwork Collection",
    "MoC": "Museum of Childhood",
    "NAL": "National Art Library",
    "PDP": "Prints, Drawings & Paintings Collection",
    "RPS": "Royal Photographic Society Collection",
    "SCP": "Sculpture Collection",
    "SSEA": "South & South East Asia Collection",
    "T&F": "Textiles and Fashion Collection",
    "T&P": "Theatre and Performance Collection",
    "VAA": "V&A Archive Collection",
    "WED": "Wedgwood Collection",
    "AAD": "Archive of Art and Design",
    "DOP": "Department of Photography",
}

In [8]:
import altair as alt
import numpy as np

# Build one bubble chart per collecting department. For every completeness
# concept (column) the chart shows how many more (green) or fewer (red)
# records fall into each 10% completeness band this month than last month.
# The charts are laid out two per row and displayed as the cell output.

# Band edges passed to pd.cut, and the label shown for each band.
ranges = [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
range_labels = ["0-10%","10-20%","20-30%","30-40%","40-50%","50-60%","60-70%","70-80%","80-90%","90-100%"]

# The leading space matches the column name used throughout this notebook.
CODE_COLUMN = ' collectionCode'
# Departments with fewer current records than this are not charted.
MIN_DEPT_RECORDS = 10
CHARTS_PER_ROW = 2


def _count_by_range(scores):
    """Return the number of non-null scores falling in each completeness band.

    Values outside the band edges (or NaN) are assigned to no band by pd.cut
    and are therefore not counted.
    """
    bands = pd.cut(scores, ranges, labels=range_labels, include_lowest=True)
    # observed=False keeps empty bands (count 0), so both months always share
    # the same index and can be subtracted band by band.
    return scores.groupby(bands, observed=False).count()


charts = []

for dept_code in depts:
    dept_cur_stats_df = tidied_cur_month_df.loc[tidied_cur_month_df[CODE_COLUMN] == dept_code]
    dept_last_stats_df = tidied_last_month_df.loc[tidied_last_month_df[CODE_COLUMN] == dept_code]

    if len(dept_cur_stats_df) < MIN_DEPT_RECORDS:
        continue

    # Pair the completeness columns of the two months by name rather than by
    # position, so a differing column order cannot silently misalign them.
    concept_columns = [
        column for column in dept_cur_stats_df.columns
        if column != CODE_COLUMN and column in dept_last_stats_df.columns
    ]

    # Collect the long-format rows in a plain list and build the frame once;
    # DataFrame.append (used previously) no longer exists in pandas 2.x.
    change_records = []
    for concept in concept_columns:
        cur_dept_col_counts = _count_by_range(dept_cur_stats_df[concept])
        last_dept_col_counts = _count_by_range(dept_last_stats_df[concept])
        diff_dept_col_counts = cur_dept_col_counts.subtract(last_dept_col_counts, fill_value=0)
        change_records.extend(
            {'Concept': concept, 'Percentage': str(band), 'Change': change}
            for band, change in diff_dept_col_counts.items()
        )

    dept_diff_rows_melt_df = pd.DataFrame(change_records, columns=['Concept', 'Percentage', 'Change'])

    # Replace zero values with NaN so altair doesn't show them
    dept_diff_rows_melt_df['Change'] = dept_diff_rows_melt_df['Change'].mask(
        dept_diff_rows_melt_df['Change'] == 0
    )

    # Fall back to the raw code when a department has no display name yet,
    # instead of aborting the whole notebook with a KeyError.
    dept_title = str(deptNames.get(dept_code, dept_code))

    chart = alt.Chart(dept_diff_rows_melt_df).mark_circle().encode(
        alt.X('Percentage:O'),
        alt.Y('Concept:O'),
        alt.Size('Quantity:Q'),
        alt.Color('Direction:N', scale=alt.Scale(domain=['Fewer Records', 'More Records'], range=['red', 'green'])),
        tooltip=['Change']
    ).transform_calculate(
        Quantity='abs(datum.Change)',
        Direction='if(datum.Change < 0, "Fewer Records", "More Records")'
    ).properties(title=dept_title, width=300)

    charts.append(chart)

# Lay the charts out CHARTS_PER_ROW to a row; slicing never yields an empty
# row, so no empty hconcat is produced when the charts divide evenly.
hcharts = [
    alt.hconcat(*charts[start:start + CHARTS_PER_ROW])
    for start in range(0, len(charts), CHARTS_PER_ROW)
]

alt.vconcat(*hcharts)