# Retrieval (including new and deleted records)

In [1]:
import glob
import pandas as pd

# Load the most recent snapshot of the retrieval metrics.
latest_retrieval_path = '/home/richard/Git/bbk-mqaf/data/latest/retrieval.h5'
cur_objects_df = pd.read_hdf(latest_retrieval_path)

In [2]:
# Load the previous snapshot of the retrieval metrics for comparison.
previous_retrieval_path = '/home/richard/Git/bbk-mqaf/data/prev/retrieval.h5'
prev_objects_df = pd.read_hdf(previous_retrieval_path)

In [3]:
# Reduce both snapshots to just the columns we need: the department code and
# the completeness scores.
# DataFrame.filter matches the pattern with re.search, so a plain substring
# alternation selects the columns; no glob-style wildcard is needed.
wanted_columns_pattern = r'collectionCode|completeness'

cur_complete_df = cur_objects_df.filter(regex=wanted_columns_pattern, axis=1)
prev_complete_df = prev_objects_df.filter(regex=wanted_columns_pattern, axis=1)

In [4]:
# Distinct department codes present in the latest snapshot.
# NOTE: the column label really does start with a space.
collection_codes = cur_complete_df[' collectionCode']
depts = collection_codes.unique()

In [5]:
import re
def _strip_completeness_marker(column_name):
    """Return *column_name* with every 'completeness:' occurrence removed."""
    return re.sub('completeness:', '', column_name)


# Apply the same column tidy-up to both snapshots so their labels line up.
tidied_cur_df = cur_complete_df.rename(columns=_strip_completeness_marker)
tidied_prev_df = prev_complete_df.rename(columns=_strip_completeness_marker)

In [None]:
# Display names for the department / collection codes, used as chart titles.
deptNames = {
    "APS": "Apsley House",
    "CER": "Ceramics Collection",
    "CIRC": "Circulation Department (1909 – 1977)",
    "DAD": "Design, Architecture and Digital Department",
    "EAS": "East Asia Collection",
    "EXH": "Exhibitions Department",
    "FoB": "Fabric of the Building",
    "FWK": "Furniture and Woodwork Collection",
    "MES": "Middle East Section",
    "MET": "Metalwork Collection",
    "MoC": "Museum of Childhood",
    "NAL": "National Art Library",
    "PDP": "Prints, Drawings & Paintings Collection",
    "RPS": "Royal Photographic Society Collection",
    "SCP": "Sculpture Collection",
    "SSEA": "South & South East Asia Collection",
    "T&F": "Textiles and Fashion Collection",
    "T&P": "Theatre and Performance Collection",
    "VAA": "V&A Archive Collection",
    "WED": "Wedgwood Collection",
    "AAD": "Archive of Art and Design",
    "DOP": "Department of Photography",
}

In [6]:
import altair as alt
import numpy as np

# Build one bubble chart per department showing, for every completeness
# column, how many records moved into or out of each 10% completeness band
# between the previous and the latest snapshot.

# Bin edges and labels for the ten completeness bands (0-10% ... 90-100%).
ranges = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
band_labels = ["0-10%", "10-20%", "20-30%", "30-40%", "40-50%",
               "50-60%", "60-70%", "70-80%", "80-90%", "90-100%"]
code_column = ' collectionCode'  # the label really does start with a space
min_records = 10                 # departments with fewer rows are skipped
charts_per_row = 2


def _band_counts(scores):
    """Return how many values of *scores* fall into each completeness band."""
    bands = pd.cut(scores, ranges, labels=band_labels, include_lowest=True)
    # observed=False keeps empty bands, so every result has all ten entries.
    return scores.groupby(bands, observed=False).count()


depts = tidied_cur_df[code_column].unique()
charts = []
for dept_code in depts:
    dept_cur_stats_df = tidied_cur_df.loc[tidied_cur_df[code_column] == dept_code]
    dept_prev_stats_df = tidied_prev_df.loc[tidied_prev_df[code_column] == dept_code]

    if len(dept_cur_stats_df) < min_records:
        # Ignore EXH and other erratic departments
        continue

    # One Series of per-band differences for each completeness column.
    # Columns are matched by name so that a differing column order between
    # the two snapshots cannot pair up unrelated fields.
    diff_rows = []
    for column in dept_cur_stats_df.columns:
        if column == code_column:
            continue
        cur_counts = _band_counts(dept_cur_stats_df[column])
        if column in dept_prev_stats_df.columns:
            prev_counts = _band_counts(dept_prev_stats_df[column])
            diff_counts = cur_counts.subtract(prev_counts, fill_value=0)
        else:
            # No previous data for this column: every record counts as new.
            diff_counts = cur_counts
        diff_rows.append(diff_counts.rename(column))

    if not diff_rows:
        continue

    # Rows are completeness columns, columns are the percentage bands.
    # (DataFrame.append was removed in pandas 2.0, so the frame is built
    # from the collected Series in one go.)
    dept_diff_rows_df = pd.DataFrame(diff_rows).rename_axis('Retrievability')
    # Convert from CategoricalIndex so reset_index can insert a new column -
    # see https://github.com/pandas-dev/pandas/issues/19136
    dept_diff_rows_df.columns = dept_diff_rows_df.columns.tolist()
    dept_diff_rows_df = dept_diff_rows_df.reset_index()

    dept_diff_rows_melt_df = dept_diff_rows_df.melt(
        id_vars=['Retrievability'], var_name='Percentage', value_name='Change')
    # Blank out unchanged bands so no circle is drawn for them.
    dept_diff_rows_melt_df = dept_diff_rows_melt_df.replace(0, np.nan)

    # Fall back to the raw code when the department has no display name.
    dept_title = str(deptNames.get(dept_code, dept_code))

    chart = alt.Chart(dept_diff_rows_melt_df).mark_circle().encode(
        alt.X('Percentage:O', axis=alt.Axis(title="Percentage Complete")),
        alt.Y('Retrievability:O'),
        alt.Size('Size:Q'),
        alt.Color('Direction:N',
                  scale=alt.Scale(domain=['Fewer Records', 'More Records'],
                                  range=['red', 'green'])),
        tooltip=['Change']
    ).transform_calculate(
        Size='abs(datum.Change)',
        Direction='if(datum.Change < 0, "Fewer Records", "More Records")'
    ).properties(title=dept_title, width=300, height=200)

    charts.append(chart)

# Lay the charts out two per row; slicing never yields an empty trailing row.
hcharts = [alt.hconcat(*charts[start:start + charts_per_row])
           for start in range(0, len(charts), charts_per_row)]

alt.vconcat(*hcharts)