# Retrievability (Decades)

These graphs show, for each decade of the museum's existence, the counts of object records by percentage of fields, grouped into retrievability measures, which are defined as:
  * Searchability - all fields containing text are counted for this measure
  * Browseability - all fields using authority controlled values that are used for Faceted browsing
  * Connectivity - all fields linking together individual objects

In [1]:
import glob
import pandas as pd

# Load the object records from the HDF5 store into a DataFrame.
# NOTE(review): this is an absolute path on one machine - adjust when running elsewhere.
retrieval_store_path = '/home/richard/Git/bbk-mqaf/data/latest/retrieval.h5'
objects_df = pd.read_hdf(retrieval_store_path)

In [2]:
# Reduce the data to just the columns needed below: the accession year and
# the completeness columns.
#
# DataFrame.filter(regex=...) keeps every label the pattern is found in, so a
# plain substring is enough. The previous pattern ended in a glob-style '*',
# which as a regular expression only made the final 's' optional.

complete_df = objects_df.filter(regex=r'accessionYear|completeness', axis=1)

In [3]:
import re
def _drop_completeness_tag(column_name):
    """Return *column_name* with every 'completeness:' occurrence removed."""
    return column_name.replace('completeness:', '')


# Shorten the column labels so they read cleanly on the chart axes.
tidied_df = complete_df.rename(columns=_drop_completeness_tag)

In [5]:
# Lookup table from department code to its full display name.
deptNames = {
    "APS": "Apsley House",
    "CER": "Ceramics Collection",
    "CIRC": "Circulation Department (1909 – 1977)",
    "DAD": "Design, Architecture and Digital Department",
    "EAS": "East Asia Collection",
    "EXH": "Exhibitions Department",
    "FoB": "Fabric of the Building",
    "FWK": "Furniture and Woodwork Collection",
    "MES": "Middle East Section",
    "MET": "Metalwork Collection",
    "MoC": "Museum of Childhood",
    "NAL": "National Art Library",
    "PDP": "Prints, Drawings & Paintings Collection",
    "RPS": "Royal Photographic Society Collection",
    "SCP": "Sculpture Collection",
    "SSEA": "South & South East Asia Collection",
    "T&F": "Textiles and Fashion Collection",
    "T&P": "Theatre and Performance Collection",
    "VAA": "V&A Archive Collection",
    "WED": "Wedgwood Collection",
    "AAD": "Archive of Art and Design",
    "DOP": "Department of Photography",
}

In [7]:
import altair as alt

# Build one stacked bar chart per decade showing, for every completeness
# column, how many objects fall into each 10%-wide band, then lay the charts
# out in a grid of three per row.

# Edges and labels of the ten bands used to bin each score (include_lowest
# below puts a score of exactly 0 into the first band).
ranges = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
range_labels = ["0-10%", "10-20%", "20-30%", "30-40%", "40-50%",
                "50-60%", "60-70%", "70-80%", "80-90%", "90-100%"]

# NOTE(review): the leading space is part of the column label as this notebook
# has always used it - confirm against the source data before "tidying" it.
year_column = ' accessionYear'
charts_per_row = 3

charts = []

for decade in range(1850, 2030, 10):
    # Objects accessioned within this decade (both bounds inclusive).
    dept_stats_df = tidied_df.loc[tidied_df[year_column].between(decade, decade + 9)]

    # One Series of per-band object counts for each completeness column.
    # The year column is excluded by name rather than by position so the
    # result does not depend on column order.
    band_counts = []
    for column in dept_stats_df.columns:
        if column == year_column:
            continue
        bands = pd.cut(dept_stats_df[column], ranges, labels=range_labels,
                       include_lowest=True)
        # observed=False keeps bands with zero objects, so every measure
        # reports all ten bands regardless of the pandas default.
        dept_col_counts = dept_stats_df[column].groupby(bands, observed=False).count()
        dept_col_counts.name = column
        band_counts.append(dept_col_counts)

    # Combine in one step: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call. Rows are measures, columns bands.
    if band_counts:
        dept_rows_df = pd.concat(band_counts, axis=1).T
    else:
        dept_rows_df = pd.DataFrame(columns=range_labels)

    # Convert the CategoricalIndex of band labels to plain labels so that
    # reset_index can insert the 'Retrievability' column
    # (see https://github.com/pandas-dev/pandas/issues/19136).
    dept_rows_df.columns = dept_rows_df.columns.tolist()
    dept_rows_df = dept_rows_df.rename_axis('Retrievability').reset_index()

    # Long format: one row per (measure, band) pair with its object count.
    dept_rows_melt_df = dept_rows_df.melt(
        id_vars=['Retrievability'], var_name='Percentage', value_name='Objects')

    chart = alt.Chart(dept_rows_melt_df).mark_bar().encode(
        x=alt.X('Objects:Q', axis=alt.Axis(title='Objects')),
        y='Retrievability:O',
        color='Percentage:O',
        tooltip=['Percentage', 'Objects']
    ).properties(width=220, title=f"{decade} ({len(dept_stats_df)})")
    charts.append(chart)

# Chunk the charts into rows. Slicing never yields an empty chunk, so no
# empty trailing row is produced when the chart count is a multiple of the
# row width (as it is for the 18 decades above).
hcharts = [
    alt.hconcat(*charts[start:start + charts_per_row])
    for start in range(0, len(charts), charts_per_row)
]

alt.vconcat(*hcharts).configure_concat(spacing=20)