# Retrieval Performance (Departmental)

In [1]:
import glob
import pandas as pd

# Read the retrieval HDF5 export into a DataFrame; every later cell derives
# its data from `objects_df`.
# NOTE(review): absolute, machine-specific path - the notebook only runs where
# this file exists.
objects_df = pd.read_hdf(
    path_or_buf='/home/richard/Git/bbk-mqaf/data/latest/retrieval.h5'
)

In [3]:
# Narrow the loaded frame to the department code column plus the completeness
# score columns; everything else in `objects_df` is dropped.
# NOTE(review): the pattern is a regular expression, not a glob, so the
# trailing `*` only repeats the final "s" - any column whose name contains
# "collectionCode" or "completenes" is kept. Confirm that is the intent.
complete_df = objects_df.filter(
    regex='collectionCode|completeness*',
    axis='columns',
)

In [4]:
# Show the filtered frame as the cell output (pandas renders a truncated
# preview: the first and last rows with an ellipsis in between).
complete_df

Unnamed: 0,collectionCode,completeness:TOTAL,completeness:SEARCHABILITY,completeness:BROWSEABILITY,completeness:CONNECTIVITY
0,T&P,0.435897,0.447368,0.391304,0.0
1,T&P,0.333333,0.342105,0.347826,0.0
2,T&P,0.307692,0.315789,0.304348,0.0
3,T&P,0.410256,0.421053,0.391304,0.0
4,T&P,0.358974,0.368421,0.347826,0.0
...,...,...,...,...,...
8759,PDP,0.333333,0.342105,0.391304,0.0
8760,PDP,0.333333,0.342105,0.391304,0.0
8761,PDP,0.333333,0.342105,0.391304,0.0
8762,PDP,0.333333,0.342105,0.391304,0.0


In [5]:
# Distinct department codes, in order of first appearance.
# The column label is looked up with a leading space (' collectionCode').
depts = complete_df.loc[:, ' collectionCode'].unique()

In [6]:
import re
# Drop the 'completeness:' prefix from the column labels so the score columns
# become TOTAL, SEARCHABILITY, ...; labels without the prefix are unchanged.
tidied_df = complete_df.rename(
    mapper=lambda col_name: re.sub('completeness:', '', col_name),
    axis='columns',
)
tidied_df

Unnamed: 0,collectionCode,TOTAL,SEARCHABILITY,BROWSEABILITY,CONNECTIVITY
0,T&P,0.435897,0.447368,0.391304,0.0
1,T&P,0.333333,0.342105,0.347826,0.0
2,T&P,0.307692,0.315789,0.304348,0.0
3,T&P,0.410256,0.421053,0.391304,0.0
4,T&P,0.358974,0.368421,0.347826,0.0
...,...,...,...,...,...
8759,PDP,0.333333,0.342105,0.391304,0.0
8760,PDP,0.333333,0.342105,0.391304,0.0
8761,PDP,0.333333,0.342105,0.391304,0.0
8762,PDP,0.333333,0.342105,0.391304,0.0


In [7]:
import altair as alt

# Build one stacked bar chart per department showing, for each score column,
# how many objects fall into each 10%-wide score bin, then lay the charts out
# in a grid.

# Bin edges for the scores (0 to 1 in steps of 0.1) and the label given to
# each of the ten bins (named after the bin's lower edge).
ranges = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
bin_labels = ["0%", "10%", "20%", "30%", "40%", "50%", "60%", "70%", "80%", "90%"]
# Departments with fewer rows than this are left out of the plot.
min_dept_rows = 10
# Number of department charts placed side by side before a new row is started.
charts_per_row = 3

depts = tidied_df[' collectionCode'].unique()
charts = []   # charts collected for the row currently being filled
hcharts = []  # completed rows of charts
for dept_code in depts:
    dept_stats_df = tidied_df.loc[tidied_df[' collectionCode'] == dept_code]
    if len(dept_stats_df) < min_dept_rows:
        # Ignore EXH and other erratic departments
        continue

    # For every column after the first (the department code), count how many
    # objects fall into each score bin.
    binned_counts = []
    for column in dept_stats_df.columns[1:]:
        scores = dept_stats_df[column]
        score_bins = pd.cut(scores, ranges, labels=bin_labels, include_lowest=True)
        # observed=False keeps bins with no objects (count 0), which is the
        # historical default and what the chart legend relies on.
        dept_col_counts = scores.groupby(score_bins, observed=False).count()
        dept_col_counts.name = column
        binned_counts.append(dept_col_counts)

    # DataFrame.append was removed in pandas 2.0; assemble the frame in one
    # step instead: one row per score column, one column per bin.
    dept_rows_df = pd.concat(binned_counts, axis=1).T
    dept_rows_df = dept_rows_df.rename_axis('Retrievability').rename_axis('Percentages', axis='columns')
    # To handle converting from CategoricalIndex - may change
    # https://github.com/pandas-dev/pandas/issues/19136
    # (assigning a plain list also clears the columns' name, so melt() below
    # produces the default 'variable' column that the chart encodes.)
    dept_rows_df.columns = dept_rows_df.columns.tolist()
    dept_rows_df = dept_rows_df.reset_index()

    # Long format: one row per (score column, bin) pair with its count.
    dept_rows_melt_df = dept_rows_df.melt(id_vars=['Retrievability'])

    chart = alt.Chart(dept_rows_melt_df).mark_bar().encode(
        x=alt.X('value:Q', axis=alt.Axis(title='Objects')),
        y='Retrievability:O',
        color='variable:O',
        tooltip=['variable', 'value']
    ).properties(width=220, title="%s (%d)" % (dept_code, len(dept_stats_df)))
    charts.append(chart)

    # Flush a full row into the grid.
    if len(charts) == charts_per_row:
        hcharts.append(alt.hconcat(*charts))
        charts = []

# Add the final, partially filled row - but only if it has charts, so that a
# department count divisible by the row width does not add an empty row.
if charts:
    hcharts.append(alt.hconcat(*charts))

alt.vconcat(*hcharts)