In [1]:
import math
import statistics
import pandas as pd
import numpy as np
import altair as alt
import vl_convert as vlc

In [2]:
# PARSE METADATA

# NOTE: change me
md_fp = ''
# sample ids are forced to str so they are never parsed as numbers
md = pd.read_csv(md_fp, sep='\t', dtype={'sample-id': str})

md = md.rename({'sample-id': 'sample_id'}, axis=1)

md = md[['sample_id', 'Bucket', 'Composting Time Point', 'SampleType']]

# drop nan & week-0 rows of HEC
# The sample type lives in 'SampleType' (not in the time point column), and
# each comparison is parenthesised because `|` binds tighter than `==`/`>`.
is_hec = md['SampleType'] == 'Human Excrement Compost'
lacks_usable_week = (
    md['Composting Time Point'].isna() |
    (md['Composting Time Point'] == 0)
)
md = md[~(is_hec & lacks_usable_week)]

# keep only HE and HEC observations
md = md[md['SampleType'].isin(['Human Excrement Compost', 'Human Excrement'])]

In [3]:
# IMPORT CULTURING DATA

# NOTE: change me
culturing_fp = ''

# skiprows=[1] skips the line directly below the header instead of parsing
# it as a data row
culture = (
    pd.read_csv(culturing_fp, sep='\t', skiprows=[1])
    .rename(columns={'sample-id': 'sample_id'})
)

culture.head()

Unnamed: 0,sample_id,Bucket,Preliminary analysis of presence,Results from preliminary analysis,Serial Dilution 0.1,Serial Dilution 0.01,Serial Dilution 0.001,Serial Dilution 0.0001,MPN/g,Lower 95% Confidence Limit (MPN/g),Upper 95% Confidence Limit (MPN/g),Lower 95% Confidence Limit (MPN/g) .1,Upper 95% Confidence Limit (MPN/g).1,Composting Time Point,SampleType,Date
0,772b5c41,1,(+/+),Y,(+/+/+/+),(+/+/+/+),(-/-/-/-),(-/-/-/-),239.791,63.784,849.372,38.676,964.672,1,Compost Post-Roll,2021-10-04
1,e706dd01,1,(+/+),Y,(+/+/+/+),(+/+/+/+),(-/-/-/-),(-/-/-/-),239.791,63.784,849.372,38.676,964.672,3,Compost Post-Roll,2021-10-13
2,600569a7,1,(+/+),Y,(+/+/+/-),(-/-/-/-),(-/-/-/-),(-/-/-/-),11.451,1.724,29.699,2.191,43.744,5,Compost Post-Roll,2021-10-28
3,b9f1a9d7,1,(+/-),Y,(-/-/-/-),(-/-/-/-),(-/-/-/-),(-/-/-/-),0.0,0.0,0.0,0.0,0.0,7,Compost Post-Roll,2021-11-11
4,3983043e,1,(+/+),Y,(+/+/-/-),(-/-/-/-),(-/-/-/-),(-/-/-/-),6.061,0.386,16.26,0.653,26.597,9,Compost Post-Roll,2021-11-24


In [4]:
# SUBSET CULTURING DATA

# The description row is duplicated, so the trailing row is discarded; only
# the sample id and the MPN/g value are kept, with MPN/g cast to float.
# NOTE: `culture` is rebuilt from itself, so re-running this cell removes one
# more trailing row each time -- re-run the import cell first.
culture = (
    culture
    .iloc[:-1]
    .loc[:, ['sample_id', 'MPN/g']]
    .astype({'MPN/g': float})
)

In [5]:
# DROP NA CULTURING VALUES

# rows without an MPN/g value are removed
culture = culture.dropna(subset=['MPN/g'])

In [6]:
# MERGE CULTURING DATA, METADATA

# report culturing samples that have no matching metadata row; the inner
# join below leaves these out of the merged table
culture_ids = set(culture['sample_id'])
md_ids = set(md['sample_id'])
culture_only = culture_ids.difference(md_ids)
print('ids in culture, not in metadata: ', culture_only)

# inner join: only samples present in both tables are kept
culture_md = culture.merge(md, how='inner', on='sample_id')

ids in culture, not in metadata:  {'4fcd5839'}


In [7]:
# CALCULATE PER-BUCKET MOVING AVERAGES

# averaging period (number of consecutive rows per group)
window = 4

# rows must be in time-point order before the rolling mean is taken
culture_md = culture_md.sort_values(by='Composting Time Point')

# Trailing mean of MPN/g within each (Bucket, SampleType) group.
# min_periods=1 lets the first rows of a group average over fewer than
# `window` values instead of yielding NaN.
mpn_by_group = culture_md.groupby(['Bucket', 'SampleType'])['MPN/g']
culture_md['MPN/g Moving Average'] = mpn_by_group.transform(
    lambda mpn: mpn.rolling(window=window, min_periods=1).mean()
)

In [8]:
# Line chart of MPN/g against composting time point, one panel per bucket.
# The raw 'MPN/g' column is plotted here, not 'MPN/g Moving Average'.
x_week = alt.X('Composting Time Point').scale(domain=(0, 52), nice=False)
y_mpn = alt.Y('MPN/g').scale(type='linear').title('MPN/g')

# only the compost samples are drawn
is_compost = alt.datum['SampleType'] == 'Human Excrement Compost'

per_bucket = (
    alt.Chart(culture_md)
    .mark_line()
    .encode(x=x_week, y=y_mpn)
    .transform_filter(is_compost)
    .facet(
        facet='Bucket',
        columns=5,
        title=alt.Title('Culturing MPN/g, By Bucket'),
    )
)

# every panel resolves its own x and y scale
line = per_bucket.resolve_scale(x='independent', y='independent')

line

In [9]:
# spot-check the merged rows belonging to bucket 16
in_bucket_16 = culture_md['Bucket'] == 16
culture_md.loc[in_bucket_16]

Unnamed: 0,sample_id,MPN/g,Bucket,Composting Time Point,SampleType,MPN/g Moving Average
358,c82b032c,0.0,16.0,1.0,Human Excrement Compost,0.0
359,0975edb5,0.0,16.0,3.0,Human Excrement Compost,0.0
360,1db63fd8,0.0,16.0,5.0,Human Excrement Compost,0.0
361,ccca29fb,0.0,16.0,7.0,Human Excrement Compost,0.0
362,8be7bfa3,0.0,16.0,9.0,Human Excrement Compost,0.0
363,21f81b81,0.0,16.0,11.0,Human Excrement Compost,0.0
364,b7caa7d8,0.0,16.0,13.0,Human Excrement Compost,0.0
365,e1f6d0be,0.0,16.0,15.0,Human Excrement Compost,0.0
366,f3ace39a,0.0,16.0,17.0,Human Excrement Compost,0.0
367,88e24a0d,0.0,16.0,19.0,Human Excrement Compost,0.0
