In [1]:
import math
import statistics
import pandas as pd
import numpy as np
import altair as alt
import vl_convert as vlc

In [2]:
# PARSE METADATA
# Load the sample metadata and reduce it to the Human Excrement Compost (HEC)
# observations that have a usable, non-zero composting time point.

# NOTE: change me
md_fp = './data/nov-1-2024-metadata.tsv'
md = pd.read_csv(md_fp, sep='\t', dtype={'sample-id': str})

# remove comment lines (rows whose sample id starts with '#')
md = md[~ md['sample-id'].str.startswith('#')]

md = md.rename(columns={'sample-id': 'sample_id'})

md = md[['sample_id', 'Bucket', 'Composting Time Point', 'SampleType']]

# keep only HEC observations
md = md[md['SampleType'] == 'Human Excrement Compost']

# drop nan & week-0 rows of HEC
# Compare numerically: once a '#' comment row has been read, the column may
# hold strings, and a string never equals 0. pd.to_numeric raises on values
# that cannot be parsed rather than silently discarding them.
hec_week = pd.to_numeric(md['Composting Time Point'])
# Each comparison is parenthesised on purpose: `&` and `|` bind tighter than
# `==`, `!=` and `>`, so unparenthesised comparisons are grouped incorrectly.
md = md[hec_week.notna() & (hec_week != 0)]

In [3]:
# RECAST WEEK AND BUCKET COLUMNS TO INTEGER
# 'Int64' (capital I) is pandas' nullable integer dtype.

for int_col in ('Bucket', 'Composting Time Point'):
    md[int_col] = md[int_col].astype('Int64')

In [4]:
# IMPORT CULTURING DATA

# NOTE: change me
culturing_fp = './data/culturing-data.tsv'
# skiprows=[1] skips the file's second line (the row directly below the header).
# Sample ids are read as strings so the `.str` accessor below always works and
# the merge key has the same dtype as the metadata's sample ids.
culture = pd.read_csv(
    culturing_fp,
    sep='\t',
    skiprows=[1],
    dtype={'sample-id': str},
)

# remove comment lines (rows whose sample id starts with '#')
culture = culture[~ culture['sample-id'].str.startswith('#')]

culture = culture.rename(columns={'sample-id': 'sample_id'})

culture.head()

Unnamed: 0,sample_id,Bucket,Preliminary analysis of presence,Results from preliminary analysis,Serial Dilution 0.1,Serial Dilution 0.01,Serial Dilution 0.001,Serial Dilution 0.0001,MPN/g,Lower 95% Confidence Limit (MPN/g),Upper 95% Confidence Limit (MPN/g),Lower 95% Confidence Limit (MPN/g) .1,Upper 95% Confidence Limit (MPN/g).1,Composting Time Point,SampleType,Date
0,772b5c41,1,(+/+),Y,(+/+/+/+),(+/+/+/+),(-/-/-/-),(-/-/-/-),239.791,63.784,849.372,38.676,964.672,1,Compost Post-Roll,2021-10-04
1,e706dd01,1,(+/+),Y,(+/+/+/+),(+/+/+/+),(-/-/-/-),(-/-/-/-),239.791,63.784,849.372,38.676,964.672,3,Compost Post-Roll,2021-10-13
2,600569a7,1,(+/+),Y,(+/+/+/-),(-/-/-/-),(-/-/-/-),(-/-/-/-),11.451,1.724,29.699,2.191,43.744,5,Compost Post-Roll,2021-10-28
3,b9f1a9d7,1,(+/-),Y,(-/-/-/-),(-/-/-/-),(-/-/-/-),(-/-/-/-),0.0,0.0,0.0,0.0,0.0,7,Compost Post-Roll,2021-11-11
4,3983043e,1,(+/+),Y,(+/+/-/-),(-/-/-/-),(-/-/-/-),(-/-/-/-),6.061,0.386,16.26,0.653,26.597,9,Compost Post-Roll,2021-11-24


In [5]:
# SUBSET CULTURING DATA

# keep only the sample id and its MPN/g measurement, with MPN/g cast to float
culture = culture.loc[:, ['sample_id', 'MPN/g']].astype({'MPN/g': float})

In [6]:
# DROP NA CULTURING VALUES

# rows with a missing MPN/g measurement are removed
culture = culture.dropna(subset=['MPN/g'])

In [7]:
# MERGE CULTURING DATA, METADATA

# inner join on sample_id: only samples present in both tables are kept
culture_md = pd.merge(culture, md, how='inner', on='sample_id')

culture_md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   sample_id              384 non-null    object 
 1   MPN/g                  384 non-null    float64
 2   Bucket                 384 non-null    Int64  
 3   Composting Time Point  384 non-null    Int64  
 4   SampleType             384 non-null    object 
dtypes: Int64(2), float64(1), object(2)
memory usage: 15.9+ KB


In [8]:
# CALCULATE PER-BUCKET MOVING AVERAGES

# averaging period (number of consecutive observations, not calendar weeks)
window = 4


def _trailing_mean(values):
    """Mean over the current and up to `window - 1` preceding rows."""
    # min_periods=1 lets the first rows of a group average over fewer values
    return values.rolling(window=window, min_periods=1).mean()


# rows must be in week order before the rolling window is applied
culture_md = culture_md.sort_values(by='Composting Time Point')

# transform keeps the original row index, so the result aligns with culture_md
culture_md['MPN/g Moving Average'] = (
    culture_md
    .groupby(['Bucket', 'SampleType'])['MPN/g']
    .transform(_trailing_mean)
)

In [9]:
# Line chart of raw MPN/g against composting time point, one facet per bucket.
line = (
    alt.Chart(culture_md)
    .mark_line()
    .encode(
        # fixed 0-52 x domain; nice=False stops the scale from rounding it
        x=alt.X(
            'Composting Time Point',
            scale=alt.Scale(domain=(0, 52), nice=False),
        ),
        y=alt.Y(
            'MPN/g',
            scale=alt.Scale(type='linear'),
            title='MPN/g',
        ),
    )
    .transform_filter(
        alt.datum['SampleType'] == 'Human Excrement Compost'
    )
    .facet(
        facet='Bucket',
        columns=5,
        title=alt.Title('Culturing MPN/g, By Bucket'),
    )
    # each facet gets its own x and y scale
    .resolve_scale(x='independent', y='independent')
)

line