In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load both CSV inputs, using each file's sample-identifier column as the row index
df = pd.read_csv(filepath_or_buffer='volume_directory.csv', index_col='sample_id')
ref_df = pd.read_csv(filepath_or_buffer='biomass_scrape.csv', index_col='Sample ID')

In [3]:
# Keep only the rows of df whose index label also appears in ref_df's index
# (ref_df itself is not filtered).
# .copy() makes the filtered frame own its data: without it pandas treats the
# result as a possible copy of the original frame, and the column assignment
# performed on df later in the notebook can raise SettingWithCopyWarning.
df = df[df.index.isin(ref_df.index)].copy()

In [5]:
# Show the column labels of df after filtering, before any column is added
print(df.columns)

Index(['plot_id', 'trial_dir', 'date_dir', 'species', 'width', 'depth',
       'height', 'voxel_vol_02', 'voxel_vol_01', 'voxel_vol_005',
       'voxel_vol_002', 'voxel_vol_001', 'convh_vol', 'ground_area'],
      dtype='object')


In [6]:
# Add ref_df's 'Sampling Age' column to df as 'sampling_age'.
# Assigning a Series to a DataFrame column aligns on index labels, so each row
# of df receives the value stored under the same index label in ref_df; a label
# missing from ref_df would get NaN.
# NOTE(review): this relies on ref_df's index labels being unique — confirm.
df['sampling_age'] = ref_df['Sampling Age']

In [7]:
# Show the column labels followed by the row count, one per line
print(df.columns, len(df), sep='\n')

Index(['plot_id', 'trial_dir', 'date_dir', 'species', 'width', 'depth',
       'height', 'voxel_vol_02', 'voxel_vol_01', 'voxel_vol_005',
       'voxel_vol_002', 'voxel_vol_001', 'convh_vol', 'ground_area',
       'sampling_age'],
      dtype='object')
24


In [28]:
# Layout of the summary table: the two grouping keys, then the five voxel
# volume columns and the convex-hull volume column
s_cols = [
    'species',
    'sampling_age',
    *(f'voxel_vol_{suffix}' for suffix in ('02', '01', '005', '002', '001')),
    'convh_vol',
]
# Start from an empty frame that already carries those column labels
summary_df = pd.DataFrame(columns=s_cols)

In [29]:
# Mean of every volume column for each (species, sampling_age) combination
# that actually occurs in df.
#
# The table is computed in a single groupby pass and bound to summary_df, so
# re-running this cell reproduces the same table instead of appending a second
# copy of every row (the earlier version grew summary_df row by row with .loc).
# Combinations with no samples never form a group, which matches the earlier
# `n != 0` guard. sort=False keeps groups in first-appearance order; the final
# ordering is applied by the explicit sort in the next cell.
volume_cols = [col for col in s_cols if col not in ('species', 'sampling_age')]
summary_df = (
    df.groupby(['species', 'sampling_age'], sort=False)[volume_cols]
    .mean()                  # skips NaN values, like np.mean on a Series
    .reset_index()[s_cols]   # grouping keys back to columns, in s_cols order
)

In [31]:
# Order rows by species, then by sampling age within each species, and show the table
summary_df = summary_df.sort_values(by=['species', 'sampling_age'])
print(summary_df)

      species  sampling_age  voxel_vol_02  voxel_vol_01  voxel_vol_005  \
9      tomato             0      0.004731      0.001653       0.000539   
7      tomato             2      0.028936      0.010664       0.003509   
6      tomato             5      0.052128      0.017904       0.005082   
8      tomato            10      0.145664      0.047812       0.011175   
5      tomato            15      0.222432      0.074665       0.017648   
4  watermelon             0      0.003328      0.001158       0.000377   
3  watermelon             2      0.013872      0.004619       0.001356   
1  watermelon             4      0.004052      0.001378       0.000427   
2  watermelon             9      0.246048      0.081480       0.022184   
0  watermelon            15      0.292144      0.087762       0.016587   

   voxel_vol_002  voxel_vol_001  convh_vol  
9       0.000129       0.000039   0.012095  
7       0.000791       0.000207   0.092525  
6       0.000715       0.000137   0.222063  
8    

In [32]:
# Save the summary table to disk, leaving the row index out of the file
summary_df.to_csv(path_or_buf='volume_summary.csv', index=False)