In [1]:
import polars as pl
import pathlib
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

pio.renderers.default = 'plotly_mimetype+notebook'
pl.Config.set_fmt_str_lengths(100)

polars.config.Config

In [13]:
path_data = pathlib.Path.cwd().parent / 'data'
path_input = path_data / 'input'
path_output = path_data / 'output'

path_df = path_output / 'HY1999-2016_P3A3_20241001.csv'
path_df_ltar = path_input / 'hy2017-hy2022_QCCodes_P2A0_20231219.csv'

In [3]:
df = pl.read_csv(str(path_df), infer_schema_length=6000)
df.head()

HarvestYear,ID2,Longitude,Latitude,SampleID,Crop,Comments,QCCoverage,QCFlags,CropExists,GrainMoisture,GrainProtein,GrainStarch,GrainWGlutDM,GrainOilDM,GrainTestWeight,GrainCarbon,GrainNitrogen,GrainSulfur,ResidueCarbon,ResidueNitrogen,ResidueSulfur,GrainYieldWet,GrainYieldAirDry,ResidueMassWetPerArea,ResidueMassAirDryPerArea
i64,i64,f64,f64,str,str,str,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1999,1,-117.087514,46.778729,"""1999_1""","""SW""",,100.0,0.0,1,,,,,,,44.06,2.387,0.1919,44.1,0.44,0.06,229.084646,220.836614,339.665354,320.768812
1999,2,-117.087065,46.778692,"""1999_2""","""SW""",,100.0,0.0,1,,,,,,,45.26,2.602,0.2053,43.3,0.45,0.07,251.919291,242.73622,414.96063,391.49443
1999,3,-117.086678,46.778791,"""1999_3""","""SW""",,100.0,0.0,1,,,,,,,44.52,2.489,0.19,43.38,0.41,0.08,266.781496,255.344488,384.744094,366.365872
1999,4,-117.08626,46.778761,"""1999_4""","""SW""",,100.0,0.0,1,,,,,,,44.64,2.432,0.18,43.41,0.29,0.06,300.19685,289.17815,430.019685,408.890419
1999,5,-117.085842,46.778666,"""1999_5""","""SW""",,100.0,0.0,1,,,,,,,44.27,2.599,0.1874,40.93,0.43,0.07,353.395669,341.520669,493.011811,466.320089


In [14]:
# Lots of duplicated values in dataset (not sure why??) so drop
df_ltar = pl.read_csv(str(path_df_ltar), infer_schema_length=10000).filter(pl.col('FieldId') == 'CE').unique().sort(['HarvestYear', 'ID2'])
df_ltar.head()

HarvestYear,FieldId,ID2,SampleId,Crop,Crop_qcApplied,Crop_qcResult,Crop_qcPhrase,CropExists,CropExists_qcApplied,CropExists_qcResult,CropExists_qcPhrase,HarvestDate,HarvestDate_qcApplied,HarvestDate_qcResult,HarvestDate_qcPhrase,BiomassDry,BiomassDry_qcApplied,BiomassDry_qcResult,BiomassDry_qcPhrase,GrainMassDry,GrainMassDry_qcApplied,GrainMassDry_qcResult,GrainMassDry_qcPhrase,GrainTestWeight,GrainTestWeight_qcApplied,GrainTestWeight_qcResult,GrainTestWeight_qcPhrase,GrainMoisture,GrainMoisture_qcApplied,GrainMoisture_qcResult,GrainMoisture_qcPhrase,GrainProtein,GrainProtein_qcApplied,GrainProtein_qcResult,GrainProtein_qcPhrase,GrainStarch,…,Residue13C_qcApplied,Residue13C_qcResult,Residue13C_qcPhrase,ResidueNitrogen,ResidueNitrogen_qcApplied,ResidueNitrogen_qcResult,ResidueNitrogen_qcPhrase,GrainMass0,GrainMass0_qcApplied,GrainMass0_qcResult,GrainMass0_qcPhrase,GrainMass125,GrainMass125_qcApplied,GrainMass125_qcResult,GrainMass125_qcPhrase,GrainYieldDryPerArea,GrainYieldDryPerArea_qcApplied,GrainYieldDryPerArea_qcResult,GrainYieldDryPerArea_qcPhrase,GrainYield0PerArea,GrainYield0PerArea_qcApplied,GrainYield0PerArea_qcResult,GrainYield0PerArea_qcPhrase,GrainYield125PerArea,GrainYield125PerArea_qcApplied,GrainYield125PerArea_qcResult,GrainYield125PerArea_qcPhrase,BiomassDryPerArea,BiomassDryPerArea_qcApplied,BiomassDryPerArea_qcResult,BiomassDryPerArea_qcPhrase,HarvestIndex,HarvestIndex_qcApplied,HarvestIndex_qcResult,HarvestIndex_qcPhrase,Comments,QualityControlFlag
i64,str,i64,str,str,i64,i64,str,i64,i64,i64,str,str,i64,i64,str,f64,i64,i64,str,f64,i64,i64,str,f64,i64,i64,str,f64,i64,i64,str,f64,i64,i64,str,f64,…,i64,i64,str,f64,i64,i64,str,f64,i64,i64,str,f64,i64,i64,str,f64,i64,i64,str,f64,i64,i64,str,f64,i64,i64,str,f64,i64,i64,str,f64,i64,i64,str,str,i64
2017,"""CE""",1,"""CE1_Bio_SW_2017_5-A""","""SW""",1,0,,1,1,0,,,1,0,,1198.7,1,0,,625.3,1,0,,57.5,1,0,,3.0,1,0,,11.7,1001,0,,69.1,…,1,0,,0.439046,1011,0,,606.541,1,0,,682.358625,1,0,,256.438648,1011,0,,248.745489,1,0,,279.838675,1,0,,491.592848,11,0,,0.521648,1,0,,,0
2017,"""CE""",2,"""CE2_Bio_SW_2017_6-A""","""SW""",1,0,,1,1,0,,,1,0,,1949.7,1,0,,942.3,1,0,,57.28125,1,0,,3.8,1,0,,12.8,1001,0,,69.2,…,1,0,,,1011,0,,906.4926,1,0,,1019.804175,1,0,,386.441929,1011,0,,371.757136,1,0,,418.226778,1,0,,799.581693,11,0,,0.483305,1,0,,,0
2017,"""CE""",3,"""CE3_Bio_SW_2017_7-A""","""SW""",1,0,,1,1,0,,,1,0,,1413.7,1,0,,777.3,1,0,,59.0,1,0,,4.4,1,0,,11.3,1001,0,,70.4,…,1,0,,0.439032,1011,0,,743.0988,1,0,,835.98615,1,0,,318.774606,1011,0,,304.748524,1,0,,342.842089,1,0,,579.76542,11,0,,0.549834,1,0,,,0
2017,"""CE""",4,"""CE4_Bio_SW_2017_8-A""","""SW""",1,0,,1,1,0,,,1,0,,1629.7,1,0,,777.3,1,0,,57.4,1,0,,1.2,1,0,,12.5,1001,0,,68.8,…,1,0,,,1011,0,,767.9724,1,0,,863.96895,1,0,,318.774606,1011,0,,314.949311,1,0,,354.317975,1,0,,668.348097,11,0,,0.476959,1,0,,,0
2017,"""CE""",5,"""CE5_Bio_SW_2017_9-A""","""SW""",1,0,,1,1,0,,,1,0,,1545.7,1,0,,740.3,1,0,,58.7,1,0,,2.8,1,0,,12.4,1001,0,,68.9,…,1,0,,0.409888,1011,0,,719.5716,1,0,,809.51805,1,0,,303.600722,1011,0,,295.099902,1,0,,331.987389,1,0,,633.899278,11,0,,0.478942,1,0,,,0


In [29]:
# Merge ltar data with legacy data
print(df_ltar.columns)
print(df.columns)
df_ltar = df_ltar.with_columns(
    (pl.col('BiomassDryPerArea') - pl.col('GrainYieldDryPerArea')).alias('ResidueDryPerArea')
)
#df_ltar['ResidueDryPerArea'] = df_ltar['BiomassDryPerArea'] - df_ltar['GrainYieldDryPerArea']
ltar_keep_cols = ['HarvestYear', 'ID2', 'Crop', 'GrainYieldDryPerArea', 'ResidueDryPerArea']
df_ltar_subset = df_ltar.select(ltar_keep_cols).rename({'GrainYieldDryPerArea': 'GrainYieldAirDry', 'ResidueDryPerArea': 'ResidueMassAirDryPerArea'})

df_join = pl.concat([df, df_ltar_subset], how='diagonal')
print(df_join.columns)
print(df_join.head())

df = df_join

['HarvestYear', 'FieldId', 'ID2', 'SampleId', 'Crop', 'Crop_qcApplied', 'Crop_qcResult', 'Crop_qcPhrase', 'CropExists', 'CropExists_qcApplied', 'CropExists_qcResult', 'CropExists_qcPhrase', 'HarvestDate', 'HarvestDate_qcApplied', 'HarvestDate_qcResult', 'HarvestDate_qcPhrase', 'BiomassDry', 'BiomassDry_qcApplied', 'BiomassDry_qcResult', 'BiomassDry_qcPhrase', 'GrainMassDry', 'GrainMassDry_qcApplied', 'GrainMassDry_qcResult', 'GrainMassDry_qcPhrase', 'GrainTestWeight', 'GrainTestWeight_qcApplied', 'GrainTestWeight_qcResult', 'GrainTestWeight_qcPhrase', 'GrainMoisture', 'GrainMoisture_qcApplied', 'GrainMoisture_qcResult', 'GrainMoisture_qcPhrase', 'GrainProtein', 'GrainProtein_qcApplied', 'GrainProtein_qcResult', 'GrainProtein_qcPhrase', 'GrainStarch', 'GrainStarch_qcApplied', 'GrainStarch_qcResult', 'GrainStarch_qcPhrase', 'GrainGluten', 'GrainGluten_qcApplied', 'GrainGluten_qcResult', 'GrainGluten_qcPhrase', 'GrainOil', 'GrainOil_qcApplied', 'GrainOil_qcResult', 'GrainOil_qcPhrase', 'G

In [24]:
non_metric_cols = ['HarvestYear', 'ID2', 'Longitude', 'Latitude', 'SampleID', 'Crop']
metric_cols = [x for x in df.columns if x not in non_metric_cols]

df_agg_yr = (
    df.group_by(['HarvestYear', 'Crop'])
    .agg(
        [pl.col(f'{c}').mean().alias(f'{c}_mean') for c in metric_cols]
    )
    .sort(['Crop', 'HarvestYear'])
) 

print(df_agg_yr)

shape: (108, 25)
┌────────────┬──────┬────────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐
│ HarvestYea ┆ Crop ┆ Comments_m ┆ QCCoverage ┆ … ┆ ResidueMas ┆ GrainYiel ┆ GrainYiel ┆ ResidueMa │
│ r          ┆ ---  ┆ ean        ┆ _mean      ┆   ┆ sAirDryPer ┆ dAirDry_P ┆ dDry0_P2_ ┆ ssAirDryP │
│ ---        ┆ str  ┆ ---        ┆ ---        ┆   ┆ Area_mean  ┆ 2_mean    ┆ mean      ┆ erArea_P2 │
│ i64        ┆      ┆ str        ┆ f64        ┆   ┆ ---        ┆ ---       ┆ ---       ┆ _mean     │
│            ┆      ┆            ┆            ┆   ┆ f64        ┆ f64       ┆ f64       ┆ ---       │
│            ┆      ┆            ┆            ┆   ┆            ┆           ┆           ┆ f64       │
╞════════════╪══════╪════════════╪════════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡
│ 2001       ┆ null ┆ null       ┆ 100.0      ┆ … ┆ null       ┆ null      ┆ null      ┆ null      │
│ 2003       ┆ null ┆ null       ┆ 100.0      ┆ … ┆ null       ┆ null     

In [25]:
def create_yield_comparison_plot(crop, grain_or_residue):

    # init with 'grain' values
    wet_var = 'GrainYieldWet'
    air_var = 'GrainYieldAirDry'

    if(grain_or_residue == 'residue'):
        wet_var = 'ResidueMassWetPerArea'
        air_var = 'ResidueMassAirDryPerArea'
        
    fig = go.Figure()

    crop_df = df.filter(pl.col('Crop') == crop)

    x = crop_df['HarvestYear'].to_list()

    # Start/End cutoffs for years were methods and/or data were different
    cutoff_a0 = 1999
    cutoff_a1 = 2010

    cutoff_b0 = 2011
    cutoff_b1 = 2012

    cutoff_c0 = 2013
    cutoff_c1 = 2016

    cutoff_d0 = 2017
    cutoff_d1 = 2022

    # Wet averages
    if(wet_var is not None):
        wet_mean_1999_2010 = crop_df.filter((pl.col('HarvestYear') >= cutoff_a0) & (pl.col('HarvestYear') <= cutoff_a1))[wet_var].mean()
        wet_mean_2011_2012 = crop_df.filter((pl.col('HarvestYear') >= cutoff_b0) & (pl.col('HarvestYear') <= cutoff_b1))[wet_var].mean()
        wet_mean_2013_2016 = crop_df.filter((pl.col('HarvestYear') >= cutoff_c0) & (pl.col('HarvestYear') <= cutoff_c1))[wet_var].mean()
        wet_mean_2017_2022 = crop_df.filter((pl.col('HarvestYear') >= cutoff_d0) & (pl.col('HarvestYear') <= cutoff_d1))[wet_var].mean()
    else:
        wet_mean_1999_2010 = 0
        wet_mean_2011_2012 = 0
        wet_mean_2013_2016 = 0
        wet_mean_2017_2022 = 0

    # Air dry averages
    if(air_var is not None):
        air_mean_1999_2010 = crop_df.filter((pl.col('HarvestYear') >= cutoff_a0) & (pl.col('HarvestYear') <= cutoff_a1))[air_var].mean()
        air_mean_2011_2012 = crop_df.filter((pl.col('HarvestYear') >= cutoff_b0) & (pl.col('HarvestYear') <= cutoff_b1))[air_var].mean()
        air_mean_2013_2016 = crop_df.filter((pl.col('HarvestYear') >= cutoff_c0) & (pl.col('HarvestYear') <= cutoff_c1))[air_var].mean()
        air_mean_2017_2022 = crop_df.filter((pl.col('HarvestYear') >= cutoff_d0) & (pl.col('HarvestYear') <= cutoff_d1))[air_var].mean()
    else:
        air_mean_1999_2010 = 0
        air_mean_2011_2012 = 0
        air_mean_2013_2016 = 0
        air_mean_2017_2022 = 0

    # Box plots
    if(wet_var is not None):
        fig.add_trace(go.Box(
            x = x,
            y = crop_df[wet_var].to_list(),
            name = 'Wet',
            marker_color='blue'
        ))

    if(air_var is not None):
        fig.add_trace(go.Box(
            x = x,
            y = crop_df[air_var].to_list(),
            name='Air',
            marker_color='green'
        ))

    # Title, xaxis, add dashed v lines
    fig.update_layout(
        boxmode='group',
        title={'text':crop,
               'y':0.9,
               'x':0.5,
               'xanchor':'center',
               'yanchor':'top'},
        xaxis=dict(range=[1998,2023]))
    fig.add_vline(x = cutoff_a1 + 0.5, line_width = 3, line_dash='dash')
    fig.add_vline(x = cutoff_b1 + 0.5, line_width = 3, line_dash='dash')
    fig.add_vline(x = cutoff_c1 + 0.5, line_width = 3, line_dash='dash')

    #Wet averages
    fig.add_shape(type='line',
                  x0=cutoff_a0,
                  x1=cutoff_a1,
                  y0=wet_mean_1999_2010,
                  y1=wet_mean_1999_2010,
                  xref='x',
                  yref='y',
                  line = dict(color='Blue'))
    fig.add_shape(type='line',
                  x0=cutoff_b0,
                  x1=cutoff_b1,
                  y0=wet_mean_2011_2012,
                  y1=wet_mean_2011_2012,
                  xref='x',
                  yref='y',
                  line = dict(color='Blue'))
    fig.add_shape(type='line',
                  x0=cutoff_c0,
                  x1=cutoff_c1,
                  y0=wet_mean_2013_2016,
                  y1=wet_mean_2013_2016,
                  xref='x',
                  yref='y',
                  line = dict(color='Blue'))
    fig.add_shape(type='line',
                  x0=cutoff_d0,
                  x1=cutoff_d1,
                  y0=wet_mean_2017_2022,
                  y1=wet_mean_2017_2022,
                  xref='x',
                  yref='y',
                  line = dict(color='Blue'))
    
    #Air dry averages
    fig.add_shape(type='line',
                  x0=cutoff_a0,
                  x1=cutoff_a1,
                  y0=air_mean_1999_2010,
                  y1=air_mean_1999_2010,
                  xref='x',
                  yref='y',
                  line = dict(color='Green'))
    fig.add_shape(type='line',
                  x0=cutoff_b0,
                  x1=cutoff_b1,
                  y0=air_mean_2011_2012,
                  y1=air_mean_2011_2012,
                  xref='x',
                  yref='y',
                  line = dict(color='Green'))
    fig.add_shape(type='line',
                  x0=cutoff_c0,
                  x1=cutoff_c1,
                  y0=air_mean_2013_2016,
                  y1=air_mean_2013_2016,
                  xref='x',
                  yref='y',
                  line = dict(color='Green'))
    fig.add_shape(type='line',
                  x0=cutoff_d0,
                  x1=cutoff_d1,
                  y0=air_mean_2017_2022,
                  y1=air_mean_2017_2022,
                  xref='x',
                  yref='y',
                  line = dict(color='Green'))

    fig.show()

In [26]:
crops = list(set(df['Crop'].to_list()))

# Box plots

The following figures show box plots of each crop across the years. Vertical dashed lines indicate a change of methods (and/or available data). The horizontal lines indicate mean values for the years within the vertical lines.

## Grain yields (g/m2)

In [27]:
for crop in crops:
    create_yield_comparison_plot(crop, 'grain')


Comparisons with None always result in null. Consider using `.is_null()` or `.is_not_null()`.



## Residue yields (g/m2)

In [28]:
for crop in crops:
    create_yield_comparison_plot(crop, 'residue')


Comparisons with None always result in null. Consider using `.is_null()` or `.is_not_null()`.

