# Parameter Recommendation
This notebook generates recommendations for aligning variety trial data collection with crop modeling needs, and vice versa. It proceeds in five steps. First, it retrieves the model importance and prevalence of each parameter for each model. Second, it identifies parameters that are present in the raw data but not in the trial reports, and recommends including them in the reports. Third, it identifies parameters that the variety trial data record at a lower (spatio-temporal) granularity than a model requires, and recommends recording them at a finer granularity, or supplying them as raw data if the raw data already hold them at the correct granularity. Fourth, it identifies parameters that are important but rarely recorded, and recommends that trial runners always record them. Finally, for parameters that are not recorded at all, recommendations are made manually on a case-by-case basis.

In [9]:
# Import Libraries
import numpy as np
import pandas as pd

In [10]:
# Crop models for which recommendations are generated
models = [
    'aquacrop',
    'dssat',
    'genericdescriptive',
    'rodekoning',
    'rotomgro',
    'stics',
]

# Temporal precision score of each temporality label
# (rec_code sorts on these scores in descending order, so 'time' ranks first)
temp_order = {
    'time': 5,
    'date': 4,
    'frequency': 3,
    'range': 2,
    'static': 1,
}

# Placeholder labels substituted into the code 4 and code 5 texts below
temp_a = 'placeholder a'
temp_b = 'placeholder b'

# Recommendation text for each recommendation code
rec_codes = {
    0: 'individual inspection required',
    1: 'always record in report',
    2: 'supply raw data',
    3: 'always record in raw data, supply raw data',
    4: f'recorded as {temp_a} in report, should be {temp_b}',
    5: f'recorded as {temp_a} in raw data, should be {temp_b}, supply raw data',
}

In [11]:
# Empty recommendations table: the descriptive columns come first,
# followed by one flag column per model (initialised to False).
base_columns = [
    'universal_term',
    'domain',
    'temporality',
    'situational',
    'importance',
    'rec_code',
    'recommendation',
]
rec_df = (
    pd.DataFrame(columns=base_columns)
    .assign(**dict.fromkeys(models, False))
    .reset_index(drop=True)
)

In [12]:
# Define a function to give recommendations

def _best_parent_temp(df, parent_param):
    """Return the temporality label of the top-ranked row sharing ``parent_param``.

    Rows of ``df`` whose ``parent_parameter`` equals ``parent_param`` and whose
    ``t_temp`` is not missing are ranked by ``fraction`` and then by their
    temporal precision score (looked up in the module-level ``temp_order``),
    both in descending order.

    Returns ``None`` when no row qualifies. Otherwise returns the ``t_temp``
    label of the top-ranked row, which is NaN if that label is not a key of
    ``temp_order``.
    """
    siblings = df[df['parent_parameter'] == parent_param].dropna(subset=['t_temp'])
    if siblings.empty:
        return None
    # Rank on a derived frame: assigning into the filtered slice itself raises
    # SettingWithCopyWarning and risks writing through to the caller's frame.
    ranked = siblings.assign(t_temp=siblings['t_temp'].map(temp_order)).sort_values(
        by=['fraction', 't_temp'], ascending=False)
    score_to_label = {score: label for label, score in temp_order.items()}
    # Positional access returns the top row even when index labels repeat.
    return ranked['t_temp'].map(score_to_label).iloc[0]


def rec_code(row, report_threshold=4/32, raw_supply_threshold=5/7, raw_record_threshold=2/7):
    """Assign a recommendation code and text to one parameter row.

    Intended for ``DataFrame.apply(rec_code, axis=1)``. Besides ``row`` it reads
    the module-level names ``param_df`` (loaded from the report comparison
    sheet), ``rd_param_df`` (loaded from the raw-data-only comparison sheet),
    ``temp_order`` and ``rec_codes``.

    Decision order:
      1. ``row['fraction'] >= report_threshold``             -> code 1
      2. raw-data fraction ``>= raw_supply_threshold``       -> code 2
      3. raw-data fraction ``>= raw_record_threshold``       -> code 3
      4. no parent parameter                                 -> code 0
      5. best sibling in ``param_df`` has a temporality
         different from ``row['m_temp']``                    -> code 4
      6. best sibling in ``rd_param_df`` has a temporality
         different from ``row['m_temp']``                    -> code 5
      7. otherwise                                           -> code 0

    Parameters
    ----------
    row : pandas.Series
        Must provide 'fraction', 'parent_parameter' and 'm_temp'. ``row.name``
        is used as the lookup label in ``rd_param_df``.
    report_threshold, raw_supply_threshold, raw_record_threshold : float
        Fraction cut-offs for codes 1, 2 and 3. The defaults are the values the
        notebook has always used (presumably counts out of 32 reports and
        7 raw data sets -- TODO confirm).

    Returns
    -------
    pandas.Series
        The same ``row`` object with 'rec_code' and 'recommendation' set.

    Raises
    ------
    KeyError
        If ``row.name`` is not in ``rd_param_df.index`` (only reached when the
        report fraction is below ``report_threshold``).

    Notes
    -----
    For codes 4 and 5 the formatted message is also stored back into
    ``rec_codes[4]`` / ``rec_codes[5]``, as this function has always done.
    """
    def _finish(code):
        row['rec_code'] = code
        row['recommendation'] = rec_codes[code]
        return row

    # Recorded often enough in the reports: ask for it to always be recorded.
    if row['fraction'] >= report_threshold:
        return _finish(1)

    # Otherwise look at how often the same parameter occurs in the raw data.
    raw_fraction = rd_param_df.loc[row.name]['fraction']
    if raw_fraction >= raw_supply_threshold:
        return _finish(2)
    if raw_fraction >= raw_record_threshold:
        return _finish(3)

    parent_param = row['parent_parameter']
    # pd.isna, not "is np.nan": a NaN read from a CSV is not guaranteed to be
    # the np.nan singleton, so an identity test can miss it.
    if pd.isna(parent_param):
        return _finish(0)

    required_temp = row['m_temp']

    # Is a sibling parameter recorded in the reports at a different temporality?
    report_temp = _best_parent_temp(param_df, parent_param)
    if report_temp is not None and required_temp != report_temp:
        rec_codes[4] = f'recorded as {report_temp} in report, should be {required_temp}'
        return _finish(4)

    # Reports give no usable answer (no sibling, or same temporality):
    # repeat the check against the raw data.
    raw_temp = _best_parent_temp(rd_param_df, parent_param)
    if raw_temp is not None and required_temp != raw_temp:
        rec_codes[5] = f'recorded as {raw_temp} in raw data, should be {required_temp}, supply raw data'
        return _finish(5)

    return _finish(0)

In [13]:
# Columns that together identify one recommendation row in rec_df
merge_cols = ['universal_term', 'domain', 'temporality', 'situational',
              'importance', 'rec_code', 'recommendation']

# loop through models
for model in models:
    print(model)
    # rec_code() looks up param_df and rd_param_df as module-level names,
    # so both are rebound for every model before it is applied.
    param_df = pd.read_csv(
        f'scoring_sheets/reviewed/{model}_comparison_reviewed.csv'
    ).set_index('universal_term')
    rd_param_df = pd.read_csv(
        f'scoring_sheets/reviewed/raw_data_only_{model}_comparison_reviewed.csv'
    ).set_index('universal_term')

    # keep only rows whose score is 2
    fill_df = param_df[param_df['score'] == 2]

    # within each parent parameter keep a single row:
    # the first one that holds the highest fraction
    for parent_param in fill_df['parent_parameter'].dropna().unique():
        parent_df = fill_df[fill_df['parent_parameter'] == parent_param]
        max_fraction = parent_df['fraction'].max()
        below_max = parent_df.index[parent_df['fraction'] < max_fraction]
        surplus_ties = parent_df.index[parent_df['fraction'] == max_fraction][1:]
        fill_df = fill_df.drop(index=below_max.append(surplus_ties))

    # drop rows with a fraction of 0.75 or greater and add the two result columns;
    # assign() returns a new frame, so the filtered slice is never written to
    fill_df = fill_df[fill_df['fraction'] < 0.75].assign(rec_code=None, recommendation=None)

    # compute the recommendation for every remaining row
    model_recs = fill_df.apply(rec_code, axis=1)

    # report each row under its parent parameter when it has one,
    # otherwise under its own universal term
    model_recs['universal_term'] = model_recs.index
    model_recs['universal_term'] = model_recs['universal_term'].where(
        model_recs['parent_parameter'].isnull(), model_recs['parent_parameter'])
    model_recs = (
        model_recs
        .reset_index(drop=True)
        # m_temp is kept (renamed below); the other scoring columns are dropped
        .drop(columns=['parent_parameter', 'fraction', 't_temp', 'score', 'm_param', 't_param'])
        .rename(columns={'m_temp': 'temporality'})
    )

    # Match this model's rows against rec_df. rec_df's own row labels are carried
    # through the merge as '_rec_idx': the index of `merged` only numbers this
    # model's rows and must not be used to address rec_df.
    rec_keys = rec_df[merge_cols].rename_axis('_rec_idx').reset_index()
    merged = model_recs.merge(rec_keys, on=merge_cols, how='left', indicator=True)
    is_new = merged['_merge'] == 'left_only'

    # Rows that already exist (added for an earlier model): flag them for this model
    existing_idx = merged.loc[~is_new, '_rec_idx'].astype('int64').unique()
    if len(existing_idx) > 0:
        rec_df.loc[existing_idx, model] = True

    # Rows not seen before: append them with this model flagged and every other
    # model explicitly False. Flagging before the concat avoids slicing with
    # index[-len(new_rows):], which selects ALL rows when there are no new rows.
    new_rows = merged.loc[is_new, merge_cols].assign(
        **{name: name == model for name in models})
    if not new_rows.empty:
        rec_df = pd.concat([rec_df, new_rows], ignore_index=True)

aquacrop
dssat
genericdescriptive
rodekoning
rotomgro
stics


In [14]:
# Preview the combined recommendations table (pandas truncates long frames when printing)
print(rec_df)

                           universal_term                      domain  \
0    aboveground biomass on sampling date  structural characteristics   
1                            canopy cover  structural characteristics   
2                          flowering date  structural characteristics   
3                                maturity  structural characteristics   
4                         harvested yield                       yield   
..                                    ...                         ...   
133                    trellis dimensions                     trellis   
134                       tunnel material                      tunnel   
135                   tunnel opening date                      tunnel   
136   vegetative biomass on sampling date  structural characteristics   
137                    water stress index  structural characteristics   

    temporality situational importance rec_code  \
0          date          no   critical        0   
1          date      

In [15]:
# Add an empty Notes column, then order the rows by
# domain, universal_term, importance and situational (in that priority)
rec_df = rec_df.assign(Notes=None).sort_values(
    by=['domain', 'universal_term', 'importance', 'situational'])

In [16]:
# Write the preliminary recommendations to CSV, leaving out the row index
preliminary_csv = 'recommendations/preliminary_recommendations.csv'
rec_df.to_csv(preliminary_csv, index=False)

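Manually inspect the recommendations that require individual inspection and write a specific recommendation for each one. The cells below read the reviewed results from `recommendations/reviewed_recommendations.csv`.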

In [17]:
# Load the reviewed recommendations, keeping only the columns used below
cols = ['universal_term', 'domain', 'temporality', 'situational', 'rec_code', 'recommendation']
rec_df = pd.read_csv('recommendations/reviewed_recommendations.csv', usecols=cols)
# info() prints its summary itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output
rec_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   universal_term  138 non-null    object
 1   domain          138 non-null    object
 2   temporality     138 non-null    object
 3   situational     138 non-null    object
 4   rec_code        138 non-null    int64 
 5   recommendation  138 non-null    object
dtypes: int64(1), object(5)
memory usage: 6.6+ KB
None


In [18]:
# Remove identical rows (rebinding instead of inplace=True; original row labels are kept)
rec_df = rec_df.drop_duplicates()
# info() prints its summary itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output
rec_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 111 entries, 0 to 137
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   universal_term  111 non-null    object
 1   domain          111 non-null    object
 2   temporality     111 non-null    object
 3   situational     111 non-null    object
 4   rec_code        111 non-null    int64 
 5   recommendation  111 non-null    object
dtypes: int64(1), object(5)
memory usage: 6.1+ KB
None


In [19]:
# Write the de-duplicated recommendations to CSV, leaving out the row index
final_csv = 'recommendations/final_recommendations.csv'
rec_df.to_csv(final_csv, index=False)

In [20]:
# Count how often each recommendation occurs within each domain.
# Rows are the distinct recommendations and columns the distinct domains,
# both in order of first appearance in rec_df.
domains = rec_df['domain'].unique()
recommendations = rec_df['recommendation'].unique()
rec_sum = pd.DataFrame(index=recommendations, columns=domains)
for domain in domains:
    for recommendation in recommendations:
        matches = (rec_df['domain'] == domain) & (rec_df['recommendation'] == recommendation)
        rec_sum.loc[recommendation, domain] = int(matches.sum())


In [21]:
# Show the recommendation-by-domain count table
print(rec_sum)

                                                structural characteristics  \
destructive, use another metric                                         18   
not recorded, no barrier to recording                                    9   
beyond scope, use literature values                                      4   
always record in report                                                  0   
always record in raw data, supply raw data                               0   
supply raw data                                                          0   
requires specialized equipment                                           3   
recorded as frequency in report, should be date                          0   
too costly, use another metric                                           1   
laborious, use another metric                                            4   

                                                ground cover soil  \
destructive, use another metric                            0    0   
not

In [22]:
# Write the count table to CSV; the index (recommendation text) is kept as the first column
summary_csv = 'recommendations/final_recommendation_summary.csv'
rec_sum.to_csv(summary_csv)