# Parameter Prevalence
This notebook calculates the prevalence of the model parameters in the variety trial corpus.

In [1]:
# Import libraries
import numpy as np
import pandas as pd

In [2]:
# Identifiers of the models covered by this notebook
models = [
    'aquacrop',
    'dssat',
    'genericdescriptive',
    'rodekoning',
    'rotomgro',
    'stics',
]

# Temporal precision score for each temporality label (5 = 'time' ... 1 = 'static')
temp_order = dict(time=5, date=4, frequency=3, range=2, static=1)


In [3]:
def process_models(model):
    """Load one model's reviewed scoring sheet and reduce it to one row per parameter.

    Reads ``scoring_sheets/reviewed/composite_{model}_comparison_reviewed.csv``
    (relative to the current working directory) and then:

    1. keeps only the rows whose ``score`` equals 2;
    2. drops the ``t_temp``, ``score``, ``m_param``, ``t_param``,
       ``situational`` and ``importance`` columns;
    3. for every ``universal_term`` keeps the row with the highest ``fraction``;
    4. for every non-null ``parent_parameter`` keeps only the row with the
       highest ``fraction``;
    5. replaces ``universal_term`` with ``parent_parameter`` wherever a parent
       is set;
    6. drops ``parent_parameter``, renames ``m_temp`` to ``temporality`` and
       renumbers the rows from 0.

    Ties in ``fraction`` (steps 3 and 4) are resolved in favour of the row that
    appears first in the sheet.

    Parameters
    ----------
    model : str
        Model identifier interpolated into the CSV file name.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of the sheet (with ``m_temp`` renamed to
        ``temporality``) on a fresh 0..n-1 index.
    """
    param_df = pd.read_csv(f'scoring_sheets/reviewed/composite_{model}_comparison_reviewed.csv')

    fill_df = (
        param_df[param_df['score'] == 2]
        .drop(columns=['t_temp', 'score', 'm_param', 't_param', 'situational', 'importance'])
        # mergesort is stable, so rows with equal fraction keep their sheet order and
        # drop_duplicates deterministically keeps the earliest of the tied rows
        .sort_values('fraction', ascending=False, kind='mergesort')
        .drop_duplicates('universal_term')
        .sort_index()
    )

    # Collapse each parent-parameter group to a single row. Rows without a parent
    # (NaN) do not form a group and are left untouched.
    for parent_param in fill_df['parent_parameter'].dropna().unique():
        parent_df = fill_df[fill_df['parent_parameter'] == parent_param]
        max_fraction = parent_df['fraction'].max()
        below_max = parent_df[parent_df['fraction'] < max_fraction].index
        # on a tie for the maximum, keep the first row and discard the others
        tied_extra = parent_df[parent_df['fraction'] == max_fraction].index[1:]
        fill_df = fill_df.drop(below_max.union(tied_extra))

    # Rows that have a parent parameter are reported under the parent's name
    reported_term = fill_df['universal_term'].where(
        fill_df['parent_parameter'].isnull(), fill_df['parent_parameter']
    )
    return (
        fill_df.assign(universal_term=reported_term)
        .drop(columns=['parent_parameter'])
        .rename(columns={'m_temp': 'temporality'})
        .reset_index(drop=True)
    )

In [4]:
# Run process_models once per model, in the order listed, and bind each result
# to its own <model>_df variable.
(
    dssat_df,
    aquacrop_df,
    genericdescriptive_df,
    rodekoning_df,
    rotomgro_df,
    stics_df,
) = (
    process_models(model_name)
    for model_name in (
        'dssat',
        'aquacrop',
        'genericdescriptive',
        'rodekoning',
        'rotomgro',
        'stics',
    )
)

In [5]:
# Processed per-model frames keyed by model name; the insertion order determines
# the order of the model columns added to prev_df below.
processed_dfs = {
    'dssat': dssat_df,
    'aquacrop': aquacrop_df,
    'genericdescriptive': genericdescriptive_df,
    'rodekoning': rodekoning_df,
    'rotomgro': rotomgro_df,
    'stics': stics_df,
}
# Columns that together identify one row of the prevalence table
key_cols = ['universal_term', 'domain', 'temporality', 'fraction']

# Step 1: stack the key columns of every model and keep each distinct combination once
all_rows = pd.concat([df[key_cols] for df in processed_dfs.values()], ignore_index=True)
prev_df = all_rows.drop_duplicates().reset_index(drop=True)

# Step 2: add one column per model holding True where the row's key combination
# occurs in that model's frame and NaN where it does not.
for model, model_df in processed_dfs.items():
    # A left merge against the model's de-duplicated keys returns exactly one row
    # per prev_df row, in prev_df order; `_merge == 'both'` marks the rows found in
    # the model. merge() matches null keys against each other, so rows with a
    # missing temporality are handled the same way as all other rows.
    membership = prev_df[key_cols].merge(
        model_df[key_cols].drop_duplicates(),
        on=key_cols,
        how='left',
        indicator=True,
    )
    present = (membership['_merge'] == 'both').to_numpy()

    # Build the column as object dtype from the start so True and NaN can coexist;
    # writing True into a float NaN column relies on an upcast that newer pandas
    # versions deprecate.
    flags = pd.Series(np.nan, index=prev_df.index, dtype=object)
    flags[present] = True
    prev_df[model] = flags

In [6]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appends a stray "None" line.
prev_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   universal_term      129 non-null    object 
 1   domain              129 non-null    object 
 2   temporality         121 non-null    object 
 3   fraction            129 non-null    float64
 4   dssat               88 non-null     object 
 5   aquacrop            32 non-null     object 
 6   genericdescriptive  6 non-null      object 
 7   rodekoning          8 non-null      object 
 8   rotomgro            8 non-null      object 
 9   stics               62 non-null     object 
dtypes: float64(1), object(9)
memory usage: 10.2+ KB
None


In [None]:
# Write the prevalence table built above to CSV, without the row index.
# NOTE(review): the following cell reloads from 'prevalence/final_inspected/',
# not from this path - presumably the file is inspected and copied there by
# hand between the two cells; confirm.
prev_df.to_csv('prevalence/composite_param_prevalence.csv', index=False)

In [8]:
# Replace the in-memory prev_df with the table stored under 'final_inspected';
# every later cell works on this reloaded copy.
# NOTE(review): this is a different path from the one the save cell writes to -
# presumably a manually inspected version of that file; confirm.
prev_df = pd.read_csv('prevalence/final_inspected/composite_param_prevalence.csv')

In [9]:
# Empty frame with one column per distinct domain, in order of first appearance
domain_df = pd.DataFrame(columns=prev_df['domain'].unique())
# info() prints its own report and returns None, so it is not wrapped in print()
domain_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   structural characteristics  0 non-null      object
 1   ground cover                0 non-null      object
 2   soil                        0 non-null      object
 3   field history               0 non-null      object
 4   fertilizer                  0 non-null      object
 5   weeds, pests, diseases      0 non-null      object
 6   transplanting               0 non-null      object
 7   harvesting                  0 non-null      object
 8   yield                       0 non-null      object
 9   irrigation                  0 non-null      object
 10  location                    0 non-null      object
 11  spacing                     0 non-null      object
 12  pruning                     0 non-null      object
 13  seeding                     0 non-null      object
 14  plot  

In [10]:
# Number of distinct universal terms in each domain
n_params = prev_df.groupby('domain')['universal_term'].nunique()
print(n_params)
# Total of the per-domain counts. Stored as total_params rather than `sum` so the
# built-in sum() is not shadowed for the rest of the kernel session.
total_params = n_params.sum()
print(total_params)

domain
fertilizer                     7
field history                  5
ground cover                   2
harvesting                     3
irrigation                     5
location                       1
plot                           1
pruning                        2
seeding                        3
soil                          26
spacing                        3
structural characteristics    36
tillage                        3
transplanting                  7
trellis                        1
tunnel                         3
weeds, pests, diseases         7
yield                          3
Name: universal_term, dtype: int64
118


In [11]:
# Per-domain summary metrics; each is a Series indexed by domain.
by_domain = prev_df.groupby('domain')
# Flag "fraction is exactly 0" per row, grouped by the same domain labels
zero_by_domain = prev_df['fraction'].eq(0).groupby(prev_df['domain'])

n_params = by_domain['universal_term'].nunique()  # distinct universal terms
f_params = by_domain['fraction'].mean()           # mean fraction
n_0_params = zero_by_domain.sum()                 # number of rows with fraction == 0
f_0_params = zero_by_domain.mean()                # share of rows with fraction == 0

metrics = ['n_params', 'f_params', 'n_0_params', 'f_0_params']
metric_values = [n_params, f_params, n_0_params, f_0_params]

# Assemble the table in one step: one row per metric, one column per domain.
# Columns are put in order of first appearance in prev_df, and the 'domain'
# axis label left behind by groupby is removed.
domain_df = (
    pd.DataFrame(dict(zip(metrics, metric_values)))
    .T
    .reindex(columns=prev_df['domain'].unique())
    .rename_axis(columns=None)
)

# Record which metric each row holds, then replace the metric-name index with 0..3
domain_df['metric'] = metrics
domain_df = domain_df.reset_index(drop=True)

In [12]:
# Plain-text dump of the per-domain summary (one row per metric, one column per domain)
print(domain_df)

   structural characteristics  ground cover       soil  field history  \
0                   36.000000           2.0  26.000000          5.000   
1                    0.008649           0.0   0.063077          0.028   
2                   33.000000           2.0  21.000000          4.000   
3                    0.891892           1.0   0.807692          0.800   

   fertilizer  weeds, pests, diseases  transplanting  harvesting     yield  \
0    7.000000                7.000000       7.000000    3.000000  3.000000   
1    0.638571                0.157143       0.197143    0.363333  0.523333   
2    2.000000                2.000000       5.000000    1.000000  1.000000   
3    0.285714                0.285714       0.714286    0.333333  0.333333   

   irrigation  location   spacing  pruning  seeding  plot  tillage  trellis  \
0       5.000      1.00  3.000000     2.00   3.0000   1.0      3.0     1.00   
1       0.332      0.94  0.530000     0.08   0.2275   0.0      0.0     0.03   
2     

In [13]:
# Save the per-domain summary to CSV; the 0..3 row index is omitted because the
# 'metric' column already labels each row.
domain_df.to_csv('prevalence/final_inspected/composite_prevalence_summary.csv', index=False)

In [14]:
# Distinct universal terms per domain, counted over the 'static' rows only
static_df = prev_df.loc[prev_df['temporality'].eq('static')]
n_static_params = static_df.groupby('domain')['universal_term'].nunique()

# The same count over the 'date' rows only
date_df = prev_df.loc[prev_df['temporality'].eq('date')]
n_date_params = date_df.groupby('domain')['universal_term'].nunique()

# Labels for the two count columns
cols = ['n_static_params', 'n_date_params']

# Place both counts side by side, aligned on domain; a domain that is missing
# from one of the counts gets NaN in that column.
static_date_df = pd.concat([n_static_params, n_date_params], axis=1, keys=cols)

In [15]:
# A NaN here means the domain has no term of that temporality: count it as 0
# and store the counts as integers.
static_date_df = static_date_df.fillna(0).astype(int)
print(static_date_df)

                            n_static_params  n_date_params
domain                                                    
fertilizer                                6              1
field history                             5              0
ground cover                              2              0
harvesting                                2              1
irrigation                                4              1
location                                  1              0
plot                                      1              0
pruning                                   1              1
seeding                                   1              3
soil                                     22              4
spacing                                   3              0
structural characteristics                1             36
tillage                                   2              1
transplanting                             5              2
trellis                                   1             

In [16]:
# Save the static/date counts to CSV. The index is written too (no index=False),
# so the domain labels end up as the first column of the file.
static_date_df.to_csv('prevalence/final_inspected/composite_temp_params_summary.csv')

In [17]:
# Per-temporality summary metrics; each is a Series indexed by temporality.
by_temporality = prev_df.groupby('temporality')
# Flag "fraction is exactly 0" per row, grouped by the same temporality labels
zero_by_temporality = prev_df['fraction'].eq(0).groupby(prev_df['temporality'])

n_params = by_temporality['universal_term'].nunique()  # distinct universal terms
f_params = by_temporality['fraction'].mean()           # mean fraction
n_0_params = zero_by_temporality.sum()                 # number of rows with fraction == 0
f_0_params = zero_by_temporality.mean()                # share of rows with fraction == 0

metrics = ['n_params', 'f_params', 'n_0_params', 'f_0_params']
metric_values = [n_params, f_params, n_0_params, f_0_params]

# Assemble the table in one step: one row per metric, one column per temporality.
# Columns are put in order of first appearance in prev_df, and the 'temporality'
# axis label left behind by groupby is removed.
temp_df = (
    pd.DataFrame(dict(zip(metrics, metric_values)))
    .T
    .reindex(columns=prev_df['temporality'].unique())
    .rename_axis(columns=None)
)

# Record which metric each row holds, then replace the metric-name index with 0..3
temp_df['metric'] = metrics
temp_df = temp_df.reset_index(drop=True)

In [18]:
# Plain-text dump of the per-temporality summary (one row per metric)
print(temp_df)

        date     static      metric
0  57.000000  62.000000    n_params
1   0.112069   0.176774    f_params
2  42.000000  41.000000  n_0_params
3   0.724138   0.661290  f_0_params


In [19]:
# Save the per-temporality summary to CSV; the 0..3 row index is omitted because
# the 'metric' column already labels each row.
temp_df.to_csv('prevalence/final_inspected/composite_temp_prevalence_summary.csv', index=False)