# Raw Data Processing
This notebook contains processing for the subset of trials with raw data available.

In [1]:
# Import packages
import numpy as np
import pandas as pd

In [2]:
# Read the raw-data table and discard the 'Temporality' column in one step
rdf = pd.read_csv('raw_vt_data.csv').drop(columns='Temporality')
rdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91 entries, 0 to 90
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Attribute  91 non-null     object 
 1   Entity     91 non-null     object 
 2   Trial 009  3 non-null      float64
 3   Trial 011  2 non-null      float64
 4   Trial 023  2 non-null      float64
 5   Trial 048  1 non-null      float64
 6   Trial 066  23 non-null     float64
 7   Trial 081  6 non-null      float64
 8   Trial 094  10 non-null     float64
dtypes: float64(7), object(2)
memory usage: 6.5+ KB


In [3]:
# Keep only the rows that hold a value in at least one trial column
trial_cols = rdf.columns[2:]  # every column after the first two
rdf = rdf.dropna(how='all', subset=trial_cols)
rdf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, 2 to 81
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Attribute  32 non-null     object 
 1   Entity     32 non-null     object 
 2   Trial 009  3 non-null      float64
 3   Trial 011  2 non-null      float64
 4   Trial 023  2 non-null      float64
 5   Trial 048  1 non-null      float64
 6   Trial 066  23 non-null     float64
 7   Trial 081  6 non-null      float64
 8   Trial 094  10 non-null     float64
dtypes: float64(7), object(2)
memory usage: 2.5+ KB


In [4]:
# Read the processed trial-report codes table
tdf = pd.read_csv(filepath_or_buffer='codes_all_processed.csv')
tdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82 entries, 0 to 81
Data columns (total 34 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   code       82 non-null     object 
 1   trial_001  82 non-null     float64
 2   trial_006  82 non-null     float64
 3   trial_008  82 non-null     float64
 4   trial_009  82 non-null     float64
 5   trial_011  82 non-null     float64
 6   trial_013  82 non-null     float64
 7   trial_015  82 non-null     float64
 8   trial_016  82 non-null     float64
 9   trial_018  82 non-null     float64
 10  trial_023  82 non-null     float64
 11  trial_026  82 non-null     float64
 12  trial_034  82 non-null     float64
 13  trial_036  82 non-null     float64
 14  trial_040  82 non-null     float64
 15  trial_044  82 non-null     float64
 16  trial_045  82 non-null     float64
 17  trial_048  82 non-null     float64
 18  trial_052  82 non-null     float64
 19  trial_055  82 non-null     float64
 20  trial_059  8

In [5]:
# Normalise column names: lower case, underscores instead of spaces
rdf.columns = rdf.columns.str.lower().str.replace(' ', '_')
rdf = rdf.rename(columns={'attribute': 'code'})

# Normalise entity labels: trim, underscores for spaces, no commas, and
# 'structural_characteristics' rewritten as 'structure_characteristics'
entity_clean = rdf['entity'].str.strip()
entity_clean = entity_clean.str.replace(' ', '_')
entity_clean = entity_clean.str.replace(',', '')
entity_clean = entity_clean.str.replace('structural_characteristics', 'structure_characteristics')
rdf['entity'] = entity_clean

rdf.info()


<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, 2 to 81
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   code       32 non-null     object 
 1   entity     32 non-null     object 
 2   trial_009  3 non-null      float64
 3   trial_011  2 non-null      float64
 4   trial_023  2 non-null      float64
 5   trial_048  1 non-null      float64
 6   trial_066  23 non-null     float64
 7   trial_081  6 non-null      float64
 8   trial_094  10 non-null     float64
dtypes: float64(7), object(2)
memory usage: 2.5+ KB


In [6]:
# Restrict tdf to the columns it shares with rdf
common_columns = tdf.columns.intersection(rdf.columns)
tdf = tdf.loc[:, common_columns]

# Separate copy for merging, in which every 0 is turned into NaN
tdf_merge = tdf.replace(to_replace=0, value=np.nan)

# Show both frames: the unchanged selection first, then the NaN-masked copy
for frame in (tdf, tdf_merge):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82 entries, 0 to 81
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   code       82 non-null     object 
 1   trial_009  82 non-null     float64
 2   trial_011  82 non-null     float64
 3   trial_023  82 non-null     float64
 4   trial_048  82 non-null     float64
 5   trial_066  82 non-null     float64
 6   trial_081  82 non-null     float64
 7   trial_094  82 non-null     float64
 8   entity     82 non-null     object 
dtypes: float64(7), object(2)
memory usage: 5.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82 entries, 0 to 81
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   code       82 non-null     object 
 1   trial_009  37 non-null     float64
 2   trial_011  37 non-null     float64
 3   trial_023  43 non-null     float64
 4   trial_048  43 non-null     float64
 5   trial_066  36 non-n

In [7]:
# Index all three frames by code so they align row-wise
tdf = tdf.set_index('code')
tdf_merge = tdf_merge.set_index('code')
rdf = rdf.set_index('code')

# Merge: rdf values are kept wherever they are non-null, and the gaps are
# filled from tdf_merge (whose zeros were replaced by NaN beforehand)
df = rdf.combine_first(tdf_merge)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93 entries, bed dimensions to within-row spacing
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   entity     93 non-null     object 
 1   trial_009  40 non-null     float64
 2   trial_011  39 non-null     float64
 3   trial_023  45 non-null     float64
 4   trial_048  44 non-null     float64
 5   trial_066  58 non-null     float64
 6   trial_081  50 non-null     float64
 7   trial_094  53 non-null     float64
dtypes: float64(7), object(1)
memory usage: 6.5+ KB


In [10]:
# Fill every remaining NaN with 0 and show each frame right after filling it
df = df.fillna(value=0)
df.info()
tdf = tdf.fillna(value=0)
tdf.info()
rdf = rdf.fillna(value=0)
rdf.info()

# Write the three frames to CSV, keeping the code index as the first column
for frame, filename in (
    (tdf, 'raw_trials_codes_all_processed.csv'),
    (df, 'raw_data_codes_all_processed.csv'),
    (rdf, 'raw_data_only_codes_all_processed.csv'),
):
    frame.to_csv(filename, index=True)

<class 'pandas.core.frame.DataFrame'>
Index: 93 entries, bed dimensions to within-row spacing
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   entity     93 non-null     object 
 1   trial_009  93 non-null     float64
 2   trial_011  93 non-null     float64
 3   trial_023  93 non-null     float64
 4   trial_048  93 non-null     float64
 5   trial_066  93 non-null     float64
 6   trial_081  93 non-null     float64
 7   trial_094  93 non-null     float64
dtypes: float64(7), object(1)
memory usage: 6.5+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 82 entries, bed dimensions to harvested yield
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   trial_009  82 non-null     float64
 1   trial_011  82 non-null     float64
 2   trial_023  82 non-null     float64
 3   trial_048  82 non-null     float64
 4   trial_066  82 non-null     float64
 5   trial_081  82 

In [11]:
# Read the trial directory (all columns as strings) and keep only the rows
# that have a value in the Sample column, i.e. drop the unsampled trials
trdf = pd.read_csv(
    'trials.csv',
    usecols=['Trial ID', 'Sample', 'Species', 'Trial Type'],
    dtype=str,
).dropna(subset=['Sample'])
trdf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, 0 to 93
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Trial ID    32 non-null     object
 1   Sample      32 non-null     object
 2   Species     32 non-null     object
 3   Trial Type  32 non-null     object
dtypes: object(4)
memory usage: 1.2+ KB


In [12]:
# Define function to create summary dataframes
def _fraction_for_trials(codes, trdf, column, keyword):
    """Return, per code, the mean value over the trials whose `column` contains `keyword`.

    Parameters
    ----------
    codes : pandas.DataFrame
        One row per code, one column per trial. The second '_'-separated piece
        of each column name is compared with the 'Trial ID' strings of `trdf`.
    trdf : pandas.DataFrame
        Trial directory with a 'Trial ID' column and the column named `column`.
    column : str
        Name of the `trdf` column to search.
    keyword : str
        Literal substring to look for in `column`.

    Returns
    -------
    pandas.Series
        Row sums of the matching trial columns divided by the number of
        matching columns, rounded to 2 decimals; all NaN when nothing matches.
    """
    # Literal substring match; rows with a missing value in `column` never match
    # (without na=False a missing value makes the boolean mask unusable).
    matches = trdf[column].str.contains(keyword, regex=False, na=False)
    trial_ids = set(trdf.loc[matches, 'Trial ID'])

    selected = []
    for col in codes.columns:
        parts = col.split('_')
        # Columns without an underscore cannot carry a trial id; skip them
        if len(parts) > 1 and parts[1] in trial_ids:
            selected.append(col)

    if not selected:
        # No trial in this subset: the fraction is undefined
        return pd.Series(float('nan'), index=codes.index, dtype='float64')
    return (codes[selected].sum(axis=1) / len(selected)).round(2)


def summarize(dfi, trdf, species=('Pepper', 'Tomato', 'Watermelon'),
              trial_types=('Management', 'Variety')):
    """Summarise, per code, the mean value across all trials and across trial subsets.

    The result is a fraction of trials only if the trial columns hold 0/1
    indicators -- NOTE(review): confirm this for every frame passed in.

    Parameters
    ----------
    dfi : pandas.DataFrame
        Indexed by code, with an 'entity' column and one numeric column per trial.
    trdf : pandas.DataFrame
        Trial directory with 'Trial ID', 'Species' and 'Trial Type' columns.
    species : iterable of str, optional
        Substrings searched in the 'Species' column; one output column each.
    trial_types : iterable of str, optional
        Substrings searched in the 'Trial Type' column; one output column each.

    Returns
    -------
    pandas.DataFrame
        Columns 'All_Trials', one per entry of `species`, one per entry of
        `trial_types`, then 'entity'; numeric values are rounded to 2 decimals.
    """
    codes = dfi.drop(columns=['entity'])

    # Mean over every trial column
    sum_df = (codes.sum(axis=1) / codes.shape[1]).round(2).to_frame(name='All_Trials')

    # Mean over the trials of each species, then of each trial type
    for s in species:
        sum_df[s] = _fraction_for_trials(codes, trdf, 'Species', s)
    for t in trial_types:
        sum_df[t] = _fraction_for_trials(codes, trdf, 'Trial Type', t)

    # sum_df has the same rows in the same order as dfi, so the entity labels
    # can be copied over positionally
    sum_df['entity'] = dfi['entity'].to_numpy()
    return sum_df

In [13]:
# Summary of the merged (raw data + trial report) frame
df_sum = summarize(dfi=df, trdf=trdf)
df_sum.info()
# Summary of the trial-report frame
tdf_sum = summarize(dfi=tdf, trdf=trdf)
tdf_sum.info()
# Summary of the raw-data-only frame
rdf_sum = summarize(dfi=rdf, trdf=trdf)
rdf_sum.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93 entries, bed dimensions to within-row spacing
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   All_Trials  93 non-null     float64
 1   Pepper      93 non-null     float64
 2   Tomato      93 non-null     float64
 3   Watermelon  93 non-null     float64
 4   Management  93 non-null     float64
 5   Variety     93 non-null     float64
 6   entity      93 non-null     object 
dtypes: float64(6), object(1)
memory usage: 7.9+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 82 entries, bed dimensions to harvested yield
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   All_Trials  82 non-null     float64
 1   Pepper      82 non-null     float64
 2   Tomato      82 non-null     float64
 3   Watermelon  82 non-null     float64
 4   Management  82 non-null     float64
 5   Variety     82 non-null     float64
 6

In [14]:
# Write each summary frame to its own CSV, keeping the code index as the first column
for summary, filename in (
    (df_sum, 'raw_data_codes_summary_processed.csv'),
    (tdf_sum, 'raw_trials_codes_summary_processed.csv'),
    (rdf_sum, 'raw_data_only_codes_summary_processed.csv'),
):
    summary.to_csv(filename, index=True)