In [14]:
import os
import itertools

import numpy as np
import pandas as pd

In [15]:
# Generate parameter grid for the different analyses #####################################

# Expand grid function from dictionary of lists
def expand_grid(dict):
    """Build the full factorial grid described by a dictionary of lists.

    Parameters
    ----------
    dict : mapping
        Maps each column name to the iterable of values that column may take.
        NOTE: the name shadows the built-in ``dict``; it is kept unchanged so
        that existing callers are not affected.

    Returns
    -------
    pandas.DataFrame
        One row per element of the Cartesian product of the value lists and
        one column per key, in key order. Rows follow ``itertools.product``
        ordering, i.e. the last key varies fastest.
    """
    column_names = dict.keys()
    value_lists = dict.values()
    combinations = itertools.product(*value_lists)
    return pd.DataFrame.from_records(combinations, columns = column_names)

# Load data source for grid
Y_df = pd.read_csv(os.path.join("data", "pheno_original.csv"))
parts = pd.read_csv(os.path.join("data", "partitions.csv"))

# Traits list
# Selected by name instead of by column position, so that a reordered or
# extended phenotype file cannot silently change which traits are analysed.
traits = [
    "culm.diameter.1st.internode",
    "leaf.senescence",
    "grain.weight",
    "time.to.flowering.from.sowing",
]

# Fail early, with an explicit message, if the phenotype file lacks a trait
missing_traits = [trait for trait in traits if trait not in Y_df.columns]
if missing_traits:
    raise KeyError(f"Traits not found in pheno_original.csv: {missing_traits}")

In [16]:
# 1. top-markers-10k -------------------------------------------------------------------------
model_type = "top-markers-10k"

# 1.1. AroAdm ============================================================================
# Single-trait grid restricted to the "AroAdm" partition.
out_name = f"AroAdm_{model_type}.csv"

folds = ["AroAdm"]

# Every list below is one axis of the grid; the grid is their Cartesian product
grid_dict = {
    "model_type": [model_type],
    "architecture": ["MLP", "CNN"],
    "m_set": ["SNP", "all"],
    "traits": traits,
    "part": folds,
    "input_type": ["single-input"],
    "output_type": ["single-output"]
}

grid = expand_grid(grid_dict)
print(grid)

# to_csv does not create missing directories: make sure the output folder
# exists so the cell also runs on a fresh checkout
os.makedirs("parameters", exist_ok = True)
grid.to_csv(os.path.join("parameters", out_name), index = False)

         model_type architecture m_set                         traits    part  \
0   top-markers-10k          MLP   SNP    culm.diameter.1st.internode  AroAdm   
1   top-markers-10k          MLP   SNP                leaf.senescence  AroAdm   
2   top-markers-10k          MLP   SNP                   grain.weight  AroAdm   
3   top-markers-10k          MLP   SNP  time.to.flowering.from.sowing  AroAdm   
4   top-markers-10k          MLP   all    culm.diameter.1st.internode  AroAdm   
5   top-markers-10k          MLP   all                leaf.senescence  AroAdm   
6   top-markers-10k          MLP   all                   grain.weight  AroAdm   
7   top-markers-10k          MLP   all  time.to.flowering.from.sowing  AroAdm   
8   top-markers-10k          CNN   SNP    culm.diameter.1st.internode  AroAdm   
9   top-markers-10k          CNN   SNP                leaf.senescence  AroAdm   
10  top-markers-10k          CNN   SNP                   grain.weight  AroAdm   
11  top-markers-10k         

In [17]:
# 1.2. 10-fold-cv ========================================================================
# Relies on `model_type` as set in the section 1.1 cell.
out_name = f"10-fold-cv_{model_type}.csv"

# Every column of `parts` except "Accession" and "AroAdm" is used as a fold
folds = parts.columns.drop(["Accession", "AroAdm"]).to_list()

# Every list below is one axis of the grid; the grid is their Cartesian product
grid_dict = {
    "model_type": [model_type],
    "architecture": ["MLP", "CNN"],
    "m_set": ["SNP", "all"],
    "traits": traits,
    "part": folds,
    "input_type": ["single-input"],
    "output_type": ["single-output"]
}

grid = expand_grid(grid_dict)
print(grid)

# to_csv does not create missing directories: make sure the output folder
# exists so the cell also runs on a fresh checkout
os.makedirs("parameters", exist_ok = True)
grid.to_csv(os.path.join("parameters", out_name), index = False)

          model_type architecture m_set                         traits  \
0    top-markers-10k          MLP   SNP    culm.diameter.1st.internode   
1    top-markers-10k          MLP   SNP    culm.diameter.1st.internode   
2    top-markers-10k          MLP   SNP    culm.diameter.1st.internode   
3    top-markers-10k          MLP   SNP    culm.diameter.1st.internode   
4    top-markers-10k          MLP   SNP    culm.diameter.1st.internode   
..               ...          ...   ...                            ...   
155  top-markers-10k          CNN   all  time.to.flowering.from.sowing   
156  top-markers-10k          CNN   all  time.to.flowering.from.sowing   
157  top-markers-10k          CNN   all  time.to.flowering.from.sowing   
158  top-markers-10k          CNN   all  time.to.flowering.from.sowing   
159  top-markers-10k          CNN   all  time.to.flowering.from.sowing   

        part    input_type    output_type  
0     Fold.1  single-input  single-output  
1     Fold.2  single-in

In [18]:
# 2. kerPC -------------------------------------------------------------------------------
model_type = "kerPC"

# 2.1. AroAdm ============================================================================
# Single-trait grid restricted to the "AroAdm" partition.
out_name = f"AroAdm_{model_type}.csv"

folds = ["AroAdm"]

# Every list below is one axis of the grid; the grid is their Cartesian product
grid_dict = {
    "model_type": [model_type],
    "architecture": ["MLP"],
    "m_set": ["SNP", "all"],
    "traits": traits,
    "part": folds,
    "input_type": ["single-input", "multi-input"],
    "output_type": ["single-output"]
}

grid = expand_grid(grid_dict)

# No multi-input with SNP kernel case
# (rows keep their original index labels; the index is not written to the CSV)
grid = grid[~ ((grid.input_type == "multi-input") & (grid.m_set == "SNP"))]
print(grid)

# to_csv does not create missing directories: make sure the output folder
# exists so the cell also runs on a fresh checkout
os.makedirs("parameters", exist_ok = True)
grid.to_csv(os.path.join("parameters", out_name), index = False)

   model_type architecture m_set                         traits    part  \
0       kerPC          MLP   SNP    culm.diameter.1st.internode  AroAdm   
2       kerPC          MLP   SNP                leaf.senescence  AroAdm   
4       kerPC          MLP   SNP                   grain.weight  AroAdm   
6       kerPC          MLP   SNP  time.to.flowering.from.sowing  AroAdm   
8       kerPC          MLP   all    culm.diameter.1st.internode  AroAdm   
9       kerPC          MLP   all    culm.diameter.1st.internode  AroAdm   
10      kerPC          MLP   all                leaf.senescence  AroAdm   
11      kerPC          MLP   all                leaf.senescence  AroAdm   
12      kerPC          MLP   all                   grain.weight  AroAdm   
13      kerPC          MLP   all                   grain.weight  AroAdm   
14      kerPC          MLP   all  time.to.flowering.from.sowing  AroAdm   
15      kerPC          MLP   all  time.to.flowering.from.sowing  AroAdm   

      input_type    outp

In [19]:
# 2.2. 10-fold-cv ========================================================================
# Relies on `model_type` as set in the section 2.1 cell.
out_name = f"10-fold-cv_{model_type}.csv"

# Every column of `parts` except "Accession" and "AroAdm" is used as a fold
folds = parts.columns.drop(["Accession", "AroAdm"]).to_list()

# Every list below is one axis of the grid; the grid is their Cartesian product
grid_dict = {
    "model_type": [model_type],
    "architecture": ["MLP"],
    "m_set": ["SNP", "all"],
    "traits": traits,
    "part": folds,
    "input_type": ["single-input", "multi-input"],
    "output_type": ["single-output"]
}

grid = expand_grid(grid_dict)

# No multi-input with SNP kernel case
# (rows keep their original index labels; the index is not written to the CSV)
grid = grid[~ ((grid.input_type == "multi-input") & (grid.m_set == "SNP"))]
print(grid)

# to_csv does not create missing directories: make sure the output folder
# exists so the cell also runs on a fresh checkout
os.makedirs("parameters", exist_ok = True)
grid.to_csv(os.path.join("parameters", out_name), index = False)

    model_type architecture m_set                         traits     part  \
0        kerPC          MLP   SNP    culm.diameter.1st.internode   Fold.1   
2        kerPC          MLP   SNP    culm.diameter.1st.internode   Fold.2   
4        kerPC          MLP   SNP    culm.diameter.1st.internode   Fold.3   
6        kerPC          MLP   SNP    culm.diameter.1st.internode   Fold.4   
8        kerPC          MLP   SNP    culm.diameter.1st.internode   Fold.5   
..         ...          ...   ...                            ...      ...   
155      kerPC          MLP   all  time.to.flowering.from.sowing   Fold.8   
156      kerPC          MLP   all  time.to.flowering.from.sowing   Fold.9   
157      kerPC          MLP   all  time.to.flowering.from.sowing   Fold.9   
158      kerPC          MLP   all  time.to.flowering.from.sowing  Fold.10   
159      kerPC          MLP   all  time.to.flowering.from.sowing  Fold.10   

       input_type    output_type  
0    single-input  single-output  
2    

In [20]:
# 3. multi-output ------------------------------------------------------------------------

# Variable type of each trait, used below to group traits of the same type
target_dict = {
    "culm.diameter.1st.internode": "binary",
    "leaf.senescence": "binary",
    "grain.weight": "continuous",
    "time.to.flowering.from.sowing": "continuous"
}

# Lists of traits to predict jointly
trait_lists = []
# Pairs of traits are kept only when both members share one variable type
for comb in itertools.combinations(traits, 2):
    target_type_list = [target_dict[trait] for trait in comb]
    if len(set(target_type_list)) == 1:
        trait_lists.append(list(comb))
# Finally, the complete set of traits predicted together
trait_lists.append(traits)

print(trait_lists)

[['culm.diameter.1st.internode', 'leaf.senescence'], ['grain.weight', 'time.to.flowering.from.sowing'], ['culm.diameter.1st.internode', 'leaf.senescence', 'grain.weight', 'time.to.flowering.from.sowing']]


In [21]:
# 3.1 top-markers-10k ========================================================================
model_type = "top-markers-10k"
output_type = "multi-output"
input_type = "multi-input"

# AroAdm #################################################################################
# Multi-trait grid restricted to the "AroAdm" partition.
out_name = f"AroAdm_{model_type}_{input_type}_{output_type}.csv"

folds = ["AroAdm"]

# Every list below is one axis of the grid; the grid is their Cartesian product.
# Each entry of `trait_lists` is itself a list, so a "traits" cell holds a list.
grid_dict = {
    "model_type": [model_type],
    "architecture": ["MLP"],
    "m_set": ["SNP", "all"],
    "traits": trait_lists,
    "part": folds,
    "input_type": [input_type],
    "output_type": [output_type]
}

grid = expand_grid(grid_dict)
print(grid)

# to_csv does not create missing directories: make sure the output folder
# exists so the cell also runs on a fresh checkout
os.makedirs("parameters", exist_ok = True)
# NOTE: list-valued "traits" cells are written to the CSV as their string
# representation, so readers of the file have to parse them back
grid.to_csv(os.path.join("parameters", out_name), index = False)

        model_type architecture m_set  \
0  top-markers-10k          MLP   SNP   
1  top-markers-10k          MLP   SNP   
2  top-markers-10k          MLP   SNP   
3  top-markers-10k          MLP   all   
4  top-markers-10k          MLP   all   
5  top-markers-10k          MLP   all   

                                              traits    part   input_type  \
0     [culm.diameter.1st.internode, leaf.senescence]  AroAdm  multi-input   
1      [grain.weight, time.to.flowering.from.sowing]  AroAdm  multi-input   
2  [culm.diameter.1st.internode, leaf.senescence,...  AroAdm  multi-input   
3     [culm.diameter.1st.internode, leaf.senescence]  AroAdm  multi-input   
4      [grain.weight, time.to.flowering.from.sowing]  AroAdm  multi-input   
5  [culm.diameter.1st.internode, leaf.senescence,...  AroAdm  multi-input   

    output_type  
0  multi-output  
1  multi-output  
2  multi-output  
3  multi-output  
4  multi-output  
5  multi-output  


In [22]:
# 10-fold-cv #############################################################################
# Relies on `model_type`, `input_type` and `output_type` as set in the
# section 3.1 AroAdm cell.
out_name = f"10-fold-cv_{model_type}_{input_type}_{output_type}.csv"

# Every column of `parts` except "Accession" and "AroAdm" is used as a fold
folds = parts.columns.drop(["Accession", "AroAdm"]).to_list()

# Every list below is one axis of the grid; the grid is their Cartesian product.
# Each entry of `trait_lists` is itself a list, so a "traits" cell holds a list.
grid_dict = {
    "model_type": [model_type],
    "architecture": ["MLP"],
    "m_set": ["SNP", "all"],
    "traits": trait_lists,
    "part": folds,
    "input_type": [input_type],
    "output_type": [output_type]
}

grid = expand_grid(grid_dict)
print(grid)

# to_csv does not create missing directories: make sure the output folder
# exists so the cell also runs on a fresh checkout
os.makedirs("parameters", exist_ok = True)
# NOTE: list-valued "traits" cells are written to the CSV as their string
# representation, so readers of the file have to parse them back
grid.to_csv(os.path.join("parameters", out_name), index = False)

         model_type architecture m_set  \
0   top-markers-10k          MLP   SNP   
1   top-markers-10k          MLP   SNP   
2   top-markers-10k          MLP   SNP   
3   top-markers-10k          MLP   SNP   
4   top-markers-10k          MLP   SNP   
5   top-markers-10k          MLP   SNP   
6   top-markers-10k          MLP   SNP   
7   top-markers-10k          MLP   SNP   
8   top-markers-10k          MLP   SNP   
9   top-markers-10k          MLP   SNP   
10  top-markers-10k          MLP   SNP   
11  top-markers-10k          MLP   SNP   
12  top-markers-10k          MLP   SNP   
13  top-markers-10k          MLP   SNP   
14  top-markers-10k          MLP   SNP   
15  top-markers-10k          MLP   SNP   
16  top-markers-10k          MLP   SNP   
17  top-markers-10k          MLP   SNP   
18  top-markers-10k          MLP   SNP   
19  top-markers-10k          MLP   SNP   
20  top-markers-10k          MLP   SNP   
21  top-markers-10k          MLP   SNP   
22  top-markers-10k          MLP  

In [23]:
# 3.2 kerPC ==============================================================================
model_type = "kerPC"
output_type = "multi-output"

# AroAdm #################################################################################
# Multi-trait grid restricted to the "AroAdm" partition.
out_name = f"AroAdm_{model_type}_{output_type}.csv"

folds = ["AroAdm"]

# Every list below is one axis of the grid; the grid is their Cartesian product.
# Each entry of `trait_lists` is itself a list, so a "traits" cell holds a list.
grid_dict = {
    "model_type": [model_type],
    "architecture": ["MLP"],
    "m_set": ["SNP", "all"],
    "traits": trait_lists,
    "part": folds,
    "input_type": ["single-input", "multi-input"],
    "output_type": [output_type]
}

grid = expand_grid(grid_dict)

# No multi-input with SNP kernel case
# (rows keep their original index labels; the index is not written to the CSV)
grid = grid[~ ((grid.input_type == "multi-input") & (grid.m_set == "SNP"))]
print(grid)

# to_csv does not create missing directories: make sure the output folder
# exists so the cell also runs on a fresh checkout
os.makedirs("parameters", exist_ok = True)
# NOTE: list-valued "traits" cells are written to the CSV as their string
# representation, so readers of the file have to parse them back
grid.to_csv(os.path.join("parameters", out_name), index = False)

   model_type architecture m_set  \
0       kerPC          MLP   SNP   
2       kerPC          MLP   SNP   
4       kerPC          MLP   SNP   
6       kerPC          MLP   all   
7       kerPC          MLP   all   
8       kerPC          MLP   all   
9       kerPC          MLP   all   
10      kerPC          MLP   all   
11      kerPC          MLP   all   

                                               traits    part    input_type  \
0      [culm.diameter.1st.internode, leaf.senescence]  AroAdm  single-input   
2       [grain.weight, time.to.flowering.from.sowing]  AroAdm  single-input   
4   [culm.diameter.1st.internode, leaf.senescence,...  AroAdm  single-input   
6      [culm.diameter.1st.internode, leaf.senescence]  AroAdm  single-input   
7      [culm.diameter.1st.internode, leaf.senescence]  AroAdm   multi-input   
8       [grain.weight, time.to.flowering.from.sowing]  AroAdm  single-input   
9       [grain.weight, time.to.flowering.from.sowing]  AroAdm   multi-input   
10  [cu

In [24]:
# 10-fold-cv #############################################################################
# Relies on `model_type` and `output_type` as set in the section 3.2 AroAdm cell.
out_name = f"10-fold-cv_{model_type}_{output_type}.csv"

# Every column of `parts` except "Accession" and "AroAdm" is used as a fold
folds = parts.columns.drop(["Accession", "AroAdm"]).to_list()

# Every list below is one axis of the grid; the grid is their Cartesian product.
# Each entry of `trait_lists` is itself a list, so a "traits" cell holds a list.
grid_dict = {
    "model_type": [model_type],
    "architecture": ["MLP"],
    "m_set": ["SNP", "all"],
    "traits": trait_lists,
    "part": folds,
    "input_type": ["single-input", "multi-input"],
    "output_type": [output_type]
}

grid = expand_grid(grid_dict)

# No multi-input with SNP kernel case
# (rows keep their original index labels; the index is not written to the CSV)
grid = grid[~ ((grid.input_type == "multi-input") & (grid.m_set == "SNP"))]
print(grid)

# to_csv does not create missing directories: make sure the output folder
# exists so the cell also runs on a fresh checkout
os.makedirs("parameters", exist_ok = True)
# NOTE: list-valued "traits" cells are written to the CSV as their string
# representation, so readers of the file have to parse them back
grid.to_csv(os.path.join("parameters", out_name), index = False)

    model_type architecture m_set  \
0        kerPC          MLP   SNP   
2        kerPC          MLP   SNP   
4        kerPC          MLP   SNP   
6        kerPC          MLP   SNP   
8        kerPC          MLP   SNP   
..         ...          ...   ...   
115      kerPC          MLP   all   
116      kerPC          MLP   all   
117      kerPC          MLP   all   
118      kerPC          MLP   all   
119      kerPC          MLP   all   

                                                traits     part    input_type  \
0       [culm.diameter.1st.internode, leaf.senescence]   Fold.1  single-input   
2       [culm.diameter.1st.internode, leaf.senescence]   Fold.2  single-input   
4       [culm.diameter.1st.internode, leaf.senescence]   Fold.3  single-input   
6       [culm.diameter.1st.internode, leaf.senescence]   Fold.4  single-input   
8       [culm.diameter.1st.internode, leaf.senescence]   Fold.5  single-input   
..                                                 ...      ...      