# Integrating samples and colour datasets

In [16]:
library(data.table)
library(tidyverse)

In [17]:
## Load the per-sample metadata table; the first row of the file holds the column names
samples <- fread(
  input = '~/Phd/Projects/2021-snap_hap/sample_info/samples_Amajus_SnapHap_LastUpdate-2023-10.txt',
  header = TRUE
)
## Print a compact overview (dimensions, column names and types) for a quick sanity check
str(samples)

Classes ‘data.table’ and 'data.frame':	1074 obs. of  22 variables:
 $ PlantID_UPPER                : chr  "Z0147" "X4867" "Z2293" "Z0156" ...
 $ PlantID                      : chr  "z0147" "x4867" "z2293" "z0156" ...
 $ PlantID_LongFormat           : chr  "n96_Am_Pla_z0147" "n96_Am_Ave_x4867" "n96_Am_Pla_z2293" "n96_Am_Pla_z0156" ...
 $ PlantID_longFormat_refVersion: chr  "n96_Am_Pla_z0147_v3.5" "n96_Am_Ave_x4867_v3.5" "n96_Am_Pla_z2293_v3.5" "n96_Am_Pla_z0156_v3.5" ...
 $ Batch                        : chr  "n96-Pla" "n96-Ave" "n96-Pla" "n96-Pla" ...
 $ Batch_PlantID                : chr  "n96-Pla_Z0147" "n96-Ave_X4867" "n96-Pla_Z2293" "n96-Pla_Z0156" ...
 $ PlantID_Batch                : chr  "Z0147_n96-Pla" "X4867_n96-Ave" "Z2293_n96-Pla" "Z0156_n96-Pla" ...
 $ Species                      : chr  "Am" "Am" "Am" "Am" ...
 $ Location                     : chr  "Pla" "Ave" "Pla" "Pla" ...
 $ Coverage_mean                : num  0.472 0.514 0.52 0.537 0.545 ...
 $ Coverage_stdev         

In [18]:
## Read colour information
colour = fread('~/Phd/Projects/2021-snap_hap/sample_info/colour_info//IDs_colours_complete_20240314.csv', header=TRUE)
colour$Plant_ID = str_to_upper(colour$Plant_ID)
colnames(colour)[1] = 'PlantID_UPPER'
str(colour)

Classes ‘data.table’ and 'data.frame':	1068 obs. of  33 variables:
 $ PlantID_UPPER: chr  "PA0212" "PA0212" "PA0283" "PA0291" ...
 $ 2022_red     : num  NA NA NA NA NA NA NA NA 0.5 NA ...
 $ 2022_yellow  : num  NA NA NA NA NA NA NA NA 2.5 NA ...
 $ 2022_venation: num  NA NA NA NA NA NA NA NA 1 NA ...
 $ 2022_colour  : chr  "" "" "" "" ...
 $ 2021_red     : num  0.5 0.5 NA 3 4 4 1.5 0.5 NA 3.5 ...
 $ 2021_yellow  : num  2.5 2.5 NA 2.5 1 1.5 2 2 NA 1 ...
 $ 2021_venation: num  2.5 2.5 NA 3 3 3.5 2.5 1 NA 3 ...
 $ 2021_colour  : chr  "Ye" "Ye" "" "FO" ...
 $ 2020_red     : num  0.5 0.5 3.5 3 3.5 NA NA NA 0.5 NA ...
 $ 2020_yellow  : num  2.5 2.5 2 1.5 1 NA NA NA 2.5 NA ...
 $ 2020_venation: num  3.5 3.5 1 3 3 NA NA NA 2.5 NA ...
 $ 2020_colour  : chr  "Ye" "Ye" "FO" "FR" ...
 $ 2019_red     : num  NA NA NA NA NA NA NA NA NA NA ...
 $ 2019_yellow  : num  NA NA NA NA NA NA NA NA NA NA ...
 $ 2019_venation: num  NA NA NA NA NA NA NA NA NA NA ...
 $ 2019_colour  : chr  "" "" "" "" ...
 $ 2018

In [19]:
## Combine the sample and colour tables on their shared plant identifier.
## merge() is called with its default join settings, so only IDs present in
## both tables end up in the result.
## NOTE(review): the printed colour table shows repeated IDs (e.g. "PA0212"
## twice), so a plant can contribute more than one row here - confirm intended.
merged <- merge(x = samples, y = colour, by = 'PlantID_UPPER')
## Print a compact overview of the combined table for a quick sanity check
str(merged)

Classes ‘data.table’ and 'data.frame':	1068 obs. of  54 variables:
 $ PlantID_UPPER                : chr  "PA0212" "PA0212" "PA0283" "PA0291" ...
 $ PlantID                      : chr  "pa0212" "pa0212" "pa0283" "pa0291" ...
 $ PlantID_LongFormat           : chr  "2x-N708-1_Am_Pla_pa0212" "2x-N708-1_Am_Pla_pa0212" "2x-N708-2_Am_Pla_pa0283" "2x-N708-3_Am_Pla_pa0291" ...
 $ PlantID_longFormat_refVersion: chr  "2x-N708-1_Am_Pla_pa0212_v3.5" "2x-N708-1_Am_Pla_pa0212_v3.5" "2x-N708-2_Am_Pla_pa0283_v3.5" "2x-N708-3_Am_Pla_pa0291_v3.5" ...
 $ Batch                        : chr  "2x" "2x" "2x" "2x" ...
 $ Batch_PlantID                : chr  "2x_PA0212" "2x_PA0212" "2x_PA0283" "2x_PA0291" ...
 $ PlantID_Batch                : chr  "PA0212_2x" "PA0212_2x" "PA0283_2x" "PA0291_2x" ...
 $ Species                      : chr  "Am" "Am" "Am" "Am" ...
 $ Location                     : chr  "Pla" "Pla" "Pla" "Pla" ...
 $ Coverage_mean                : num  2.02 2.02 2.38 1.77 1.94 ...
 $ Coverage_stdev 

In [20]:
## Write merged sample+colour information as a comma-separated file in the
## current working directory, with a header row and without row names.
## TRUE/FALSE are spelled out because T and F are ordinary variables that can
## be reassigned, whereas TRUE and FALSE are reserved words.
## NOTE(review): quote = FALSE leaves every field unquoted; a value containing
## a comma would shift columns in the output - confirm no field contains one.
fwrite(
  merged,
  file = './samples+colour_merged.raw.csv',
  sep = ',',
  row.names = FALSE,
  col.names = TRUE,
  quote = FALSE
)