In [0]:
-- Raw protein table: one row per protein record, keyed by id.
-- Apart from the two bookkeeping timestamps and the key, every column is nullable.
CREATE OR REPLACE TABLE workspace.raw.protein (
  -- Row bookkeeping.
  record_create_ts TIMESTAMP NOT NULL,
  record_update_ts TIMESTAMP NOT NULL,

  -- Record identity and its FASTA text.
  id STRING NOT NULL,
  fasta_sequence STRING,

  -- BLAST linkage: blast_of_id holds the id of another row in this table and
  -- blast_top_hit_number an integer rank (the curated views in this file join
  -- on these two columns). NOTE(review): percent identity is kept as STRING --
  -- presumably the loader writes it verbatim; confirm before casting.
  blast_of_id STRING,
  blast_top_hit_number INT,
  blast_percent_identity STRING,

  -- Effector prediction outputs, stored as text.
  cytoplasmic_effector STRING,
  apoplastic_effector STRING,
  non_effector STRING,
  prediction STRING,

  -- Up to ten Pfam domain slots, each an (accession, name) pair.
  pfam_domain_acc_1 STRING,
  pfam_domain_name_1 STRING,
  pfam_domain_acc_2 STRING,
  pfam_domain_name_2 STRING,
  pfam_domain_acc_3 STRING,
  pfam_domain_name_3 STRING,
  pfam_domain_acc_4 STRING,
  pfam_domain_name_4 STRING,
  pfam_domain_acc_5 STRING,
  pfam_domain_name_5 STRING,
  pfam_domain_acc_6 STRING,
  pfam_domain_name_6 STRING,
  pfam_domain_acc_7 STRING,
  pfam_domain_name_7 STRING,
  pfam_domain_acc_8 STRING,
  pfam_domain_name_8 STRING,
  pfam_domain_acc_9 STRING,
  pfam_domain_name_9 STRING,
  pfam_domain_acc_10 STRING,
  pfam_domain_name_10 STRING,

  -- Physico-chemical properties.
  -- NOTE(review): sequence_length is DOUBLE although a length is presumably integral -- confirm with the loader.
  molecular_weight_kda DOUBLE,
  isoelectric_point_pi DOUBLE,
  sequence_length DOUBLE,

  PRIMARY KEY (id)
);

-- ****************************************************************************************************

-- Curated view: one row per protein that has a rank-1 BLAST hit, with its top
-- five hits laid out side by side (accession, FASTA text, percent identity).
--
-- A hit is a row of workspace.raw.protein whose blast_of_id equals the queried
-- protein's id; blast_top_hit_number selects which rank (1..5) it fills.
--
-- Ranks 2..5 are LEFT JOINs so that a protein with fewer than five recorded
-- hits is still returned, with NULLs in the missing ranks. With inner joins on
-- every rank such proteins were silently dropped from the view.
CREATE OR REPLACE VIEW workspace.curated.blast_sequence AS
(
SELECT
  p.id AS accession_number
  ,p.fasta_sequence
  ,p1.id AS top1_blast_accession
  ,p1.fasta_sequence AS top1_blast_sequence
  ,p1.blast_percent_identity AS top1_blast_percent_identity
  ,p2.id AS top2_blast_accession
  ,p2.fasta_sequence AS top2_blast_sequence
  ,p2.blast_percent_identity AS top2_blast_percent_identity
  ,p3.id AS top3_blast_accession
  ,p3.fasta_sequence AS top3_blast_sequence
  ,p3.blast_percent_identity AS top3_blast_percent_identity
  ,p4.id AS top4_blast_accession
  ,p4.fasta_sequence AS top4_blast_sequence
  ,p4.blast_percent_identity AS top4_blast_percent_identity
  ,p5.id AS top5_blast_accession
  ,p5.fasta_sequence AS top5_blast_sequence
  ,p5.blast_percent_identity AS top5_blast_percent_identity

FROM workspace.raw.protein AS p
-- Rank 1 stays an inner join: rows that are not the target of any hit
-- (including the hit rows themselves) remain excluded, exactly as before.
INNER JOIN workspace.raw.protein AS p1
  ON p.id = p1.blast_of_id AND p1.blast_top_hit_number = 1
-- The rank filter must live in ON, not WHERE, or the LEFT JOIN would collapse
-- back into an inner join.
LEFT JOIN workspace.raw.protein AS p2
  ON p.id = p2.blast_of_id AND p2.blast_top_hit_number = 2
LEFT JOIN workspace.raw.protein AS p3
  ON p.id = p3.blast_of_id AND p3.blast_top_hit_number = 3
LEFT JOIN workspace.raw.protein AS p4
  ON p.id = p4.blast_of_id AND p4.blast_top_hit_number = 4
LEFT JOIN workspace.raw.protein AS p5
  ON p.id = p5.blast_of_id AND p5.blast_top_hit_number = 5
);

-- ****************************************************************************************************

-- Curated view over every row of workspace.raw.protein exposing the effector
-- columns. A NULL blast_of_id is reported as the literal 'input_sequence';
-- otherwise the stored blast_of_id is passed through unchanged.
CREATE OR REPLACE VIEW workspace.curated.effectorp AS
(
SELECT
  COALESCE(src.blast_of_id, 'input_sequence') AS blast_of_id
  ,src.id AS accession_number
  ,src.fasta_sequence
  ,src.cytoplasmic_effector
  ,src.apoplastic_effector
  ,src.non_effector
  ,src.prediction AS effectorp_prediction
FROM workspace.raw.protein AS src
);

-- ****************************************************************************************************

-- Curated view over every row of workspace.raw.protein exposing the ten
-- Pfam (accession, name) column pairs. A NULL blast_of_id is reported as the
-- literal 'input_sequence'; otherwise the stored value is passed through.
CREATE OR REPLACE VIEW workspace.curated.pfam AS
(
SELECT
  COALESCE(src.blast_of_id, 'input_sequence') AS blast_of_id
  ,src.id AS accession_number
  ,src.fasta_sequence
  ,src.pfam_domain_acc_1
  ,src.pfam_domain_name_1
  ,src.pfam_domain_acc_2
  ,src.pfam_domain_name_2
  ,src.pfam_domain_acc_3
  ,src.pfam_domain_name_3
  ,src.pfam_domain_acc_4
  ,src.pfam_domain_name_4
  ,src.pfam_domain_acc_5
  ,src.pfam_domain_name_5
  ,src.pfam_domain_acc_6
  ,src.pfam_domain_name_6
  ,src.pfam_domain_acc_7
  ,src.pfam_domain_name_7
  ,src.pfam_domain_acc_8
  ,src.pfam_domain_name_8
  ,src.pfam_domain_acc_9
  ,src.pfam_domain_name_9
  ,src.pfam_domain_acc_10
  ,src.pfam_domain_name_10
FROM workspace.raw.protein AS src
);

-- ****************************************************************************************************

-- Curated view over every row of workspace.raw.protein exposing molecular
-- weight, isoelectric point and sequence length, plus a Y/N flag for weights
-- in the closed range 14 to 15.7. A NULL blast_of_id is reported as the
-- literal 'input_sequence'.
CREATE OR REPLACE VIEW workspace.curated.molecularweight AS
(
SELECT
  COALESCE(src.blast_of_id, 'input_sequence') AS blast_of_id
  ,src.id AS accession_number
  -- Removes every span that starts at '>' and runs lazily through the next
  -- newline -- presumably the FASTA header line; any later line breaks in the
  -- text are left in place.
  ,regexp_replace(src.fasta_sequence, '>(.|\\r|\\n)*?\\n', '') AS protein_sequence
  ,src.molecular_weight_kda
  ,src.isoelectric_point_pi
  ,src.sequence_length
  -- Both bounds are inclusive; a NULL weight falls through to 'N'.
  ,CASE
    WHEN src.molecular_weight_kda >= 14 AND src.molecular_weight_kda <= 15.7 THEN 'Y'
    ELSE 'N'
  END AS `mw_between_14_and_15.7`
FROM workspace.raw.protein AS src
);

-- ****************************************************************************************************




