# structural

Extracts UniProt ids for further structural analysis

In [1]:
import better.files._
import File._
import java.io.{File => JFile}
import org.apache.spark.rdd._
import org.apache.spark.sql._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.functions._
import group.research.aging.spark.extensions._
import group.research.aging.spark.extensions.functions._

In [35]:
import better.files._
import File._
import java.io.{File => JFile}

In [31]:
val yspecies = data_path + "sources/yspecies/"
val intersections_path = yspecies + "data/output/intersections/"
val ranked_path =intersections_path + "intersections_ranked.tsv"
val stage_one_input = yspecies +  "data/interim/selected/lifespan/"
val stage_two_input = yspecies +  "data/interim/stage_2/input/"
val genes_ranked = spark.readTSV(ranked_path, true)
genes_ranked.show(10, 100)

## Get fasta function



In [3]:
import sttp.client3.quick._
def getFasta(id: String)= id match {
    case "" | null => ""
    case id => quickRequest.get(uri"http://www.uniprot.org/uniprot/$id.fasta").send(backend).body
}

In [4]:
getFasta("Q6GZX4")

>sp|Q6GZX4|001R_FRG3G Putative transcription factor 001R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-001R PE=4 SV=1
MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS
EKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLD
AKIKAYNLTV

In [5]:
val get_fasta: String => String = getFasta _ 

import org.apache.spark.sql.functions.udf
val getProtein = udf(get_fasta)

### Downloading yspecies proteins<br>



In [32]:
val data_path = "/data/"
val samples_path = s"${data_path}samples/"
val ensembl_path = s"${data_path}ensembl/99/"
val graphdb_path = s"${data_path}databases/graphdb/import"
val yspecies = s"${data_path}sources/yspecies/"
val ranked_path = yspecies + "data/output/intersections/intersections_ranked.tsv"
val stage_one_input = yspecies +  "data/interim/selected/lifespan/"
val stage_two_input = yspecies +  "data/interim/stage_2/input/"
val intersections_path = yspecies + "data/output/intersections/"


In [33]:
val children = File(intersections_path + "proteins").children.toList
children

List(/data/sources/yspecies/data/output/intersections/proteins/84_TMEM258, /data/sources/yspecies/data/output/intersections/proteins/110_PKN3, /data/sources/yspecies/data/output/intersections/proteins/41_A2M, /data/sources/yspecies/data/output/intersecti

In [34]:

def write_fasta(fl: File) = {
    val name = fl.name.substring(fl.name.indexOf("_")+1) + ".tsv"
    val ids = fl.children.collectFirst{ case f if f.name == name => spark.readTSV(f.pathAsString, true) }.get
    .where($"uniprot".isNotNull).select("species","gene", "transcript","uniprot").as[(String, String, String,String)].collect().toList
    for((s,g, t, u) <- ids) {
     val str = getFasta(u)
     val prot: File = fl / s"${s}_${g}_${t}_${u}.fasta"
     print("writing to: "+prot.pathAsString)     
     prot.createIfNotExists()
     prot.overwrite(str)    
    }
}


writing to: /data/sources/yspecies/data/output/intersections/proteins/84_TMEM258/Gorilla_gorilla_ENSGGOG00000041617_ENSGGOT00000058456_A0A2I2YHX7.fastawriting to: /data/sources/yspecies/data/output/intersections/proteins/84_TMEM258/Pan_troglodytes_ENSPTRG00000042665_ENSPTRT00000086905_A0A2I3S303.fastawriting to: /data/sources/yspecies/data/output/intersections/proteins/84_TMEM258/Equus_caballus_ENSECAG00000018904_ENSECAT00000020347_F7AD30.fastawriting to: /data/sources/yspecies/data/output/intersections/proteins/84_TMEM258/Pan_paniscus_ENSPPAG00000037345_ENSPPAT00000051482_A0A2R9BG68.fastawriting to: /data/sources/yspecies/data/output/intersections/proteins/84_TMEM258/Macaca_mulatta_ENSMMUG00000019602_ENSMMUT00000027525_F7B8N9.fastawriting to: /data/sources/yspecies/data/output/intersections/proteins/84_TMEM258/Macaca_fascicularis_ENSMFAG00000030739_ENSMFAT00000065589_A0A2K5URL5.fastawriting to: /data/sources/yspecies/data/output/intersections/proteins/84_TMEM258/Macaca_nemestrina_ENSM

In [36]:
for(child <- children) write_fasta(child)

writing to: /data/sources/yspecies/data/output/intersections/proteins/84_TMEM258/Gorilla_gorilla_ENSGGOG00000041617_ENSGGOT00000058456_A0A2I2YHX7.fastawriting to: /data/sources/yspecies/data/output/intersections/proteins/84_TMEM258/Pan_troglodytes_ENSPTRG00000042665_ENSPTRT00000086905_A0A2I3S303.fastawriting to: /data/sources/yspecies/data/output/intersections/proteins/84_TMEM258/Equus_caballus_ENSECAG00000018904_ENSECAT00000020347_F7AD30.fastawriting to: /data/sources/yspecies/data/output/intersections/proteins/84_TMEM258/Pan_paniscus_ENSPPAG00000037345_ENSPPAT00000051482_A0A2R9BG68.fastawriting to: /data/sources/yspecies/data/output/intersections/proteins/84_TMEM258/Macaca_mulatta_ENSMMUG00000019602_ENSMMUT00000027525_F7B8N9.fastawriting to: /data/sources/yspecies/data/output/intersections/proteins/84_TMEM258/Macaca_fascicularis_ENSMFAG00000030739_ENSMFAT00000065589_A0A2K5URL5.fastawriting to: /data/sources/yspecies/data/output/intersections/proteins/84_TMEM258/Macaca_nemestrina_ENSM

In [37]:
print("DONE")

DONE

**Uniprot mappings<br>**

In [7]:
val mapping_cols = Seq("UniProtKB-AC","UniProtKB-ID","Entrez","RefSeq","GI","PDB","GO",
"UniRef100","UniRef90","UniRef50","UniParc","PIR",
"NCBI-taxon","MIM","UniGene","PubMed",
"EMBL","EMBL-CDS","Ensembl","Ensembl_TRS","Ensembl_PRO","Additional PubMed")

In [8]:
val mapping = spark.readTSV("/data/indexes/uniprot/idmapping_selected.tab").toDF(mapping_cols:_*)
mapping.limit(20).show(20, 1000)

+------------+------------+-------+-----------+-------------------------------+----+----------------------------------+----------------+---------------+---------------+-------------+----+----------+----+-------+------------------+--------+----------+-------+-----------+-----------+-----------------+
|UniProtKB-AC|UniProtKB-ID| Entrez|     RefSeq|                             GI| PDB|                                GO|       UniRef100|       UniRef90|       UniRef50|      UniParc| PIR|NCBI-taxon| MIM|UniGene|            PubMed|    EMBL|  EMBL-CDS|Ensembl|Ensembl_TRS|Ensembl_PRO|Additional PubMed|
+------------+------------+-------+-----------+-------------------------------+----+----------------------------------+----------------+---------------+---------------+-------------+----+----------+----+-------+------------------+--------+----------+-------+-----------+-----------+-----------------+
|      Q6GZX4|  001R_FRG3G|2947773|YP_031579.1|             81941549; 49237298|null|             

In [9]:
val ensembl2uniprot = mapping.select("Ensembl", "Ensembl_TRS", "Ensembl_PRO", "UniProtKB-AC").where($"Ensembl".isNotNull)
ensembl2uniprot.limit(10).show(10,1000)

+--------------------------------+------------------------------------------------------------------+------------------------------------------------------------------+------------+
|                         Ensembl|                                                       Ensembl_TRS|                                                       Ensembl_PRO|UniProtKB-AC|
+--------------------------------+------------------------------------------------------------------+------------------------------------------------------------------+------------+
|              ENSGALG00000004143|        ENSGALT00000044506; ENSGALT00000091201; ENSGALT00000107000|        ENSGALP00000043305; ENSGALP00000071615; ENSGALP00000066011|      Q5ZLQ6|
|                 ENSG00000166913|                                  ENST00000353703; ENST00000372839|                                  ENSP00000300161; ENSP00000361930|      P31946|
|              ENSMFAG00000043236|                                                ENSMFAT0

## Get write proteins



In [11]:
def get_protein(df: DataFrame, gene: String) = {
    val selection = df.select("species", "`" + gene + "`").as[(String,String)]
      .flatMap{ case (k,s)=>
       if(s=="" || s==null) List(k->"") else s.split(";").map(v=>k->v)
       }
      .toDF("species","Ensembl")
    val withExtra = df.select("species","common_name", "lifespan")
      .withColumn("symbol", lit(gene))
      .join(selection,"species")
    withExtra.join(ensembl2uniprot, Seq("Ensembl"), "left").withColumn("fasta", getProtein($"UniProtKB-AC"))
}

In [12]:
import java.io.File
import scala.util._
def write_proteins(df: DataFrame, genes: List[String], subfolder: String) = {
    for{
        p <- genes
        path = s"/data/species/pro_proteins/${p}.tsv"
        f = new File(path)       
        }
        {   
             if(!(f.exists() && f.isFile) ) 
             {
            Try{
                    val protein = get_protein(df, p)
                    protein.writeTSV(path, quote="\"" , quoteMode="ALL", rewrite =true)
                } match {
                    case Failure(ex) => 
                    println(s"EXCEPTION WITH ${p}!")
                    println(ex.toString)
                    case Success(_) =>
                    println(s"successfully wrote ${p} to ${path}!")
                }   
            } else println(path+"for gene" +  p +" exists, skipping!")
  
    }
}


### Getting results<br>




In [14]:
import better.files._
import File._
import java.io.{File => JFile}
val data_yspecies = "/data/sources/yspecies/data"
val d= better.files.File(data_yspecies)
val output = d / "output"
val stage_2 = output / "stage_2"
stage_2.children.toList.mkString("\n")

/data/sources/yspecies/data/output/stage_2/lifespan_with_traits.tsv
/data/sources/yspecies/data/output/stage_2/lifespan.tsv
/data/sources/yspecies/data/output/stage_2/shap_results.tsv
/data/sources/yspecies/data/output/stage_2/.gitignore

In [15]:
val pro = spark.readTSV("/data/species/pro_genes.tsv", header = true).withColumnRenamed("_c0", "species")
val anti = spark.readTSV("/data/species/anti_genes.tsv", header = true).withColumnRenamed("_c0", "species")
pro.show(10,1000)

+-------------------+--------------------------+--------+------------------+------------------+------------------+-------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------------------------+------------------+------------------+------------------+------------------+-------------------------------------+
|            species|               common_name|lifespan|            METTL5|            TRIM66|          ADAMTS13|                                 DCTD|            NUCKS1|           SPATA20|            ARMC12|            CLDN16|          NAALADL1|            RASSF4|             PDE6B|             MYOM1|           TMEM182|            NSMCE1|              POLE|                                 UPP1|             SUMO1|              CPVL|           MOV10L1|              MBD4|           

In [16]:
anti.show(10)

+-------------------+--------------------+--------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------------+------------------+--------------------+------------------+------------------+
|            species|         common_name|lifespan|              TWNK|              BRAP|           C6orf89|             MTCP1|             SIDT2|            IGSF10|               CA3|              LIAS|              RXRB|             RWDD1|           SMPDL3A|             ACTC1|               COX5B|            CDADC1|               CTPS2|             ADPRM|           ABHD16A|
+-------------------+--------------------+--------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-

In [17]:
val anti_genes  = anti.columns.drop(3).toList
anti_genes
val pro_genes  = pro.columns.drop(3).toLis
pro_genes

Error: value toLis is not a member of Array[String] (74)

In [18]:
import java.io.File
import scala.util._
def write_proteins(df: DataFrame, genes: List[String], subfolder: String) = {
    for{
        p <- genes
        path = s"/data/species/pro_proteins/${p}.tsv"
        f = new File(path)       
        }
        {   
             if(!(f.exists() && f.isFile) ) 
             {
            Try{
                    val protein = get_protein(df, p)
                    protein.writeTSV(path, quote="\"" , quoteMode="ALL", rewrite =true)
                } match {
                    case Failure(ex) => 
                    println(s"EXCEPTION WITH ${p}!")
                    println(ex.toString)
                    case Success(_) =>
                    println(s"successfully wrote ${p} to ${path}!")
                }   
            } else println(path+"for gene" +  p +" exists, skipping!")
  
    }
}


Error: not found: value get_protein (368)

In [19]:
write_proteins(pro,pro_genes, "pro_proteins")

parts of /data/species/pro_proteins/METTL5.tsv merged!
successfully wrote METTL5 to /data/species/pro_proteins/METTL5.tsv!
parts of /data/species/pro_proteins/TRIM66.tsv merged!
successfully wrote TRIM66 to /data/species/pro_proteins/TRIM66.tsv!
parts of /data/species/pro_proteins/ADAMTS13.tsv merged!
successfully wrote ADAMTS13 to /data/species/pro_proteins/ADAMTS13.tsv!
parts of /data/species/pro_proteins/DCTD.tsv merged!
successfully wrote DCTD to /data/species/pro_proteins/DCTD.tsv!
parts of /data/species/pro_proteins/NUCKS1.tsv merged!
successfully wrote NUCKS1 to /data/species/pro_proteins/NUCKS1.tsv!
parts of /data/species/pro_proteins/SPATA20.tsv merged!
successfully wrote SPATA20 to /data/species/pro_proteins/SPATA20.tsv!
parts of /data/species/pro_proteins/ARMC12.tsv merged!
successfully wrote ARMC12 to /data/species/pro_proteins/ARMC12.tsv!
parts of /data/species/pro_proteins/CLDN16.tsv merged!
successfully wrote CLDN16 to /data/species/pro_proteins/CLDN16.tsv!
parts of /dat

In [20]:
write_proteins(anti, anti_genes, "anti_proteins")

parts of /data/species/pro_proteins/TWNK.tsv merged!
successfully wrote TWNK to /data/species/pro_proteins/TWNK.tsv!
parts of /data/species/pro_proteins/BRAP.tsv merged!
successfully wrote BRAP to /data/species/pro_proteins/BRAP.tsv!
parts of /data/species/pro_proteins/C6orf89.tsv merged!
successfully wrote C6orf89 to /data/species/pro_proteins/C6orf89.tsv!
parts of /data/species/pro_proteins/MTCP1.tsv merged!
successfully wrote MTCP1 to /data/species/pro_proteins/MTCP1.tsv!
parts of /data/species/pro_proteins/SIDT2.tsv merged!
successfully wrote SIDT2 to /data/species/pro_proteins/SIDT2.tsv!
parts of /data/species/pro_proteins/IGSF10.tsv merged!
successfully wrote IGSF10 to /data/species/pro_proteins/IGSF10.tsv!
parts of /data/species/pro_proteins/CA3.tsv merged!
successfully wrote CA3 to /data/species/pro_proteins/CA3.tsv!
parts of /data/species/pro_proteins/LIAS.tsv merged!
successfully wrote LIAS to /data/species/pro_proteins/LIAS.tsv!
parts of /data/species/pro_proteins/RXRB.tsv m

For Moskalev

In [22]:
entrez.show(10,100)

+------+
|Entrez|
+------+
|185079|
|173404|
|176121|
|174762|
|181719|
|173468|
|186047|
|176879|
|175716|
|172195|
+------+
only showing top 10 rows



<div><h3>### Load sequences for selected isoforms ###</h3></div>

In [24]:
val pro_iso = spark.readTSV("/data/species/pro_isoforms_uniprot_mapping.tsv", true)
  .select("Ensembl_TRS", "UniProtKB-AC")
  .withColumn("fasta", getProtein($"UniProtKB-AC"))
pro_iso.show(10,100)

+------------------+------------+----------------------------------------------------------------------------------------------------+
|       Ensembl_TRS|UniProtKB-AC|                                                                                               fasta|
+------------------+------------+----------------------------------------------------------------------------------------------------+
|ENSMUST00000233923|      Q80X86|>sp|Q80X86|ARM12_MOUSE Armadillo repeat-containing protein 12 OS=Mus musculus OX=10090 GN=Armc12 ...|
|ENSMUST00000102891|      Q769J6|>sp|Q769J6|ATS13_MOUSE A disintegrin and metalloproteinase with thrombospondin motifs 13 OS=Mus m...|
|ENSBTAT00000008509|      Q9XT98|>sp|Q9XT98|CLD16_BOVIN Claudin-16 OS=Bos taurus OX=9913 GN=CLDN16 PE=2 SV=1
MGPGLAASHVSFPDSLLAKMR...|
|ENSPPYT00000017695|      Q5RC69|>sp|Q5RC69|DCTD_PONAB Deoxycytidylate deaminase OS=Pongo abelii OX=9601 GN=DCTD PE=2 SV=1
MSEVSCK...|
|ENSMUST00000007296|      Q9WVF7|>sp|Q9WVF7|DPOE1_MOUSE

In [25]:
pro_iso.writeTSV("/data/species/pro_isoforms_uniprot_sequences.tsv")

parts of /data/species/pro_isoforms_uniprot_sequences.tsv merged!


/data/species/pro_isoforms_uniprot_sequences.tsv

In [26]:
val anti_iso = spark.readTSV("/data/species/anti_isoforms_uniprot_mapping.tsv", true)
  .select("Ensembl_TRS", "UniProtKB-AC")
  .withColumn("fasta", getProtein($"UniProtKB-AC"))
anti_iso.writeTSV("/data/species/anti_isoforms_uniprot_sequences.tsv")

parts of /data/species/anti_isoforms_uniprot_sequences.tsv merged!


/data/species/anti_isoforms_uniprot_sequences.tsv

In [27]:
pro_iso.as[(String, String, String)]
.map{ case (trs, uni, fasta) => fasta.replace(">", s">${trs}|${uni}|")}.toDF
.writeTSV("/data/species/pro_isoforms_uniprot_sequences.fasta", header = false)

parts of /data/species/pro_isoforms_uniprot_sequences.fasta merged!


/data/species/pro_isoforms_uniprot_sequences.fasta

In [28]:
anti_iso.as[(String, String, String)]
.map{ case (trs, uni, fasta) => fasta.replace(">", s">${trs}|${uni}|")}.toDF
.writeTSV("/data/species/anti_isoforms_uniprot_sequences.fasta", header = false)

parts of /data/species/anti_isoforms_uniprot_sequences.fasta merged!


/data/species/anti_isoforms_uniprot_sequences.fasta