## Transformer learning process

In [3]:
import gc
from pyspark.sql import SparkSession

import pyspark
import pandas as pd
from pyspark.sql import functions as sf

from tqdm import tqdm
from data_types import Sequence, SequenceEntity,ResidueType
from loaders import dump

ModuleNotFoundError: No module named 'data_types'

In [5]:
spark = SparkSession.builder \
        .appName("ribonanza")\
        .config("spark.driver.memory", "10g") \
        .getOrCreate()
    
train_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("/data/data/csv/train_data_QUICK_START.csv")
    
# train_df = train_df[train_df["SN_filter"].values > 0]
train_df = train_df.drop(*[c for c in train_df.columns if "_error_" in c])

df_2A3 = train_df.filter(train_df.experiment_type == "2A3_MaP")
df_DMS = train_df.filter(train_df.experiment_type == "DMS_MaP")

pk50_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("/data/data/csv/PK50_silico_predictions.csv") \
    .withColumnRenamed('hotknots_mfe', 'hotknots') \
    ["sequence","hotknots"]
pk90_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("/data/data/csv/PK90_silico_predictions.csv") \
    .withColumnRenamed('hotknots_mfe', 'hotknots') \
    ["sequence","hotknots"]
r1_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("/data/data/csv/R1_silico_predictions.csv") \
    ["sequence","hotknots"]
gpn15k_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("/data/data/csv/GPN15k_silico_predictions.csv") \
    ["sequence","hotknots"]

pairing = pk50_df.union(pk90_df).union(r1_df).union(gpn15k_df)


df_2A3 = df_2A3.join(pairing, on='sequence')
df_DMS = df_DMS.join(pairing, on='sequence')

del pk50_df,pk90_df,r1_df,gpn15k_df,pairing,train_df
gc.collect()
    # return df_2A3, df_DMS
    # _2a3_csv_path = process_structure(df_2A3)
    # dms_csv_path = process_structure(df_DMS)
     

                                                                                

423

In [5]:
df_2A3.show()

24/03/13 19:29:49 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+--------------------+------------+---------------+--------------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+

                                                                                

In [6]:

from pyspark.sql.functions import arrays_zip, col, explode,concat_ws,split
cols=["reactivity_00{:02d}".format(i) for i in range(1,27)]
test = df_2A3.withColumn("sequence", sf.expr("substr(sequence, 26, 999)"))\
    .drop(*cols)
df=test\
    .withColumn("reactivity", sf.concat_ws(",", *[sf.col(x) for x in test.columns if "reactivity_" in x]))\
    .withColumn("reactivity", sf.split(sf.col("reactivity"),","))\
    .withColumn("sequence", sf.split(sf.col("sequence"),""))\
    .withColumn("hotknots", sf.split(sf.col("hotknots"),""))\
    .withColumn("triplet", sf.arrays_zip("sequence", "reactivity","hotknots")) \
    .withColumn("triplet", sf.explode("triplet")) \
    .select("sequence_id",  sf.col("triplet").sequence.alias('nucleotide'),sf.col("triplet").reactivity.cast("float").alias('reactivity'),sf.col("triplet").hotknots.alias('pairing'))\
    .withColumn("reactivity", sf.when(sf.col("reactivity") < 0, 0).otherwise(col("reactivity"))) \
    .replace({'.': '0','(':'1',')':'1','{':'2','}':'2','[':'3',']':'3','<':'4','>':'4','A':'5','a':'5','B':'6','b':'6'},subset=['pairing'])
    # .replace({'A':ResidueType.ADEINE,'T':ResidueType.THYMINE,'G':ResidueType.GUANINE,'U':ResidueType.URACIL},subset=['nucleotide'])
# test.withColumn("col3", 

In [12]:
df2 = (
        df.select("sequence_id", "nucleotide", "pairing", "reactivity")
        .groupby("sequence_id")
        .agg(
            sf.collect_list(sf.struct("nucleotide", "pairing")).alias("tokens"),
            sf.collect_list("reactivity").alias("reactivity"),
        )
        .withColumn("length",sf.size("tokens"))
        .select("sequence_id", "tokens", "reactivity","length")
        .sort(sf.asc("length"))
    )

df2.sort(sf.desc("length")).randomSplit([0.7, .3], seed=42)[0].show()

                                                                                

+------------+--------------------+--------------------+------+
| sequence_id|              tokens|          reactivity|length|
+------------+--------------------+--------------------+------+
|0000d87cab97|[{A, 0}, {A, 0}, ...|[0.023, 0.0, 0.11...|   177|
|00026ef17e1b|[{A, 0}, {G, 0}, ...|[0.647, 0.0, 0.21...|   177|
|000742991bbf|[{A, 0}, {G, 0}, ...|[1.035, 1.358, 0....|   177|
|00075c42b441|[{A, 1}, {A, 1}, ...|[0.519, 1.619, 1....|   177|
|00086140a4ea|[{A, 0}, {U, 0}, ...|[1.634, 0.0, 0.0,...|   177|
|000d75f3ed68|[{A, 0}, {C, 0}, ...|[0.217, 0.0, 0.70...|   177|
|0011d4dac70a|[{A, 0}, {A, 0}, ...|[0.0, 0.75, 0.363...|   177|
|00126ae0da61|[{A, 0}, {G, 0}, ...|[0.001, 0.0, 0.08...|   177|
|0017443cad61|[{A, 0}, {G, 0}, ...|[0.8, 0.782, 0.70...|   177|
|001b2130f5cb|[{A, 0}, {A, 0}, ...|[0.072, 0.174, 0....|   177|
|001ca3573d6d|[{A, 0}, {G, 0}, ...|[0.0, 0.041, 0.04...|   177|
|001da05e2ad6|[{A, 0}, {G, 0}, ...|[0.369, 0.307, 0....|   177|
|00215cd8bfa7|[{A, 0}, {U, 0}, ...|[1.22

In [8]:
q = df_2A3.approxQuantile('reactivity', [0.25, 0.5, 0.75], 0) 
upper_limit = q[2] + 1.5*(q[2]-q[0])


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `reactivity` cannot be resolved. Did you mean one of the following? [`reactivity_0001`, `reactivity_0002`, `reactivity_0003`, `reactivity_0004`, `reactivity_0005`].;
'Project ['reactivity]
+- Project [sequence#1948, sequence_id#1947, experiment_type#1949, dataset_name#1950, reactivity_0001#1951, reactivity_0002#1952, reactivity_0003#1953, reactivity_0004#1954, reactivity_0005#1955, reactivity_0006#1956, reactivity_0007#1957, reactivity_0008#1958, reactivity_0009#1959, reactivity_0010#1960, reactivity_0011#1961, reactivity_0012#1962, reactivity_0013#1963, reactivity_0014#1964, reactivity_0015#1965, reactivity_0016#1966, reactivity_0017#1967, reactivity_0018#1968, reactivity_0019#1969, reactivity_0020#1970, ... 187 more fields]
   +- Join Inner, (sequence#1948 = sequence#3008)
      :- Filter (experiment_type#1949 = 2A3_MaP)
      :  +- Project [sequence_id#1947, sequence#1948, experiment_type#1949, dataset_name#1950, reactivity_0001#1951, reactivity_0002#1952, reactivity_0003#1953, reactivity_0004#1954, reactivity_0005#1955, reactivity_0006#1956, reactivity_0007#1957, reactivity_0008#1958, reactivity_0009#1959, reactivity_0010#1960, reactivity_0011#1961, reactivity_0012#1962, reactivity_0013#1963, reactivity_0014#1964, reactivity_0015#1965, reactivity_0016#1966, reactivity_0017#1967, reactivity_0018#1968, reactivity_0019#1969, reactivity_0020#1970, ... 186 more fields]
      :     +- Relation [sequence_id#1947,sequence#1948,experiment_type#1949,dataset_name#1950,reactivity_0001#1951,reactivity_0002#1952,reactivity_0003#1953,reactivity_0004#1954,reactivity_0005#1955,reactivity_0006#1956,reactivity_0007#1957,reactivity_0008#1958,reactivity_0009#1959,reactivity_0010#1960,reactivity_0011#1961,reactivity_0012#1962,reactivity_0013#1963,reactivity_0014#1964,reactivity_0015#1965,reactivity_0016#1966,reactivity_0017#1967,reactivity_0018#1968,reactivity_0019#1969,reactivity_0020#1970,... 392 more fields] csv
      +- Union false, false
         :- Project [sequence#3008, hotknots#3069]
         :  +- Project [id#3007, sequence#3008, notes#3009, eterna_nupack#3010, eterna_eternafold+threshknot#3011, vienna2_mfe#3012, contrafold2_mfe#3013, eternafold_mfe#3014, e2efold_mfe#3015, hotknots_mfe#3016 AS hotknots#3069, ipknots_mfe#3017, knotty_mfe#3018, pknots_mfe#3019, spotrna_mfe#3020, vienna[threshknot]_mfe#3021, vienna[hungarian]_mfe#3022, eternafold[threshknot]_mfe#3023, eternafold[hungarian]_mfe#3024, contrafold[threshknot]_mfe#3025, contrafold[hungarian]_mfe#3026, nupack[threshknot]_mfe#3027, nupack[hungarian]_mfe#3028, shapify_mfe#3029, eternafold+hfold_1#3030, ... 7 more fields]
         :     +- Relation [id#3007,sequence#3008,notes#3009,eterna_nupack#3010,eterna_eternafold+threshknot#3011,vienna2_mfe#3012,contrafold2_mfe#3013,eternafold_mfe#3014,e2efold_mfe#3015,hotknots_mfe#3016,ipknots_mfe#3017,knotty_mfe#3018,pknots_mfe#3019,spotrna_mfe#3020,vienna[threshknot]_mfe#3021,vienna[hungarian]_mfe#3022,eternafold[threshknot]_mfe#3023,eternafold[hungarian]_mfe#3024,contrafold[threshknot]_mfe#3025,contrafold[hungarian]_mfe#3026,nupack[threshknot]_mfe#3027,nupack[hungarian]_mfe#3028,shapify_mfe#3029,eternafold+hfold_1#3030,... 7 more fields] csv
         :- Project [sequence#3125, hotknots#3191]
         :  +- Project [id#3121, title#3122, name#3123, body#3124, sequence#3125, eterna_nupack#3126, eterna_eternafold+threshknot#3127, vienna2_mfe#3128, contrafold2_mfe#3129, eternafold_mfe#3130, e2efold_mfe#3131, hotknots_mfe#3132 AS hotknots#3191, ipknots_mfe#3133, knotty_mfe#3134, pknots_mfe#3135, spotrna_mfe#3136, vienna[threshknot]_mfe#3137, vienna[hungarian]_mfe#3138, eternafold[threshknot]_mfe#3139, eternafold[hungarian]_mfe#3140, contrafold[threshknot]_mfe#3141, contrafold[hungarian]_mfe#3142, nupack[threshknot]_mfe#3143, nupack[hungarian]_mfe#3144, ... 11 more fields]
         :     +- Relation [id#3121,title#3122,name#3123,body#3124,sequence#3125,eterna_nupack#3126,eterna_eternafold+threshknot#3127,vienna2_mfe#3128,contrafold2_mfe#3129,eternafold_mfe#3130,e2efold_mfe#3131,hotknots_mfe#3132,ipknots_mfe#3133,knotty_mfe#3134,pknots_mfe#3135,spotrna_mfe#3136,vienna[threshknot]_mfe#3137,vienna[hungarian]_mfe#3138,eternafold[threshknot]_mfe#3139,eternafold[hungarian]_mfe#3140,contrafold[threshknot]_mfe#3141,contrafold[hungarian]_mfe#3142,nupack[threshknot]_mfe#3143,nupack[hungarian]_mfe#3144,... 11 more fields] csv
         :- Project [sequence#3251, hotknots#3259]
         :  +- Relation [rowID#3247,id#3248,name#3249,body#3250,sequence#3251,title#3252,vienna2_mfe#3253,vienna2_time#3254,contrafold2_mfe#3255,contrafold2_time#3256,eternafold_mfe#3257,eternafold_time#3258,hotknots#3259,hotknots_time#3260,ipknots#3261,ipknots_time#3262,knotty#3263,knotty_time#3264,spotrna#3265,spotrna_time#3266,nupack_pk#3267,nupack_pk_time#3268,vienna_2[threshknot]#3269,vienna_2[threshknot]_time#3270,... 20 more fields] csv
         +- Project [sequence#3357, hotknots#3364]
            +- Relation [rowID#3355,seqID#3356,sequence#3357,vienna2_mfe#3358,vienna2_time#3359,contrafold2_mfe#3360,contrafold2_time#3361,eternafold_mfe#3362,eternafold_time#3363,hotknots#3364,hotknots_time#3365,ipknots#3366,ipknots_time#3367,knotty#3368,knotty_time#3369,spotrna#3370,spotrna_time#3371,nupack_pk#3372,nupack_pk_time#3373,vienna_2[threshknot]#3374,vienna_2[threshknot]_time#3375,vienna_2[hungarian]#3376,vienna_2[hungarian]_time#3377,eternafold[threshknot]#3378,... 13 more fields] csv


In [8]:
upper_limit

1.4265000484883785