# Loading data
We have a lot of data in this competition, so it makes sense to use polars, a pandas alternative that is much faster and more memory-efficient, and is multithreaded out of the box.

In [None]:
%%time

import polars as pl
# Load the training table from the competition input directory.
training_data = pl.read_csv(
    "/kaggle/input/stanford-ribonanza-rna-folding/train_data.csv"
)

In [None]:
# Bare expression: the notebook renders a preview of the loaded frame.
training_data

# Get rid of duplicates

After dropping duplicates, sorting by "sequence_id" and "experiment_type" arranges the data so that every 2 consecutive rows hold the 2A3/DMS measurements of the same sequence. This way we can also reshape the data into Nx2 later on. You could also choose which duplicate to keep using other criteria, such as signal to noise, although here I just show the simplest method.

In [None]:
# Drop duplicates based on ("sequence_id", "experiment_type"), then sort by the
# same keys so that the rows belonging to one sequence end up adjacent.
dedup_keys = ["sequence_id", "experiment_type"]
print("before dropping duplicates data shape is:",training_data.shape)
# keep="first" pins which of the duplicate rows survives. Without it polars
# gives no guarantee about which duplicate is kept, so the filtered output
# could differ from run to run.
training_data = training_data.unique(subset=dedup_keys, keep="first").sort(dedup_keys)
print("after dropping duplicates data shape is:",training_data.shape)
training_data.head()

# Filter data based on signal to noise
Filter on the condition that SN_filter of both DMS and 2A3 has to be 1, matching how the test set filtering is done. I will also save the filtered version of the training data, which can be directly downloaded/used as notebook output.

In [None]:
# Bare expression: the notebook renders a preview of the deduplicated, sorted frame.
training_data

In [None]:
%%time
import numpy as np

# The frame is sorted by ("sequence_id", "experiment_type"), so consecutive row
# pairs are expected to belong to the same sequence. Viewing SN_filter as an
# (n, 2) matrix then replaces a self-join.
seq_ids = training_data["sequence_id"].to_numpy()
# Guard the pairing assumption explicitly: an odd row count or a sequence with
# a single row would otherwise shift every later pair by one, and the mask
# below would silently filter the wrong rows.
if not np.array_equal(seq_ids[0::2], seq_ids[1::2]):
    raise ValueError(
        "rows do not pair up by sequence_id; deduplicate and sort "
        "training_data by sequence_id/experiment_type before filtering"
    )
del seq_ids

SN = training_data["SN_filter"].to_numpy().astype("int32").reshape(-1, 2)
SN = SN.min(-1)  # per pair: the smaller of the two SN_filter values
SN = np.repeat(SN, 2)  # expand the per-pair value back to one entry per row
print("before filtering data shape is:",training_data.shape)
filtered_data = training_data.filter(SN == 1)
print("after filtering data shape is:",filtered_data.shape)

# These columns are not used by the later cells, so leave them out of the
# saved file.
filtered_data = filtered_data.drop(["reads",
                                    "signal_to_noise",
                                    "SN_filter"])

filtered_data.write_csv('train_QUICK_START.csv') #554 MB


In [None]:
# Drop the name bound to the unfiltered frame; later cells use filtered_data only.
del training_data

In [None]:
# Preview the first rows of the filtered frame.
filtered_data.head()

# Modeling by averaging reactivities

Let's create a model by using avg reactivities for each experiment type as predictions. 

In [None]:
#get data
# Number of reactivity columns to read (reactivity_0001 .. reactivity_0206).
length = 206
label_names = [f"reactivity_{i+1:04}" for i in range(length)]
# Rows are grouped two at a time, so reshape to (pairs, 2, length) and then
# move the pair axis last, giving (pairs, length, 2). The reshape uses
# `length` rather than a repeated literal so the two cannot drift apart.
labels = (
    filtered_data[label_names]
    .to_numpy()
    .astype("float32")
    .reshape(-1, 2, length)
    .transpose(0, 2, 1)
)
# Clamp the values to the [0, 1] range.
labels = labels.clip(0, 1)
labels.shape

In [None]:
# One NaN-ignoring global mean per slot of the last axis.
# NOTE(review): slot 0 is presumably 2A3 and slot 1 DMS, following the earlier
# sort by experiment_type - confirm against the actual experiment_type values.
avg_2A3 = np.nanmean(labels[..., 0])
avg_DMS = np.nanmean(labels[..., 1])

In [None]:
# The label array is not used after the averages are computed; drop the name.
del labels

# Test data formatting
Do inference with our avg values and make a submission

## First let's load the test and sample sub to see what's going on

In [None]:
# Load the test sequences and let the notebook render a preview.
test = pl.read_csv(
    "/kaggle/input/stanford-ribonanza-rna-folding/test_sequences.csv"
)
test

In [None]:
# Load the sample submission and let the notebook render a preview.
sample_sub = pl.read_csv(
    "/kaggle/input/stanford-ribonanza-rna-folding/sample_submission.csv"
)
sample_sub

# Now put our predictions into sample sub and save

In [None]:
# Set both prediction columns to the corresponding global average, as a
# constant literal on every row.
sample_sub = sample_sub.with_columns(
    [
        pl.lit(avg_DMS).alias("reactivity_DMS_MaP"),
        pl.lit(avg_2A3).alias("reactivity_2A3_MaP"),
    ]
)
sample_sub.head()

In [None]:
# Write the submission file; floats are written with 2 decimal places.
sample_sub.write_csv(
    "submission.csv",
    float_precision=2,
)