In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re
import torch
import torch.nn as nn

#approach: utilize a model for each output coordinate (5 sets)
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.metrics import mean_squared_error

In [2]:
training_path='/kaggle/input/stanford-rna-3d-folding/train_sequences.csv'
validation_path='/kaggle/input/stanford-rna-3d-folding/validation_sequences.csv'
testing_path='/kaggle/input/stanford-rna-3d-folding/test_sequences.csv'

training_labels_path='/kaggle/input/stanford-rna-3d-folding/train_labels.csv'
validation_labels_path='/kaggle/input/stanford-rna-3d-folding/validation_labels.csv'

#"testing labels path"
submission_path='/kaggle/input/stanford-rna-3d-folding/sample_submission.csv'

In [3]:
train_df=pd.read_csv(training_path)
update_train_df=train_df[train_df['temporal_cutoff']<"2022-05-27"]
update_train_df.tail(1)

Unnamed: 0,target_id,sequence,temporal_cutoff,description,all_sequences
691,7S3H_R,GGCUGCGUAUUUCUACUCUGUUGUUUUAGAGCUAGAAAUAGCAAGU...,2022-04-20,Cas9:sgRNA:DNA (S. pyogenes) with 0 RNA:DNA ba...,>7S3H_1|Chain A[auth N]|Non-target DNA strand|...


In [4]:
#additional validation set
validation_set_extra=train_df[train_df['temporal_cutoff']>"2022-05-27"]
len(validation_set_extra)

152

In [5]:
train_labels_df=pd.read_csv(training_labels_path)
train_labels_df.head(1)

Unnamed: 0,ID,resname,resid,x_1,y_1,z_1
0,1SCL_A_1,G,1,13.76,-25.974001,0.102


**Sequence structure prediction preprocessing**

In [6]:
#Extract target label: Protein Data Bank id, Monomer chain id
train_labels_df['target_id']=train_labels_df['ID'].str.rsplit('_',n=1).str[0]
train_labels_df['pdb_monomer']=train_labels_df['ID'].str.rsplit('_',n=1).str[1]

#update submission df
submission_df=pd.read_csv(submission_path)
submission_df['target_id']=submission_df['ID'].str.rsplit('_',n=1).str[0]
submission_df['pdb_monomer']=submission_df['ID'].str.rsplit('_',n=1).str[1]

train_full_df=update_train_df.merge(train_labels_df, how="left", on="target_id")

#train_labels_df.head(1)
train_full_df.head(1)
#submission_df.head(1)

Unnamed: 0,target_id,sequence,temporal_cutoff,description,all_sequences,ID,resname,resid,x_1,y_1,z_1,pdb_monomer
0,1SCL_A,GGGUGCUCAGUACGAGAGGAACCGCACCC,1995-01-26,"THE SARCIN-RICIN LOOP, A MODULAR RNA",>1SCL_1|Chain A|RNA SARCIN-RICIN LOOP|Rattus n...,1SCL_A_1,G,1,13.76,-25.974001,0.102,1


In [7]:
#fill in nan description in all_sequences with "missing"
train_full_df['all_sequences']=train_full_df['all_sequences'].astype(str).fillna("missing")

#fill in missing coordinates with mean values
training_coordinates=['x_1','y_1','z_1']
for t in training_coordinates:
    t_mean=train_full_df[t].mean()
    train_full_df[t]=train_full_df[t].fillna(t_mean)

train_full_df.isna().sum()

target_id          0
sequence           0
temporal_cutoff    0
description        0
all_sequences      0
ID                 0
resname            0
resid              0
x_1                0
y_1                0
z_1                0
pdb_monomer        0
dtype: int64

In [8]:
def df_label_extract_merge(df_path, label_path):
    df=pd.read_csv(df_path)
    ldf=pd.read_csv(label_path)
    ldf['target_id']=ldf['ID'].str.rsplit('_',n=1).str[0]
    ldf['pdb_monomer']=ldf['ID'].str.rsplit('_',n=1).str[1]
    return df.merge(ldf, how='left', on='target_id')

In [9]:
#validation has 40 structural coordinate output triplets, whereas training has only 1 triplet output (empty multi-dimensional slots)
validation_df=df_label_extract_merge(df_path=validation_path, label_path=validation_labels_path)
validation_df[training_coordinates+['x_2']].head(1)
validation_df.head(1)

Unnamed: 0,target_id,sequence,temporal_cutoff,description,all_sequences,ID,resname,resid,x_1,y_1,...,x_38,y_38,z_38,x_39,y_39,z_39,x_40,y_40,z_40,pdb_monomer
0,R1107,GGGGGCCACAGCAGAAGCGUUCACGUCGCAGCCCCUGUCAGCCAUU...,2022-05-28,CPEB3 ribozyme\nHuman\nhuman CPEB3 HDV-like ri...,>7QR4_1|Chain A|U1 small nuclear ribonucleopro...,R1107_1,G,1,-5.499,8.52,...,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,1


In [10]:
#convert submission csv as same format as original training csv
test_df=pd.read_csv(testing_path)
test_complete_df=test_df.merge(submission_df, how="left", on="target_id")
test_complete_df.head(1)

Unnamed: 0,target_id,sequence,temporal_cutoff,description,all_sequences,ID,resname,resid,x_1,y_1,...,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5,pdb_monomer
0,R1107,GGGGGCCACAGCAGAAGCGUUCACGUCGCAGCCCCUGUCAGCCAUU...,2022-05-28,CPEB3 ribozyme\nHuman\nhuman CPEB3 HDV-like ri...,>7QR4_1|Chain A|U1 small nuclear ribonucleopro...,R1107_1,G,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [11]:
#The validation data is identical to the test data? (Suppose to be for future testing data updates)
all(validation_df[list(test_complete_df.columns)]==test_complete_df)

True

In [12]:
print(train_full_df.loc[6500,'all_sequences'])

>1ZC8_1|Chain A|TLD 16S ribosomal RNA|Thermus thermophilus (274)
GGGGCUGAUUCUGGAUUCGACGGGAUAUUUCGGACGCGGGUUCAACUCCCGCCAGCUCC
>1ZC8_10|Chain J[auth K]|SsrA-binding protein|Thermus thermophilus (274)
GKSDKIIPIAENKEAKAKYDILETYEAGIVLKGSEVKSLREKGTVSFKDSFVRIENGEAWLYNLYIAPYKHATIENHDPLRKRKLLLHKREIMRLYGKVQEKGYTIIPLKLYWKNNKVKVLIALAKGKKL
>1ZC8_11|Chain K[auth Y]|Elongation factor Tu|Thermus thermophilus (274)
AKGEFIRTKPHVNVGTIGHVDHGKTTLTAALTYVAAAENPNVEVKDYGDIDKAPEERARGITINTAHVEYETAKRHYSHVDCPGHADYIKNMITGAAQMDGAILVVSAADGPMPQTREHILLARQVGVPYIVVFMNKVDMVDDPELLDLVEMEVRDLLNQYEFPGDEVPVIRGSALLALEEMHKNPKTKRGENEWVDKIWELLDAIDEYIPTPVRDVDKPFLMPVEDVFTITGRGTVATGRIERGKVKVGDEVEIVGLAPETRKTVVTGVEMHRKTLQEGIAGDNVGLLLRGVSREEVERGQVLAKPGSITPHTKFEASVYILKKEEGGRHTGFFTGYRPQFYFRTTDVTGVVRLPQGVEMVMPGDNVTFTVELIKPVALEEGLRFAIREGGRTVGAGVVTKILE
>1ZC8_2|Chain B|H2 16S rRNA|Thermus thermophilus (274)
UUGCGAAACAUGUAGG
>1ZC8_3|Chain C|H2b d mRNA|Thermus thermophilus (274)
CCCAAGGUGCAUGCGCAUGUAGUACCGAGGA
>1ZC8_4|Chain D[auth G]|protein ki

**Feature Engineering**
- Add 3D structural information in the PDB database https://search.rcsb.org/#search-services
- data augmentation & transfer learning: SPOT-RNA, pretrained on bpRNA database (stems, hairpin loops and pseudoknots)
- sequences converted to 2D images: MXFold2 utilizes UFold process to then input into a CNN and efficiently generate results
- ARES atom-level information instead of nucleotides (small training set used)
- EternalFold: uses chemical mapping, and riboswitch–ligand binding affinity data


In [13]:
print(torch.cuda.is_available())
len(train_full_df['pdb_monomer'].unique())

False


4298

In [14]:
#simple sequence length feature
train_full_df['seq_len']=train_full_df['sequence'].str.len()
validation_df['seq_len']=validation_df['sequence'].str.len()
test_complete_df['seq_len']=test_complete_df['sequence'].str.len()

#categorical columns
cat_cols=['target_id', 'resname']

#dataset inspection
print('unique ids:', len(train_full_df['target_id'].unique()))
print('unique resname:', len(train_full_df['resname'].unique()))
print(train_full_df['resname'].unique())
print('total datapoints:', len(train_full_df))

#Encode and create an embedding features from the target ids
encoded_ids, id_maps= pd.factorize(train_full_df['target_id'])
id_tensor=torch.tensor(encoded_ids, dtype=torch.long)
embed_size=int(len(train_full_df['target_id'].unique()))
embedding_dim=min(50, np.power(len(train_full_df['target_id'].unique()),0.25))
#print(np.ceil(embedding_dim))
embed_dim=int(np.ceil(embedding_dim))
embed_layer=nn.Embedding(num_embeddings=embed_size, embedding_dim=embed_dim)
embed_ids=embed_layer(id_tensor).detach().cpu().numpy()
embed_id_df=pd.DataFrame(embed_ids, columns=[f'embedim_{i}' for i in range(embed_dim)])

#TF-IDF transform description and all-sequences columns into word embedding features
tfidf_vec=TfidfVectorizer(min_df=5, max_df=0.9, max_features=5000) 

train_full_df['combined_descrip_allseqs']=train_full_df['description']+' '+train_full_df['all_sequences']

tfidf_all=tfidf_vec.fit_transform(train_full_df['combined_descrip_allseqs'])

#after inspecting some allsequence text some have 'xxxxxxxxxx'
# ft_names=list(tfidf_vec.get_feature_names_out())

# pattern = f"({re.escape('x')})\\1*"
# filter_words=[f for f in ft_names if not f.isnumeric() and not re.fullmatch(pattern, f)]

# print(len(filter_words))

tfidf_all_df=pd.DataFrame(tfidf_all.toarray(), columns=tfidf_vec.get_feature_names_out())

tfidf_all_df.head(3)

unique ids: 692
unique resname: 6
['G' 'U' 'C' 'A' '-' 'X']
total datapoints: 113616


Unnamed: 0,01,02,03,04,05,10,100440,10045,100673,10090,...,zb,zc,zd,ze,zf,zg,zh,zi,zika,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
embed_id_df.head(1)

Unnamed: 0,embedim_0,embedim_1,embedim_2,embedim_3,embedim_4,embedim_5
0,-0.91515,0.464639,1.085058,-0.012506,-0.527794,-1.678841


In [16]:
#try to reduce the number of word features
var_selector=VarianceThreshold(threshold=0.01)
da_select=var_selector.fit_transform(tfidf_all_df) #transform on test df
ft_names=var_selector.get_feature_names_out()
non_num_ftnames=[f for f in ft_names if not f.isnumeric()]
reduced_wfts_df=pd.DataFrame(da_select, columns=ft_names)[non_num_ftnames]
reduced_wfts_df.head(3)

Unnamed: 0,aestivum,auth,bacillus,coli,cuniculus,drosophila,escherichia,homo,jannaschii,melanogaster,methanococcus,oryctolagus,sapiens,subtilis,thermophilus,thermus,triticum
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
#Encode the resname
ohe_resname=pd.get_dummies(train_full_df['resname'], dtype='int')
#ohe_resname
# ohe_rdf=pd.DataFrame(ohe_resname.toarray(), columns=ohe.get_features_names_out())
ohe_resname=ohe_resname.drop(['-'], axis=1) #reduce dimensionality
ohe_resname.head(3)

Unnamed: 0,A,C,G,U,X
0,0,0,1,0,0
1,0,0,1,0,0
2,0,0,1,0,0


In [18]:
train_full_df['pdb_monomer']=train_full_df['pdb_monomer'].astype(int)
train_full_df['pdb_monomer'].unique()

array([   1,    2,    3, ..., 4296, 4297, 4298])

In [19]:
#numerical columns
num_cols=['resid', 'pdb_monomer', 'seq_len']

#TODO: merge features into X and run model; 
full_prep_train=pd.concat([embed_id_df,train_full_df[num_cols], reduced_wfts_df, ohe_resname], axis=1)
full_prep_train.head(3)

Unnamed: 0,embedim_0,embedim_1,embedim_2,embedim_3,embedim_4,embedim_5,resid,pdb_monomer,seq_len,aestivum,...,sapiens,subtilis,thermophilus,thermus,triticum,A,C,G,U,X
0,-0.91515,0.464639,1.085058,-0.012506,-0.527794,-1.678841,1,1,29,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0
1,-0.91515,0.464639,1.085058,-0.012506,-0.527794,-1.678841,2,2,29,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0
2,-0.91515,0.464639,1.085058,-0.012506,-0.527794,-1.678841,3,3,29,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0


In [20]:
tfidf_vec=TfidfVectorizer(min_df=5, max_df=0.9, max_features=5000)
var_selector=VarianceThreshold(threshold=0.01)
training_id_maps=[]
training_embed_layer={}
training_embed_dim={}

def preprocessing(df_path, label_path, dataset_type):
    if dataset_type not in ['train', 'validation', 'test']:
       return "Error! Must choose option out of 'train', 'validation', or 'test'"
    
    df=pd.read_csv(df_path)
    ldf=pd.read_csv(label_path)
    
    if dataset_type=='train':
        df=df[df['temporal_cutoff']<"2022-05-27"]
    ldf['target_id']=ldf['ID'].str.rsplit('_',n=1).str[0]
    ldf['pdb_monomer']=ldf['ID'].str.rsplit('_',n=1).str[1]
    merged_df=df.merge(ldf, how='left', on='target_id')

    #fill in nan description in all_sequences with "missing"
    merged_df['all_sequences']=merged_df['all_sequences'].astype(str).fillna("missing")

    #fill in missing coordinates with mean values
    target_coordinates=['x_1','y_1','z_1']
    for t in target_coordinates:
        t_mean=merged_df[t].mean()
        merged_df[t]=merged_df[t].fillna(t_mean)

    merged_df['pdb_monomer']=merged_df['pdb_monomer'].astype(int) #pdb_monomer feature
    merged_df['seq_len']=merged_df['sequence'].str.len() #sequence len feature
    
    #encoding target id and creating embeddings
    if dataset_type=='train':
        encoded_ids, id_maps=pd.factorize(merged_df['target_id']) 
        training_id_maps.extend(id_maps)
        id_tensor=torch.tensor(encoded_ids, dtype=torch.long)
        embed_size=int(len(merged_df['target_id'].unique()))
        embedding_dim=min(50, np.power(len(merged_df['target_id'].unique()),0.25)) #rule of thumb
        embed_dim=int(np.ceil(embedding_dim))
        training_embed_dim['train']=embed_dim
        embed_layer=nn.Embedding(num_embeddings=embed_size, embedding_dim=embed_dim)
        training_embed_layer['train_embed_layer']=embed_layer
        embed_ids=embed_layer(id_tensor).detach().cpu().numpy()
        embed_id_df=pd.DataFrame(embed_ids, columns=[f'embedim_{i}' for i in range(embed_dim)])
    elif dataset_type=="validation" or dataset_type=="test":
        map_list=list(training_id_maps)
        encoded_ids=pd.Series(merged_df['target_id']).map({cat:i for i, cat in enumerate(map_list)}).fillna(-1).astype(int)
        id_tensor=torch.tensor(encoded_ids.clip(0), dtype=torch.long) #make sure no unknown categories are passed to embeddings
        embed_ids=training_embed_layer['train_embed_layer'](id_tensor).detach().cpu().numpy()
        embed_id_df=pd.DataFrame(embed_ids, columns=[f'embedim_{i}' for i in range(training_embed_dim['train'])])
    
    #tf-idf transformed description and all-sequences columns into word embedding features
    merged_df['combined__descrip_allseqs']=merged_df['description']+' '+merged_df['all_sequences']
    if dataset_type=="train":
        tfidf_all=tfidf_vec.fit_transform(merged_df['combined__descrip_allseqs'])
        tfidf_all_df=pd.DataFrame(tfidf_all.toarray(), columns=tfidf_vec.get_feature_names_out())
        da_select=var_selector.fit_transform(tfidf_all_df) #transform on test df
    elif dataset_type=="validation" and (tfidf_vec.get_feature_names_out().size!=0):
        tfidf_all=tfidf_vec.transform(merged_df['combined__descrip_allseqs'])
        tfidf_all_df=pd.DataFrame(tfidf_all.toarray(), columns=tfidf_vec.get_feature_names_out())
        da_select=var_selector.transform(tfidf_all_df) 
    elif dataset_type=="test" and (tfidf_vec.get_feature_names_out().size!=0):
        tfidf_all=tfidf_vec.transform(merged_df['combined__descrip_allseqs'])
        tfidf_all_df=pd.DataFrame(tfidf_all.toarray(), columns=tfidf_vec.get_feature_names_out())
        da_select=var_selector.transform(tfidf_all_df) 
    ft_names=var_selector.get_feature_names_out()
    non_num_ftnames=[f for f in ft_names if not f.isnumeric()]
    reduced_wfts_df=pd.DataFrame(da_select, columns=ft_names)[non_num_ftnames]
    
    #resname feature: RNA A-U, C-G pairings
    ohe_resname=pd.get_dummies(merged_df['resname'], dtype='int') 
    if '-' in ohe_resname.columns:
        ohe_resname=ohe_resname.drop(['-'], axis=1)
    if 'X' in ohe_resname.columns:
        ohe_resname=ohe_resname.drop(['X'], axis=1)

    num_cols=['resid', 'pdb_monomer', 'seq_len']
    final_df=pd.concat([merged_df['sequence'], embed_id_df,merged_df[num_cols], reduced_wfts_df, ohe_resname], axis=1)

    return final_df, merged_df[target_coordinates]

In [21]:
#streamline the preprocessing for validation and test sets; run evaluations
training_df, training_coord=preprocessing(df_path=training_path, label_path=training_labels_path, dataset_type='train')

In [22]:
# all(training_df==full_prep_train)
# if training_id_maps:
#     print(True)
# training_embed_layer
training_df.shape
#training_coord[['x_1']]

(113616, 31)

In [23]:
validation_df, validation_coord=preprocessing(df_path=validation_path, label_path=validation_labels_path, dataset_type='validation')

In [24]:
validation_df.head(1)
print(validation_df.shape)

(2515, 31)


In [25]:
#testing_coord should be empty since not provided initially
testing_df, testing_coord=preprocessing(df_path=testing_path, label_path=submission_path, dataset_type='test')
testing_df.head(1)
print(testing_df.shape)

(2515, 31)


In [26]:
for tc in list(training_df.columns): 
    if tc not in list(validation_df.columns):
        print(tc)
print(training_df.columns)
print(validation_df.columns)

Index(['sequence', 'embedim_0', 'embedim_1', 'embedim_2', 'embedim_3',
       'embedim_4', 'embedim_5', 'resid', 'pdb_monomer', 'seq_len', 'aestivum',
       'auth', 'bacillus', 'coli', 'cuniculus', 'drosophila', 'escherichia',
       'homo', 'jannaschii', 'melanogaster', 'methanococcus', 'oryctolagus',
       'sapiens', 'subtilis', 'thermophilus', 'thermus', 'triticum', 'A', 'C',
       'G', 'U'],
      dtype='object')
Index(['sequence', 'embedim_0', 'embedim_1', 'embedim_2', 'embedim_3',
       'embedim_4', 'embedim_5', 'resid', 'pdb_monomer', 'seq_len', 'aestivum',
       'auth', 'bacillus', 'coli', 'cuniculus', 'drosophila', 'escherichia',
       'homo', 'jannaschii', 'melanogaster', 'methanococcus', 'oryctolagus',
       'sapiens', 'subtilis', 'thermophilus', 'thermus', 'triticum', 'A', 'C',
       'G', 'U'],
      dtype='object')


In [36]:
#y: coordinates
y_list=training_coordinates

xgb_model_yi={}
cb_model_yi={}

train_df=training_df.drop(['sequence'], axis=1)

for y in y_list:
    catboost_model=CatBoostRegressor(iterations=5,
                                     depth=7,
                                     learning_rate=0.1,
                                     loss_function='RMSE',
                                     verbose=False)
    catboost_model.fit(train_df, training_coord[[y]])
    cb_model_yi[y]=catboost_model
    
    xgboost_model=xgb.XGBRegressor(
        objective='reg:squarederror', #since there are negative coordinates
        n_estimators=100 , 
        max_depth=7,
        learning_rate=0.1,
        subsample=0.6,
        colsample_bytree=0.6
        )
    
    xgboost_model.fit(train_df, training_coord[[y]])
    xgb_model_yi[y]=xgboost_model

In [38]:
from collections import defaultdict
xgb_val_predictions={}
xgb_val_results=defaultdict(list)

cb_val_predictions={}
cb_val_results=defaultdict(list)

for y in y_list:
    val_df=validation_df.drop(['sequence'], axis=1)

    cb_preds=cb_model_yi[y].predict(val_df)
    rmse=np.sqrt(mean_squared_error(validation_coord[y], cb_preds))
    cb_val_predictions[y]=cb_preds
    cb_val_results[y].append(rmse)
    
    xgb_preds=xgb_model_yi[y].predict(val_df)
    rmse=np.sqrt(mean_squared_error(validation_coord[y], xgb_preds))
    xgb_val_predictions[y]=xgb_preds
    xgb_val_results[y].append(rmse)
    

In [39]:
#inspect catboost and xgboost perform identically
cb_val_result_df=pd.DataFrame(cb_val_results)
xgb_val_result_df=pd.DataFrame(xgb_val_results)

In [40]:
cb_val_result_df

Unnamed: 0,x_1,y_1,z_1
0,7.722833e+16,7.722833e+16,7.722833e+16


In [41]:
xgb_val_result_df

Unnamed: 0,x_1,y_1,z_1
0,7.722833e+16,7.722833e+16,7.722833e+16


In [42]:
validation_coord[y_list]

Unnamed: 0,x_1,y_1,z_1
0,-5.499000,8.520000,8.605000
1,-5.826000,10.453000,14.010000
2,-5.849000,14.768000,17.584999
3,-5.784000,19.985001,18.666000
4,-5.755000,25.533001,17.132999
...,...,...,...
2510,87.870003,105.432999,115.183998
2511,92.911003,105.394997,113.741997
2512,99.012001,105.749001,113.073997
2513,103.861000,103.453003,114.589996


In [43]:
pd.DataFrame(cb_val_predictions) 

Unnamed: 0,x_1,y_1,z_1
0,51.250184,54.168855,66.134419
1,51.250184,54.168855,66.134419
2,51.250184,54.168855,66.134419
3,51.250184,54.168855,66.134419
4,51.250184,54.168855,66.134419
...,...,...,...
2510,83.195197,60.755837,80.382719
2511,83.195197,60.755837,80.382719
2512,83.195197,60.755837,80.382719
2513,83.195197,60.755837,80.382719


In [44]:
pd.DataFrame(xgb_val_predictions) 

Unnamed: 0,x_1,y_1,z_1
0,29.118488,20.492775,29.735546
1,29.118488,20.492775,28.633831
2,29.118488,20.492775,28.633831
3,29.118488,20.492775,28.633831
4,29.118488,20.701342,28.633831
...,...,...,...
2510,98.356750,103.878250,135.236023
2511,99.060646,103.878250,135.236023
2512,99.060646,103.878250,135.236023
2513,99.060646,103.878250,135.236023
