In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV


# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

from sklearn.model_selection import train_test_split

import nltk

In [2]:
# Mount Google Drive at /content/drive (requires the Colab runtime, which provides
# the google.colab module); later cells read the data files from this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Read the JSON Lines training file and index the frame by its 'index' column.
train_path = '/content/drive/MyDrive/1 MIDS/W207 Applied Machine Learning/Final Project/train.json'
full_train = pd.read_json(train_path, lines=True).set_index(keys='index')

# Print the column summary, then show the first rows as the cell's output.
full_train.info()
full_train.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2400 entries, 0 to 2399
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   2400 non-null   object 
 1   sequence             2400 non-null   object 
 2   structure            2400 non-null   object 
 3   predicted_loop_type  2400 non-null   object 
 4   signal_to_noise      2400 non-null   float64
 5   SN_filter            2400 non-null   int64  
 6   seq_length           2400 non-null   int64  
 7   seq_scored           2400 non-null   int64  
 8   reactivity_error     2400 non-null   object 
 9   deg_error_Mg_pH10    2400 non-null   object 
 10  deg_error_pH10       2400 non-null   object 
 11  deg_error_Mg_50C     2400 non-null   object 
 12  deg_error_50C        2400 non-null   object 
 13  reactivity           2400 non-null   object 
 14  deg_Mg_pH10          2400 non-null   object 
 15  deg_pH10             2400 non-null   o

Unnamed: 0_level_0,id,sequence,structure,predicted_loop_type,signal_to_noise,SN_filter,seq_length,seq_scored,reactivity_error,deg_error_Mg_pH10,deg_error_pH10,deg_error_Mg_50C,deg_error_50C,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,id_001f94081,GGAAAAGCUCUAAUAACAGGAGACUAGGACUACGUAUUUCUAGGUA...,.....((((((.......)))).)).((.....((..((((((......,EEEEESSSSSSHHHHHHHSSSSBSSXSSIIIIISSIISSSSSSHHH...,6.894,1,107,68,"[0.1359, 0.20700000000000002, 0.1633, 0.1452, ...","[0.26130000000000003, 0.38420000000000004, 0.1...","[0.2631, 0.28600000000000003, 0.0964, 0.1574, ...","[0.1501, 0.275, 0.0947, 0.18660000000000002, 0...","[0.2167, 0.34750000000000003, 0.188, 0.2124, 0...","[0.3297, 1.5693000000000001, 1.1227, 0.8686, 0...","[0.7556, 2.983, 0.2526, 1.3789, 0.637600000000...","[2.3375, 3.5060000000000002, 0.3008, 1.0108, 0...","[0.35810000000000003, 2.9683, 0.2589, 1.4552, ...","[0.6382, 3.4773, 0.9988, 1.3228, 0.78770000000..."
1,id_0049f53ba,GGAAAAAGCGCGCGCGGUUAGCGCGCGCUUUUGCGCGCGCUGUACC...,.....(((((((((((((((((((((((....)))))))))).)))...,EEEEESSSSSSSSSSSSSSSSSSSSSSSHHHHSSSSSSSSSSBSSS...,0.193,0,107,68,"[2.8272, 2.8272, 2.8272, 4.7343, 2.5676, 2.567...","[73705.3985, 73705.3985, 73705.3985, 73705.398...","[10.1986, 9.2418, 5.0933, 5.0933, 5.0933, 5.09...","[16.6174, 13.868, 8.1968, 8.1968, 8.1968, 8.19...","[15.4857, 7.9596, 13.3957, 5.8777, 5.8777, 5.8...","[0.0, 0.0, 0.0, 2.2965, 0.0, 0.0, 0.0, 0.0, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4.947, 4.4523, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4.8511, 4.0426, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[7.6692, 0.0, 10.9561, 0.0, 0.0, 0.0, 0.0, 0.0..."
2,id_006f36f57,GGAAAGUGCUCAGAUAAGCUAAGCUCGAAUAGCAAUCGAAUAGAAU...,.....((((.((.....((((.(((.....)))..((((......)...,EEEEESSSSISSIIIIISSSSMSSSHHHHHSSSMMSSSSHHHHHHS...,8.8,1,107,68,"[0.0931, 0.13290000000000002, 0.11280000000000...","[0.1365, 0.2237, 0.1812, 0.1333, 0.1148, 0.160...","[0.17020000000000002, 0.178, 0.111, 0.091, 0.0...","[0.1033, 0.1464, 0.1126, 0.09620000000000001, ...","[0.14980000000000002, 0.1761, 0.1517, 0.116700...","[0.44820000000000004, 1.4822, 1.1819, 0.743400...","[0.2504, 1.4021, 0.9804, 0.49670000000000003, ...","[2.243, 2.9361, 1.0553, 0.721, 0.6396000000000...","[0.5163, 1.6823000000000001, 1.0426, 0.7902, 0...","[0.9501000000000001, 1.7974999999999999, 1.499..."
3,id_0082d463b,GGAAAAGCGCGCGCGCGCGCGCGAAAAAGCGCGCGCGCGCGCGCGC...,......((((((((((((((((......))))))))))))))))((...,EEEEEESSSSSSSSSSSSSSSSHHHHHHSSSSSSSSSSSSSSSSSS...,0.104,0,107,68,"[3.5229, 6.0748, 3.0374, 3.0374, 3.0374, 3.037...","[73705.3985, 73705.3985, 73705.3985, 73705.398...","[11.8007, 12.7566, 5.7733, 5.7733, 5.7733, 5.7...","[121286.7181, 121286.7182, 121286.7181, 121286...","[15.3995, 8.1124, 7.7824, 7.7824, 7.7824, 7.78...","[0.0, 2.2399, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[0.0, -0.5083, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","[3.4248, 6.8128, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, -0.8365, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","[7.6692, -1.3223, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
4,id_0087940f4,GGAAAAUAUAUAAUAUAUUAUAUAAAUAUAUUAUAGAAGUAUAAUA...,.....(((((((.((((((((((((.(((((((((....)))))))...,EEEEESSSSSSSBSSSSSSSSSSSSBSSSSSSSSSHHHHSSSSSSS...,0.423,0,107,68,"[1.665, 2.1728, 2.0041, 1.2405, 0.620200000000...","[4.2139, 3.9637000000000002, 3.2467, 2.4716, 1...","[3.0942, 3.015, 2.1212, 2.0552, 0.881500000000...","[2.6717, 2.4818, 1.9919, 2.5484999999999998, 1...","[1.3285, 3.6173, 1.3057, 1.3021, 1.1507, 1.150...","[0.8267, 2.6577, 2.8481, 0.40090000000000003, ...","[2.1058, 3.138, 2.5437000000000003, 1.0932, 0....","[4.7366, 4.6243, 1.2068, 1.1538, 0.0, 0.0, 0.7...","[2.2052, 1.7947000000000002, 0.7457, 3.1233, 0...","[0.0, 5.1198, -0.3551, -0.3518, 0.0, 0.0, 0.0,..."


In [3]:
# Read the JSON Lines test file and index the frame by its 'index' column.
test_path = '/content/drive/MyDrive/1 MIDS/W207 Applied Machine Learning/Final Project/test.json'
full_test = pd.read_json(test_path, lines=True).set_index(keys='index')

# Print the column summary, then show the first rows as the cell's output.
full_test.info()
full_test.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3634 entries, 0 to 3633
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   3634 non-null   object
 1   sequence             3634 non-null   object
 2   structure            3634 non-null   object
 3   predicted_loop_type  3634 non-null   object
 4   seq_length           3634 non-null   int64 
 5   seq_scored           3634 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 198.7+ KB


Unnamed: 0_level_0,id,sequence,structure,predicted_loop_type,seq_length,seq_scored
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,id_00073f8be,GGAAAAGUACGACUUGAGUACGGAAAACGUACCAACUCGAUUAAAA...,......((((((((((.(((((.....))))))))((((((((......,EEEEEESSSSSSSSSSBSSSSSHHHHHSSSSSSSSSSSSSSSSHHH...,107,68
1,id_000ae4237,GGAAACGGGUUCCGCGGAUUGCUGCUAAUAAGAGUAAUCUCUAAAU...,.....((((..((((((...(((((.....((((....)))).......,EEEEESSSSIISSSSSSIIISSSSSIIIIISSSSHHHHSSSSIIII...,130,91
2,id_00131c573,GGAAAACAAAACGGCCUGGAAGACGAAGGAAUUCGGCGCGAAGGCC...,...........((.(((.(.(..((..((..((((...))))..))...,EEEEEEEEEEESSISSSISISIISSIISSIISSSSHHHSSSSIISS...,107,68
3,id_00181fd34,GGAAAGGAUCUCUAUCGAAGGAUAGAGAUCGCUCGCGACGGCACGA...,......((((((((((....))))))))))((((((..((.(((.....,EEEEEESSSSSSSSSSHHHHSSSSSSSSSSSSSSSSIISSISSSHH...,107,68
4,id_0020473f7,GGAAACCCGCCCGCGCCCGCCCGCGCUGCUGCCGUGCCUCCUCUCC...,.....(((((((((((((((((((((((((((((((((((((((((...,EEEEESSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS...,130,91


The test set is a mixture of 'public test' and 'private test' data, which differ in `seq_length` / `seq_scored`, so we separate the two sets.

In [4]:
# Partition the test rows by sequence length: 107 -> public_test, 130 -> private_test.
public_test = full_test.loc[full_test['seq_length'] == 107]
private_test = full_test.loc[full_test['seq_length'] == 130]

# Report how many rows and columns ended up in each subset.
print(public_test.shape)
print(private_test.shape)


(629, 6)
(3005, 6)


In [5]:
# Describe object, float and int columns together, reporting the
# 20/40/60/80th percentiles for the numeric ones.
perc = [0.2, 0.4, 0.6, 0.8]
include = ['object', 'float', 'int']
descriptive_summary = full_train.describe(include=include, percentiles=perc)

descriptive_summary

Unnamed: 0,id,sequence,structure,predicted_loop_type,signal_to_noise,SN_filter,seq_length,seq_scored,reactivity_error,deg_error_Mg_pH10,deg_error_pH10,deg_error_Mg_50C,deg_error_50C,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C
count,2400,2400,2400,2400,2400.0,2400.0,2400.0,2400.0,2400,2400,2400,2400,2400,2400,2400,2400,2400,2400
unique,2400,2400,1507,1530,,,,,2394,2394,2389,2390,2387,2395,2394,2391,2392,2385
top,id_1f465a329,GGAAAGUGCGGCCGCGUAUUUCGACAAGAUAGCGAAAGAUAGCACG...,.................................................,EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE...,,,,,"[146151.225, 146151.225, 146151.225, 146151.22...","[104235.1742, 104235.1742, 104235.1742, 104235...","[314833.5695, 314833.5695, 314833.5695, 314833...","[171525.3217, 171525.3217, 171525.3217, 171525...","[271158.604, 271158.604, 271158.604, 271158.60...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
freq,1,1,278,278,,,,,6,6,10,9,13,6,6,10,9,15
mean,,,,,4.530456,0.662083,107.0,68.0,,,,,,,,,,
std,,,,,2.835142,0.473099,0.0,0.0,,,,,,,,,,
min,,,,,-0.103,0.0,107.0,68.0,,,,,,,,,,
20%,,,,,1.8568,0.0,107.0,68.0,,,,,,,,,,
40%,,,,,3.7152,1.0,107.0,68.0,,,,,,,,,,
50%,,,,,4.4425,1.0,107.0,68.0,,,,,,,,,,


The data structure is fairly complicated: sequence lengths can differ between data sets, so we double-check that every sequence has the expected length.

In [6]:
def check_seq_length (data,seq, expected_length):
  """Check, row by row, that a sequence column has the length another column declares.

  Args:
    data: DataFrame holding both columns.
    seq: name of the column whose per-row value is measured with ``len``.
    expected_length: name of the column holding the length each row should have.

  Returns:
    The row-wise result of ``data.apply``: True where the two lengths agree.
  """
  def row_matches(row):
    # Compare the actual length of the sequence with the declared one.
    return len(row[seq]) == row[expected_length]

  return data.apply(row_matches, axis=1)

# Map each length column to the sequence columns it should describe:
# the inputs span seq_length positions, the measured targets span seq_scored.
length_checks = {
    'seq_length': ['sequence', 'structure', 'predicted_loop_type'],
    'seq_scored': ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C'],
}

for expected_col, seq_cols in length_checks.items():
  for seq in seq_cols:
    print(f'Is the length for {seq} as expected?', all(check_seq_length(full_train, seq, expected_col)))



Is the length for sequence as expected? True
Is the length for structure as expected? True
Is the length for predicted_loop_type as expected? True
Is the length for reactivity as expected? True
Is the length for deg_Mg_pH10 as expected? True
Is the length for deg_pH10 as expected? True
Is the length for deg_Mg_50C as expected? True
Is the length for deg_50C as expected? True


In [7]:
# Vocabulary: maps every character of '().ACGUBEHIMSX' to its position in that string.
token2int = {x:i for i, x in enumerate('().ACGUBEHIMSX')}

def tokenise(data, dic):
  """Encode one row's sequence, structure and loop-type strings as integers.

  The three strings are concatenated character by character, in the order
  ``sequence``, ``structure``, ``predicted_loop_type``, and each character is
  replaced by its value in ``dic``.

  Args:
    data: object exposing ``sequence``, ``structure`` and
      ``predicted_loop_type`` attributes (e.g. a DataFrame row).
    dic: mapping from single-character token to its code.

  Returns:
    1-D numpy array of codes, one per character, with the dtype that
    ``np.array(list(dic.values()))`` would have.

  Raises:
    ValueError: if any character is missing from ``dic``. The previous
      ``np.searchsorted`` lookup returned an insertion point for such
      characters, silently encoding them as a neighbouring token.
  """
  chars = list(data.sequence) + list(data.structure) + list(data.predicted_loop_type)

  # Fail loudly on out-of-vocabulary characters instead of mis-encoding them.
  unknown = sorted(set(chars) - set(dic))
  if unknown:
    raise ValueError(f'Unknown token(s) {unknown}: not present in the vocabulary')

  # Keep the dtype of the vocabulary values so empty inputs stay consistent too.
  value_dtype = np.array(list(dic.values())).dtype
  return np.array([dic[ch] for ch in chars], dtype=value_dtype)

# Encode every training row into one integer vector using the token2int vocabulary.
full_train['feature_array'] = full_train.apply(tokenise, axis=1, args=(token2int,))

In [10]:
# Stack the per-row token vectors into one array and show its shape
# as a sanity check that every row was encoded to the same length.
np.array(full_train['feature_array'].tolist()).shape

(2400, 321)

In [20]:
 # Keep 80% of the labelled rows for training; the other 20% (dev_test) is held out.
 train_data, dev_test = train_test_split(full_train, random_state=34, test_size=0.2)
 train_data.shape

(1920, 19)

In [21]:
 # Halve the held-out rows into a dev set and a test set.
 dev_data, test_data = train_test_split(dev_test, random_state=34, test_size=0.5)
 dev_data.shape

(240, 19)

In [17]:
# Show the encoded feature vector of the first training row.
train_data['feature_array'].iloc[0]

array([ 5,  5,  3,  3,  3,  3,  5,  5,  5,  3,  4,  4,  4,  3,  5,  5,  5,
        4,  4,  4,  3,  5,  5,  3,  5,  3,  4,  5,  3,  4,  5,  5,  4,  4,
        4,  3,  5,  5,  5,  3,  3,  3,  4,  4,  5,  5,  5,  4,  4,  5,  4,
        5,  4,  4,  4,  5,  5,  5,  4,  4,  4,  5,  5,  5,  4,  4,  4,  3,
        5,  4,  5,  4,  3,  6,  4,  6,  6,  4,  5,  5,  3,  6,  5,  4,  5,
        4,  3,  3,  3,  3,  5,  3,  3,  3,  4,  3,  3,  4,  3,  3,  4,  3,
        3,  4,  3,  3,  4,  2,  2,  2,  2,  2,  2,  0,  0,  0,  2,  0,  0,
        0,  2,  0,  0,  0,  0,  0,  0,  2,  0,  0,  2,  0,  2,  0,  0,  2,
        0,  0,  0,  0,  0,  0,  2,  0,  0,  2,  2,  2,  2,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  2,  0,  0,  0,  0,  0,  0,  0,  2,  2,  2,  2,  1,
        1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  8,  8,  8,  8,  8,  8, 12,
       12, 12,  7, 12, 12

In [15]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Stack the per-row lists into arrays: token features as inputs,
# per-position reactivity values as the regression targets.
train_feature = np.array(train_data['feature_array'].tolist())
train_reactivity = np.array(train_data['reactivity'].tolist())

dev_feature = np.array(dev_data['feature_array'].tolist())
dev_reactivity = np.array(dev_data['reactivity'].tolist())


# Baseline: an MLPRegressor with default hyper-parameters.
# NOTE: MLPRegressor.score returns R^2, not a classification accuracy;
# the printed label is kept unchanged so outputs stay comparable.
clf_single = MLPRegressor()
clf_single.fit(train_feature, train_reactivity)
print(f'Initial model accuracy {clf_single.score(dev_feature, dev_reactivity)}')


# Hyper-parameter grid: regularisation strength (alpha) and hidden-layer layout.
params = {'alpha': [1.0e-10, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0],
          'hidden_layer_sizes': [(100,), (200,), (100, 100), (100, 50, 50, 50)]
          }

# 5-fold cross-validated grid search on the training split.
clf = GridSearchCV(clf_single, params, cv=5)
clf.fit(train_feature, train_reactivity)

# GridSearchCV (refit=True by default) has already re-trained best_estimator_
# on the whole training split, so it is evaluated as-is. Fitting it again would
# only replace the selected model with a freshly initialised one.
clf_best = clf.best_estimator_
print(f'Final model accuracy {clf_best.score(dev_feature, dev_reactivity)}')


Initial model accuracy 0.02362154018193479




Initial model accuracy 0.10742336126189678




In [29]:
def build_nn(input_feature, output_target, params,  dev_feature, dev_target):
  """Grid-search an MLPRegressor and return the best model for the given target.

  Args:
    input_feature: training inputs passed to ``GridSearchCV.fit``.
    output_target: training targets for the same rows.
    params: hyper-parameter grid handed to ``GridSearchCV``.
    dev_feature: held-out inputs, used only for the printed score.
    dev_target: held-out targets matching ``dev_feature``.

  Returns:
    The search's ``best_estimator_``, trained on ``input_feature`` and
    ``output_target``.
  """
  clf_single = MLPRegressor(max_iter=300)

  # 5-fold cross-validated search over the supplied grid.
  clf = GridSearchCV(clf_single, params, cv=5)
  clf.fit(input_feature, output_target)

  # With the default refit=True, best_estimator_ is already re-trained on all of
  # input_feature / output_target. The former extra fit used the notebook globals
  # train_feature / train_reactivity, so every target ended up with a model
  # trained on reactivity; it has been removed.
  clf_best = clf.best_estimator_

  # score() is R^2 for a regressor; the label is kept for output continuity.
  print(f'Final model accuracy {clf_best.score(dev_feature, dev_target)}')
  return clf_best

In [None]:
# Columns to model, one network per column.
targets = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']

# Hyper-parameter grid shared by every target.
params = {
    'alpha': [1.0e-10, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0],
    'hidden_layer_sizes': [(100,), (200,), (100, 100), (100, 50, 50, 50)],
}

# The input features are the same for all targets, so build them once.
train_feature = np.array(train_data['feature_array'].tolist())
dev_feature = np.array(dev_data['feature_array'].tolist())

# Best model per target, in the same order as `targets`.
nn_estimator = []
for target in targets:
  output_target = np.array(train_data[target].tolist())
  dev_target = np.array(dev_data[target].tolist())

  best_model = build_nn(train_feature, output_target, params, dev_feature, dev_target)
  nn_estimator.append(best_model)

