In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

import nltk

In [2]:
# Attach Google Drive to the Colab VM so files stored there can be read
# through the regular filesystem under the mount point below.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read the training set from the mounted Drive folder. `lines=True` makes
# pandas parse the file as JSON Lines (one JSON record per line).
train_json_path = '/content/drive/MyDrive/1 MIDS/W207 Applied Machine Learning/Final Project/train.json'
df = pd.read_json(train_json_path, lines=True)

# Print the column dtypes and non-null counts, then show the frame itself
# as the cell's rich output.
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400 entries, 0 to 2399
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   index                2400 non-null   int64  
 1   id                   2400 non-null   object 
 2   sequence             2400 non-null   object 
 3   structure            2400 non-null   object 
 4   predicted_loop_type  2400 non-null   object 
 5   signal_to_noise      2400 non-null   float64
 6   SN_filter            2400 non-null   int64  
 7   seq_length           2400 non-null   int64  
 8   seq_scored           2400 non-null   int64  
 9   reactivity_error     2400 non-null   object 
 10  deg_error_Mg_pH10    2400 non-null   object 
 11  deg_error_pH10       2400 non-null   object 
 12  deg_error_Mg_50C     2400 non-null   object 
 13  deg_error_50C        2400 non-null   object 
 14  reactivity           2400 non-null   object 
 15  deg_Mg_pH10          2400 non-null   o

Unnamed: 0,index,id,sequence,structure,predicted_loop_type,signal_to_noise,SN_filter,seq_length,seq_scored,reactivity_error,deg_error_Mg_pH10,deg_error_pH10,deg_error_Mg_50C,deg_error_50C,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C
0,0,id_001f94081,GGAAAAGCUCUAAUAACAGGAGACUAGGACUACGUAUUUCUAGGUA...,.....((((((.......)))).)).((.....((..((((((......,EEEEESSSSSSHHHHHHHSSSSBSSXSSIIIIISSIISSSSSSHHH...,6.894,1,107,68,"[0.1359, 0.20700000000000002, 0.1633, 0.1452, ...","[0.26130000000000003, 0.38420000000000004, 0.1...","[0.2631, 0.28600000000000003, 0.0964, 0.1574, ...","[0.1501, 0.275, 0.0947, 0.18660000000000002, 0...","[0.2167, 0.34750000000000003, 0.188, 0.2124, 0...","[0.3297, 1.5693000000000001, 1.1227, 0.8686, 0...","[0.7556, 2.983, 0.2526, 1.3789, 0.637600000000...","[2.3375, 3.5060000000000002, 0.3008, 1.0108, 0...","[0.35810000000000003, 2.9683, 0.2589, 1.4552, ...","[0.6382, 3.4773, 0.9988, 1.3228, 0.78770000000..."
1,1,id_0049f53ba,GGAAAAAGCGCGCGCGGUUAGCGCGCGCUUUUGCGCGCGCUGUACC...,.....(((((((((((((((((((((((....)))))))))).)))...,EEEEESSSSSSSSSSSSSSSSSSSSSSSHHHHSSSSSSSSSSBSSS...,0.193,0,107,68,"[2.8272, 2.8272, 2.8272, 4.7343, 2.5676, 2.567...","[73705.3985, 73705.3985, 73705.3985, 73705.398...","[10.1986, 9.2418, 5.0933, 5.0933, 5.0933, 5.09...","[16.6174, 13.868, 8.1968, 8.1968, 8.1968, 8.19...","[15.4857, 7.9596, 13.3957, 5.8777, 5.8777, 5.8...","[0.0, 0.0, 0.0, 2.2965, 0.0, 0.0, 0.0, 0.0, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4.947, 4.4523, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4.8511, 4.0426, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[7.6692, 0.0, 10.9561, 0.0, 0.0, 0.0, 0.0, 0.0..."
2,2,id_006f36f57,GGAAAGUGCUCAGAUAAGCUAAGCUCGAAUAGCAAUCGAAUAGAAU...,.....((((.((.....((((.(((.....)))..((((......)...,EEEEESSSSISSIIIIISSSSMSSSHHHHHSSSMMSSSSHHHHHHS...,8.800,1,107,68,"[0.0931, 0.13290000000000002, 0.11280000000000...","[0.1365, 0.2237, 0.1812, 0.1333, 0.1148, 0.160...","[0.17020000000000002, 0.178, 0.111, 0.091, 0.0...","[0.1033, 0.1464, 0.1126, 0.09620000000000001, ...","[0.14980000000000002, 0.1761, 0.1517, 0.116700...","[0.44820000000000004, 1.4822, 1.1819, 0.743400...","[0.2504, 1.4021, 0.9804, 0.49670000000000003, ...","[2.243, 2.9361, 1.0553, 0.721, 0.6396000000000...","[0.5163, 1.6823000000000001, 1.0426, 0.7902, 0...","[0.9501000000000001, 1.7974999999999999, 1.499..."
3,3,id_0082d463b,GGAAAAGCGCGCGCGCGCGCGCGAAAAAGCGCGCGCGCGCGCGCGC...,......((((((((((((((((......))))))))))))))))((...,EEEEEESSSSSSSSSSSSSSSSHHHHHHSSSSSSSSSSSSSSSSSS...,0.104,0,107,68,"[3.5229, 6.0748, 3.0374, 3.0374, 3.0374, 3.037...","[73705.3985, 73705.3985, 73705.3985, 73705.398...","[11.8007, 12.7566, 5.7733, 5.7733, 5.7733, 5.7...","[121286.7181, 121286.7182, 121286.7181, 121286...","[15.3995, 8.1124, 7.7824, 7.7824, 7.7824, 7.78...","[0.0, 2.2399, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[0.0, -0.5083, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","[3.4248, 6.8128, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, -0.8365, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","[7.6692, -1.3223, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
4,4,id_0087940f4,GGAAAAUAUAUAAUAUAUUAUAUAAAUAUAUUAUAGAAGUAUAAUA...,.....(((((((.((((((((((((.(((((((((....)))))))...,EEEEESSSSSSSBSSSSSSSSSSSSBSSSSSSSSSHHHHSSSSSSS...,0.423,0,107,68,"[1.665, 2.1728, 2.0041, 1.2405, 0.620200000000...","[4.2139, 3.9637000000000002, 3.2467, 2.4716, 1...","[3.0942, 3.015, 2.1212, 2.0552, 0.881500000000...","[2.6717, 2.4818, 1.9919, 2.5484999999999998, 1...","[1.3285, 3.6173, 1.3057, 1.3021, 1.1507, 1.150...","[0.8267, 2.6577, 2.8481, 0.40090000000000003, ...","[2.1058, 3.138, 2.5437000000000003, 1.0932, 0....","[4.7366, 4.6243, 1.2068, 1.1538, 0.0, 0.0, 0.7...","[2.2052, 1.7947000000000002, 0.7457, 3.1233, 0...","[0.0, 5.1198, -0.3551, -0.3518, 0.0, 0.0, 0.0,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,2395,id_ff84602f7,GGAAAAUAGCAGAGGAAAUACUAGAGCAAUUGCAAAGGCCGAUCAU...,........((..((......))...)).........(((..........,EEEEEEEESSIISSHHHHHHSSIIISSXXXXXXXXXSSSHHHHHHH...,4.036,1,107,68,"[0.2585, 0.29710000000000003, 0.2748, 0.205000...","[0.2745, 0.37010000000000004, 0.291, 0.1137000...","[0.3446, 0.3815, 0.26940000000000003, 0.1182, ...","[0.2093, 0.2985, 0.2922, 0.08360000000000001, ...","[0.29460000000000003, 0.40850000000000003, 0.3...","[0.6957, 1.251, 1.3235999999999999, 0.7521, 0....","[0.6439, 2.0117, 1.3682, 0.0918, 0.65860000000...","[2.1589, 3.3601, 1.6179000000000001, 0.1344000...","[0.47900000000000004, 1.9583, 2.4635, 0.0512, ...","[0.5759000000000001, 2.3736, 1.4158, 0.1914000..."
2396,2396,id_ff85fcdba,GGAAAACAAAAACAAACAACAAAAACAAACAACAAAAACAAACAAC...,.................................................,EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE...,3.227,1,107,68,"[0.2169, 0.2513, 0.2303, 0.22260000000000002, ...","[0.1883, 0.22790000000000002, 0.1497, 0.154300...","[0.5582, 0.4887, 0.3845, 0.34040000000000004, ...","[0.2758, 0.3659, 0.2155, 0.28340000000000004, ...","[0.401, 0.388, 0.3403, 0.3608, 0.3057, 0.242, ...","[0.2891, 0.4496, 0.7165, 0.7128, 0.59310000000...","[0.3619, 0.6924, 0.2988, 0.3639, 0.545, 0.2263...","[2.8541, 1.6106, 1.4343, 1.0797, 0.6803, 0.559...","[0.2964, 0.9351, 0.2555, 0.7603000000000001, 0...","[0.6526000000000001, 0.2548, 0.6927, 0.9316000..."
2397,2397,id_ffa99f541,GGAAAGCCAUACCUAGGCUUCGGCCUAGGUAUGGCGGUGAUCUGGU...,.....(((((((((((((....)))))))))))))(((((((((((...,EEEEESSSSSSSSSSSSSHHHHSSSSSSSSSSSSSSSSSSSSSSSS...,0.345,0,107,68,"[1.5567, 1.2999, 0.9492, 0.9879, 0.8146, 0.455...","[1.9599000000000002, 1.3505, 1.619600000000000...","[3.7554, 2.618, 2.0539, 0.9674, 1.6963, 1.6687...","[1.545, 1.7649, 1.2032, 0.9904000000000001, 1....","[4.747, 4.8132, 3.0986, 2.9818, 2.8738, 1.4856...","[1.6204, 1.7046000000000001, 0.6201, 0.795, 0....","[1.7708, 0.67, 1.7689, 1.0919, 0.0, 0.0, 0.0, ...","[6.4088, 3.4249, 1.649, 0.0, 0.8095, 0.7951, 0...","[0.6038, 3.0322, 0.9703, 0.4756, 0.9154, 0.0, ...","[2.9632, 6.3911999999999995, 1.5339, 1.4749, 1..."
2398,2398,id_ffe06f3fe,GGAAACGAUAGCAGAAGAGAUCGAUAUAGAGCAUAAGCUAAGAAUA...,.....((((..(....)..))))......(((....)))..........,EEEEESSSSIISHHHHSIISSSSXXXXXXSSSHHHHSSSXXXXXXX...,5.553,0,107,68,"[0.1431, 0.1847, 0.15960000000000002, 0.1466, ...","[0.13970000000000002, 0.2404, 0.1443, 0.122400...","[0.2275, 0.2394, 0.14350000000000002, 0.1273, ...","[0.0944, 0.1453, 0.1067, 0.0994, 0.06470000000...","[0.1691, 0.22740000000000002, 0.178, 0.1762, 0...","[0.6919000000000001, 1.4823, 1.3685, 1.2473, 0...","[0.4544, 2.4603, 0.8778, 0.6402, 0.28340000000...","[2.7157999999999998, 3.1249000000000002, 1.137...","[0.3262, 1.3932, 0.8832000000000001, 0.8144, 0...","[0.5814, 1.5119, 1.1749, 1.2676, 0.22190000000..."


In [None]:
# Descriptive statistics for `df`, restricted to object, float and int
# columns and reported at the 20/40/60/80 % percentiles requested below.
perc = [.20, .40, .60, .80]
include = ['object', 'float', 'int']

# describe() counts distinct values in every object column, which raises
# "TypeError: unhashable type: 'list'" when the cells hold Python lists --
# as the per-position columns loaded from train.json do (e.g. `reactivity`,
# `deg_Mg_pH10`, the `*_error` columns). Leave container-valued columns out
# of the summary; a frame without such columns is summarised as before.
unhashable_columns = [
    name for name in df.columns
    if df[name].dtype == object
    and df[name].map(lambda value: isinstance(value, (list, dict, set))).any()
]
descriptive_summary = df.drop(columns=unhashable_columns).describe(
    percentiles=perc, include=include
)

descriptive_summary

Unnamed: 0,index,id,sequence,structure,predicted_loop_type,seq_length,seq_scored
count,3634.0,3634,3634,3634,3634,3634.0,3634.0
unique,,3634,3634,2381,2415,,
top,,id_75ca3a981,GGAAAUUUCGAGUACGGGAAGGUUAAGUGAUUGGCUUCGGGAACUU...,.....((((((((((.....))))))))))....((((((((((.....,EEEEESSSSSSSSSSHHHHHSSSSSSSSSSXXXXSSSSSSSSSSHH...,,
freq,,1,1,175,118,,
mean,1816.5,,,,,126.018987,87.018987
std,1049.189767,,,,,8.702624,8.702624
min,0.0,,,,,107.0,68.0
20%,726.6,,,,,130.0,91.0
40%,1453.2,,,,,130.0,91.0
50%,1816.5,,,,,130.0,91.0


In [None]:
# Shell escape: print the kernel's current working directory.
!pwd

/content
