In [81]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale

### SVM (support vector machine) is good for cases where p >> n
i.e. where the number of columns/features/variables is much greater than the number of observations

In our case, we have 1000-1500 lipid features and only 60 total observations (10 rats, 6 samples each)

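However, training a kernel SVM scales at least as O(n^2) in the number of observations n, so classical SVM tends to overwhelm memory beyond roughly 10,000 rows. That is not a concern for our 60 observations.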

### Questions to answer with machine learning:

1. Is RBG or FBG a better model for blood glucose testing in diabetic Nile rats?  
2. Which features are the best predictors of diabetes in Nile rats? 
    1. Loadings plot in PCA
    2. Dimensionality reduction 

### Models to consider

1. Time series models (TODO: decide which)
2. Dimensionality reduction
    1. LASSO
    2. Ridge/Elastic net regression
3. Accounting for non-independence in observations
    1. Random Effects modelling within Linear Mixed Models (LMM)

4. From the Mike Snyder paper on multiomics correlations to prediabetes (https://www.nature.com/articles/s41586-019-1236-x):
    1. "l1 regularization is used to encourage the sparsity of the learned coefficient."
    2. Regularization is a penalty that increases as model complexity increases
    3. L1 = Lasso Regression
    4. L2 = Ridge Regression
        1. Main difference is the penalty term used: L1 penalizes the sum of the absolute coefficient values, L2 penalizes the sum of the squared coefficients
    

In [154]:
# Load the filtered feature table and the per-file sample metadata.
# Forward slashes keep these relative paths valid on Windows, macOS and Linux;
# the original backslash-separated paths only resolve on Windows.
fr = pd.read_csv('../data/processed/filtered_raw_data.csv', index_col=0)
file_grouping = pd.read_csv('../data/metadata/file_groupings.csv', index_col=0)
fr

Unnamed: 0,Unique Identifier,Retention Time (min),Quant Ion,Polarity,Area (max),Identification,Lipid Class,Features Found,20210729_AJ_Toh_RatBloodGlucose_T1060M_20210322_8wk_FBG,20210729_AJ_Toh_RatBloodGlucose_T1060M_20210325_8wk_RBG,...,20210729_AJ_Toh_RatBloodGlucose_T1101M_20210402_9wk_FBG,20210729_AJ_Toh_RatBloodGlucose_T1101M_20210406_9wk_RBG,20210729_AJ_Toh_RatBloodGlucose_T1101M_20210410_10wk_FBG,20210729_AJ_Toh_RatBloodGlucose_T1101M_20210413_10wk_RBG,20210729_AJ_Toh_RatBloodGlucose_T1102M_20210327_8wk_FBG,20210729_AJ_Toh_RatBloodGlucose_T1102M_20210330_8wk_RBG,20210729_AJ_Toh_RatBloodGlucose_T1102M_20210402_9wk_FBG,20210729_AJ_Toh_RatBloodGlucose_T1102M_20210406_9wk_RBG,20210729_AJ_Toh_RatBloodGlucose_T1102M_20210410_10wk_FBG,20210729_AJ_Toh_RatBloodGlucose_T1102M_20210413_10wk_RBG
0,unknown_679.05463_0.536,0.536,679.05463,+,2.408421e+07,unknown,unknown,49,1.145548e+07,1.728814e+07,...,1.231938e+07,1.258442e+07,6.377443e+06,1.771940e+07,1.647246e+07,1.366983e+07,1.568381e+07,1.290439e+07,1.353820e+07,1.360893e+07
1,unknown_641.1087_0.537,0.537,641.10870,+,5.522238e+07,unknown,unknown,36,4.677270e+07,5.522238e+07,...,3.428703e+07,3.415574e+07,2.455848e+07,3.852845e+07,4.172903e+07,6.220326e+06,4.279894e+07,3.789880e+07,3.219858e+07,2.889174e+07
2,unknown_951.15973_0.541,0.541,951.15973,-,3.238113e+07,unknown,unknown,10,1.625939e+07,3.238113e+07,...,1.802672e+07,1.658884e+07,1.349634e+07,1.569916e+07,1.912593e+07,1.216820e+07,2.036861e+07,1.952822e+07,1.953922e+07,1.543462e+07
3,unknown_329.03067_0.542,0.542,329.03067,-,2.882770e+08,unknown,unknown,55,1.290199e+08,6.760284e+07,...,1.575765e+08,2.173414e+08,1.282113e+08,2.882770e+08,1.142664e+08,1.452781e+08,1.763549e+08,2.236196e+08,1.790767e+08,1.651242e+08
4,unknown_969.13147_0.544,0.544,969.13147,-,1.697296e+07,unknown,unknown,14,1.089197e+07,1.697296e+07,...,7.257362e+06,9.879815e+06,6.178661e+06,6.578465e+06,8.023917e+06,4.351532e+06,1.132409e+07,1.040661e+07,8.373267e+06,9.059848e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1481,unknown_974.9118_20.163,20.163,974.91180,+,4.515337e+06,unknown,unknown,4,6.585959e+05,3.591638e+05,...,3.448085e+05,5.558781e+05,1.153422e+05,3.896841e+05,3.842817e+05,5.605229e+05,3.046913e+05,1.110650e+06,5.464527e+05,7.021282e+05
1482,unknown_593.58716_20.175,20.175,593.58716,+,3.296976e+06,unknown,unknown,19,1.801313e+06,1.867413e+05,...,7.956023e+05,3.804952e+05,4.007204e+05,2.390561e+05,1.291650e+06,2.886770e+05,8.545798e+05,6.331233e+05,1.647502e+06,2.088592e+05
1483,unknown_871.80945_20.196,20.196,871.80945,+,2.595829e+06,unknown,unknown,6,1.053780e+06,7.649254e+04,...,4.949319e+05,1.446089e+05,2.794735e+05,7.948007e+04,8.523295e+05,8.869149e+04,4.884374e+05,1.233201e+05,1.052764e+06,4.261791e+04
1484,unknown_1003.93829_20.435,20.435,1003.93829,+,3.203813e+06,unknown,unknown,4,5.844701e+05,2.637219e+05,...,2.294775e+05,6.374218e+05,4.261154e+04,3.421911e+05,3.890957e+05,7.912981e+05,1.910285e+05,1.147269e+06,4.434897e+05,5.509051e+05


In [40]:
# Names of the files flagged for quantitation (quant_file is True) in the metadata.
quant_filenames = file_grouping.loc[file_grouping['quant_file'] == True, 'File Name'].tolist()
quant_filenames

['20210729_AJ_Toh_RatBloodGlucose_T1076M_20210405_10wk_FBG',
 '20210729_AJ_Toh_RatBloodGlucose_T1062M_20210322_8wk_FBG',
 '20210729_AJ_Toh_RatBloodGlucose_T1093M_20210402_9wk_FBG',
 '20210729_AJ_Toh_RatBloodGlucose_T1060M_20210325_8wk_RBG',
 '20210729_AJ_Toh_RatBloodGlucose_T1091M_20210410_10wk_FBG',
 '20210729_AJ_Toh_RatBloodGlucose_T1092M_20210406_9wk_RBG',
 '20210729_AJ_Toh_RatBloodGlucose_T1093M_20210413_10wk_RBG',
 '20210729_AJ_Toh_RatBloodGlucose_T1082M_20210405_10wk_FBG',
 '20210729_AJ_Toh_RatBloodGlucose_T1060M_20210401_9wk_RBG',
 '20210729_AJ_Toh_RatBloodGlucose_T1101M_20210402_9wk_FBG',
 '20210729_AJ_Toh_RatBloodGlucose_T1062M_20210408_10wk_RBG',
 '20210729_AJ_Toh_RatBloodGlucose_T1091M_20210330_8wk_RBG',
 '20210729_AJ_Toh_RatBloodGlucose_T1074M_20210405_10wk_FBG',
 '20210729_AJ_Toh_RatBloodGlucose_T1101M_20210413_10wk_RBG',
 '20210729_AJ_Toh_RatBloodGlucose_T1076M_20210329_9wk_FBG',
 '20210729_AJ_Toh_RatBloodGlucose_T1092M_20210402_9wk_FBG',
 '20210729_AJ_Toh_RatBloodGlucose

In [74]:
# Peek at the first 5 features of the first 3 quant files, with files as rows.
fr[quant_filenames].iloc[:5, :3].transpose()

Unnamed: 0,0,1,2,3,4
20210729_AJ_Toh_RatBloodGlucose_T1076M_20210405_10wk_FBG,1388749.0,3035806.0,1663567.0,22151940.0,681656.4
20210729_AJ_Toh_RatBloodGlucose_T1062M_20210322_8wk_FBG,10494430.0,25372880.0,14024380.0,118735500.0,6906003.0
20210729_AJ_Toh_RatBloodGlucose_T1093M_20210402_9wk_FBG,9897531.0,22077640.0,8735853.0,111440500.0,3505334.0


array([[0.        , 0.        , 0.        , ..., 0.1264256 , 0.02761148,
        0.        ],
       [0.4012118 , 0.42802335, 0.40240212, ..., 0.14562449, 0.02091204,
        0.03890562],
       [0.37491124, 0.36488002, 0.23023592, ..., 0.79856239, 0.20566073,
        0.25602656],
       ...,
       [0.73288182, 0.65976443, 0.70277078, ..., 0.02169909, 0.01560124,
        0.03127962],
       [0.95062647, 0.81202148, 0.8139675 , ..., 0.0018939 , 0.09156507,
        0.12729206],
       [0.79733941, 0.67962568, 0.51127367, ..., 0.        , 0.06926706,
        0.07341166]])

In [143]:
# Quant files as rows, features as columns.
fr.loc[:, quant_filenames].transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1471,1472,1473,1479,1480,1481,1482,1483,1484,1486
20210729_AJ_Toh_RatBloodGlucose_T1076M_20210405_10wk_FBG,1388749.0,3035806.0,1663567.0,22151940.0,681656.4,8682616.0,374751800.0,190244.8,653076.1,21061100.0,...,340758.0,331084.5,357403.5,87650.36,488164.5,114907.3,216517.0,346056.1,103209.1,13956.36
20210729_AJ_Toh_RatBloodGlucose_T1062M_20210322_8wk_FBG,10494430.0,25372880.0,14024380.0,118735500.0,6906003.0,80684690.0,1338627000.0,293346.4,4245591.0,79759800.0,...,147830.5,242976.6,749895.8,195212.5,671392.1,169602.6,622868.1,395500.2,81846.97,84000.65
20210729_AJ_Toh_RatBloodGlucose_T1093M_20210402_9wk_FBG,9897531.0,22077640.0,8735853.0,111440500.0,3505334.0,72308150.0,825739000.0,906924.5,4715529.0,52672920.0,...,2399226.0,1834312.0,2817875.0,2294994.0,3066207.0,1137056.0,2862648.0,2077054.0,670945.4,474897.4
20210729_AJ_Toh_RatBloodGlucose_T1060M_20210325_8wk_RBG,17288140.0,55222380.0,32381130.0,67602840.0,16972960.0,168582000.0,4234264000.0,423187.7,5076570.0,138026400.0,...,1324342.0,715278.4,1606961.0,359085.0,252195.4,359163.8,186741.3,76492.54,263721.9,193514.5
20210729_AJ_Toh_RatBloodGlucose_T1091M_20210410_10wk_FBG,21296220.0,46346470.0,17782540.0,172325900.0,5381199.0,130501700.0,1526645000.0,1184495.0,5991201.0,89107440.0,...,219091.7,365774.7,1001713.0,322344.9,1178122.0,265367.6,1071320.0,920598.9,171477.1,154765.0
20210729_AJ_Toh_RatBloodGlucose_T1092M_20210406_9wk_RBG,16236840.0,33333060.0,19471280.0,176580200.0,9870417.0,112061800.0,1813532000.0,295387.9,6113765.0,87454040.0,...,349494.6,452816.9,1057110.0,360463.8,238336.7,288553.2,175427.5,49316.99,201497.4,154530.4
20210729_AJ_Toh_RatBloodGlucose_T1093M_20210413_10wk_RBG,16147740.0,30110810.0,13522410.0,188008700.0,7580783.0,82836630.0,1023909000.0,673771.0,5514658.0,61093000.0,...,538302.1,635490.0,1156747.0,338901.5,484220.9,299503.7,329468.3,102598.7,250357.9,189325.2
20210729_AJ_Toh_RatBloodGlucose_T1082M_20210405_10wk_FBG,11525040.0,28619640.0,18722230.0,166653100.0,8089105.0,87715810.0,1231058000.0,271062.7,6723898.0,71904800.0,...,100472.6,48318.59,489106.9,43553.68,651401.6,23830.3,776423.6,709940.1,15165.85,22643.35
20210729_AJ_Toh_RatBloodGlucose_T1060M_20210401_9wk_RBG,23818490.0,36999510.0,9969525.0,217059900.0,5828327.0,77802030.0,947404100.0,317728.2,7656925.0,52569100.0,...,868731.9,814591.0,1648307.0,529255.9,328291.6,495788.5,224930.4,96033.05,378589.3,221887.3
20210729_AJ_Toh_RatBloodGlucose_T1101M_20210402_9wk_FBG,12319380.0,34287030.0,18026720.0,157576500.0,7257362.0,115507500.0,1447831000.0,5197247.0,6416476.0,84222240.0,...,738650.5,591211.2,1242546.0,673354.9,888633.4,344808.5,795602.3,494931.9,229477.5,15931.48


In [144]:
# Samples as rows, features as columns: SVM needs the data in the shape
# (observations, features). Computed once and reused below instead of
# re-transposing the frame for the values, the columns and the index.
quant_samples = fr[quant_filenames].T

scaler = MinMaxScaler()  # defaults to range of (0, 1)
# Can also use minmax_scale as a one-liner:
# minmax_scale(quant_samples)
# NOTE(review): the scaler is fit on every sample here, i.e. before the
# train/test split further down; consider fitting on the training rows only
# so that test-set statistics do not leak into the features.

fr_scaled = pd.DataFrame(scaler.fit_transform(quant_samples),
                         columns=quant_samples.columns,
                         index=quant_samples.index)

# Sanity check: after min-max scaling the minimum of the first feature is 0.
fr_scaled.iloc[:, 0:1].min()

0    0.0
dtype: float64

In [148]:
# Display the per-file metadata table loaded from file_groupings.csv.
file_grouping

Unnamed: 0,File Name,analytical_run_order,fr_name,rat_label,bg_type,quant_file,week,lipidex_file_number
0,20210729_AJ_Toh_RatBloodGlucose_Water_R1,1,20210729_AJ_Toh_RatBloodGlucose_Water_R1.raw (...,,,False,,65
1,20210729_AJ_Toh_RatBloodGlucose_Water_R2,2,20210729_AJ_Toh_RatBloodGlucose_Water_R2.raw (...,,,False,,66
2,20210729_AJ_Toh_RatBloodGlucose_Water_R3,3,20210729_AJ_Toh_RatBloodGlucose_Water_R3.raw (...,,,False,,67
3,20210729_AJ_Toh_RatBloodGlucose_SolventBlank,4,20210729_AJ_Toh_RatBloodGlucose_SolventBlank.r...,,,False,,2
4,20210729_AJ_Toh_RatBloodGlucose_ExtractionBlank,5,20210729_AJ_Toh_RatBloodGlucose_ExtractionBlan...,,,False,,1
...,...,...,...,...,...,...,...,...
62,20210729_AJ_Toh_RatBloodGlucose_T1062M_2021040...,63,20210729_AJ_Toh_RatBloodGlucose_T1062M_2021040...,1062.0,RBG,True,9.0,14
63,20210729_AJ_Toh_RatBloodGlucose_T1101M_2021033...,64,20210729_AJ_Toh_RatBloodGlucose_T1101M_2021033...,1101.0,RBG,True,8.0,54
64,20210729_AJ_Toh_RatBloodGlucose_T1092M_2021033...,65,20210729_AJ_Toh_RatBloodGlucose_T1092M_2021033...,1092.0,RBG,True,8.0,42
65,20210729_AJ_Toh_RatBloodGlucose_T1092M_2021041...,66,20210729_AJ_Toh_RatBloodGlucose_T1092M_2021041...,1092.0,RBG,True,10.0,46


In [149]:
# Show the row labels of the scaled table (the quant file names).
fr_scaled.index

Index(['20210729_AJ_Toh_RatBloodGlucose_T1076M_20210405_10wk_FBG',
       '20210729_AJ_Toh_RatBloodGlucose_T1062M_20210322_8wk_FBG',
       '20210729_AJ_Toh_RatBloodGlucose_T1093M_20210402_9wk_FBG',
       '20210729_AJ_Toh_RatBloodGlucose_T1060M_20210325_8wk_RBG',
       '20210729_AJ_Toh_RatBloodGlucose_T1091M_20210410_10wk_FBG',
       '20210729_AJ_Toh_RatBloodGlucose_T1092M_20210406_9wk_RBG',
       '20210729_AJ_Toh_RatBloodGlucose_T1093M_20210413_10wk_RBG',
       '20210729_AJ_Toh_RatBloodGlucose_T1082M_20210405_10wk_FBG',
       '20210729_AJ_Toh_RatBloodGlucose_T1060M_20210401_9wk_RBG',
       '20210729_AJ_Toh_RatBloodGlucose_T1101M_20210402_9wk_FBG',
       '20210729_AJ_Toh_RatBloodGlucose_T1062M_20210408_10wk_RBG',
       '20210729_AJ_Toh_RatBloodGlucose_T1091M_20210330_8wk_RBG',
       '20210729_AJ_Toh_RatBloodGlucose_T1074M_20210405_10wk_FBG',
       '20210729_AJ_Toh_RatBloodGlucose_T1101M_20210413_10wk_RBG',
       '20210729_AJ_Toh_RatBloodGlucose_T1076M_20210329_9wk_FBG',
   

In [155]:
# Metadata rows whose file name is one of the scaled samples.
file_grouping.loc[file_grouping['File Name'].isin(fr_scaled.index)]

Unnamed: 0,File Name,analytical_run_order,fr_name,rat_label,bg_type,quant_file,week,lipidex_file_number,date,glucose_tolerance
6,20210729_AJ_Toh_RatBloodGlucose_T1076M_2021040...,7,20210729_AJ_Toh_RatBloodGlucose_T1076M_2021040...,1076.0,FBG,True,10.0,27,2021-04-05,diabetic
7,20210729_AJ_Toh_RatBloodGlucose_T1062M_2021032...,8,20210729_AJ_Toh_RatBloodGlucose_T1062M_2021032...,1062.0,FBG,True,8.0,9,2021-03-22,impaired
8,20210729_AJ_Toh_RatBloodGlucose_T1093M_2021040...,9,20210729_AJ_Toh_RatBloodGlucose_T1093M_2021040...,1093.0,FBG,True,9.0,49,2021-04-02,normal
9,20210729_AJ_Toh_RatBloodGlucose_T1060M_2021032...,10,20210729_AJ_Toh_RatBloodGlucose_T1060M_2021032...,1060.0,RBG,True,8.0,4,2021-03-25,impaired
10,20210729_AJ_Toh_RatBloodGlucose_T1091M_2021041...,11,20210729_AJ_Toh_RatBloodGlucose_T1091M_2021041...,1091.0,FBG,True,10.0,39,2021-04-10,normal
11,20210729_AJ_Toh_RatBloodGlucose_T1092M_2021040...,12,20210729_AJ_Toh_RatBloodGlucose_T1092M_2021040...,1092.0,RBG,True,9.0,44,2021-04-06,impaired
12,20210729_AJ_Toh_RatBloodGlucose_T1093M_2021041...,13,20210729_AJ_Toh_RatBloodGlucose_T1093M_2021041...,1093.0,RBG,True,10.0,52,2021-04-13,normal
13,20210729_AJ_Toh_RatBloodGlucose_T1082M_2021040...,14,20210729_AJ_Toh_RatBloodGlucose_T1082M_2021040...,1082.0,FBG,True,10.0,33,2021-04-05,diabetic
14,20210729_AJ_Toh_RatBloodGlucose_T1060M_2021040...,15,20210729_AJ_Toh_RatBloodGlucose_T1060M_2021040...,1060.0,RBG,True,9.0,6,2021-04-01,impaired
15,20210729_AJ_Toh_RatBloodGlucose_T1101M_2021040...,16,20210729_AJ_Toh_RatBloodGlucose_T1101M_2021040...,1101.0,FBG,True,9.0,55,2021-04-02,diabetic


In [113]:
# Minimum of the first scaled feature column (0 after min-max scaling).
fr_scaled.iloc[:, :1].min()

0    0.0
dtype: float64

In [None]:
## split data

# Class label for every scaled sample, aligned row-for-row with fr_scaled.
# NOTE(review): assumes `glucose_tolerance` (normal / impaired / diabetic) is
# the intended prediction target -- swap in another metadata column
# (e.g. `bg_type`) if a different question is being modelled.
target = (
    file_grouping
    .set_index('File Name')
    .loc[fr_scaled.index, 'glucose_tolerance']
)
assert len(target) == len(fr_scaled), "expected exactly one metadata row per sample"

# train_test_split takes the arrays positionally; it has no `features=` or
# `target=` keyword, and the original `target=,` was a syntax error.
# NOTE(review): each rat contributes several samples, so a purely random split
# can put the same animal in both train and test; consider a group-aware
# splitter keyed on `rat_label`.
x_train, x_test, y_train, y_test = train_test_split(
    fr_scaled,
    target,
    test_size=0.2,
    random_state=128792387,
)

In [4]:
# Train a support vector classifier with a linear kernel; linear is one of
# several possible kernels (a radial kernel can also be used).
svm_clf = svm.SVC(kernel='linear').fit(x_train, y_train)

# Predict the class labels of the held-out samples.
y_pred = svm_clf.predict(x_test)

In [86]:
from sklearn import datasets

# Load scikit-learn's built-in breast-cancer dataset.
# NOTE(review): presumably a reference example for trying out the SVM workflow -- confirm.
cancer = datasets.load_breast_cancer()

In [93]:
# Show the label array of the breast-cancer dataset.
cancer.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,