In [2]:
# -*- coding: utf-8 -*-

# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import permutation_test_score
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.utils import shuffle
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.offline as pyo
import cross_validation
import plotting
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

from pyChemometrics import ChemometricsScaler

import os
from unipair import unipair
from vip import vip_scores



#Make directory
# path folder

data = pd.read_csv('/Volumes/CAS9/Aeiwz/Documents/Thesis/Analyse/Dataset/FW_noesy_pqn.csv')


pair = unipair(dataset=data, column_name='Class')

dataset = pair.get_dataset()

names = pair.get_name()


In [3]:
Group = 'Class'
i = 1

plot_name = names[i]


test_select = dataset[i]



X = test_select.iloc[:, 14:]
df_X = X
#fill nan with 0
X = X.fillna(0)
meta = test_select.iloc[:, :14]
Y = test_select[Group]
Y1 = pd.Categorical(Y).codes
ppm = list(np.ravel(X.columns).astype(float))
# Use pandas Categorical type to generate the dummy enconding of the Y vector (0 and 1) 
name = names[i]


spectra = test_select.iloc[:, 14:]
ppm = list(spectra.columns.astype(float))
X = spectra.values
y = Y


In [10]:

# Create a pipeline with data preprocessing and OPLS-DA model
pipeline = Pipeline([
                        ('scale', ChemometricsScaler(scale_power=0.5)),
                        ('oplsda', PLSRegression(n_components=2)),
                        ('opls', cross_validation.CrossValidation(kfold=3, estimator='opls', scaler='pareto'))
                        ])

oplsda = pipeline.named_steps['oplsda']
cv = pipeline.named_steps['opls']
cv.fit(X, y)

oplsda.fit(X, pd.Categorical(y).codes)
n_permutate = 1000

# Permutation test to assess the significance of the model
acc_score, permutation_scores, p_value = permutation_test_score(
pipeline.named_steps['oplsda'], X, pd.Categorical(y).codes, cv=3, n_permutations=n_permutate, n_jobs=-1, random_state=57, verbose=10)


s_scores_df = pd.DataFrame({'correlation': cv.correlation,'covariance': cv.covariance}, index=ppm)
df_opls_scores = pd.DataFrame({'t_scores': cv.scores, 't_ortho': cv.orthogonal_score, 't_pred': cv.predictive_score, 'label': y})
vips = vip_scores(model=oplsda, features_name=ppm)
plot = vips.vip_plot(threshold=2)
plot.show()

    

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.179160159013503s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.10051417350769043s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1554090976715088s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done 100 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1

In [3]:
oplsda.n_components, len(oplsda.y_scores_), 

(2, 10)

In [5]:
from anova import anova_oplsda

In [6]:
model = anova_oplsda(X, y, n_components=2, cv=3)



        X: predictor variables as <class 'numpy.ndarray'> of shape (10, 22915) 

        Y: response variable as <class 'pandas.core.series.Series'> of shape (10,) 

        n_components: 2 components for OLSDA model 

        cv: 3-fold 

        


In [8]:
model.fit()

In [9]:
model.summary()

Unnamed: 0_level_0,F-statistic,p-value
Number of components,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.049072,0.830235
1,0.165325,0.694961
