In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go


In [None]:
# Load the KKUPC6602013 dataset from its CSV export.
dataset_csv = '/Users/aeiwz/Library/CloudStorage/OneDrive-KhonKaenUniversity/KKUPC/Project/Alpha/KKUPC6602013/Re-analyse/Raw data/Dataset/KKUPC6602013_dataset.csv'
df = pd.read_csv(dataset_csv)


In [None]:
# Drop QC: remove every row whose 'Group' column equals 'QC'.
# NOTE(review): `df2` is not assigned by any earlier cell in the visible notebook; the only
# visible assignment is in a later cell, where it is read from the KKUPC6602014 csv.
# On a fresh kernel this line raises NameError, and it also replaces the `df` loaded in the
# previous cell. Confirm which frame was intended here (`df` or `df2`).
# NOTE(review): `drop` keeps the original row labels, so `df.index` is no longer 0..n-1
# after this line if any QC rows were removed.
df = df2.drop(df2[df2['Group'] == 'QC'].index)


In [None]:
# Columns from position 43 onward are treated as the spectral intensities;
# their column labels are parsed as floats to give the ppm axis.
ppm = df.columns[43:].to_numpy().astype(float)
spectra = df.iloc[:, 43:]


In [None]:

#Preprocessing data set to decrease noise
def decrease_noise(spectra, window_length=11, polyorder=2):
    """
    Decrease the noise of spectra using a Savitzky-Golay filter.

    The filter is applied along axis 1, i.e. within each row, for both
    supported input types (one spectrum per row).

    Parameters:
    - spectra: numpy array or pandas DataFrame
        The spectra data to be processed, one spectrum per row.
    - window_length: int, optional (default=11)
        The length of the window used for filtering.
    - polyorder: int, optional (default=2)
        The order of the polynomial used for fitting.

    Returns:
    - filtered_spectra: numpy array or pandas DataFrame
        The spectra data after noise reduction. A DataFrame input yields a
        DataFrame with the same index and columns.

    Raises:
    - ValueError
        If `spectra` is neither a numpy array nor a pandas DataFrame.
    """
    import numpy as np
    import pandas as pd
    from scipy.signal import savgol_filter

    if isinstance(spectra, np.ndarray):
        return savgol_filter(spectra, window_length, polyorder, axis=1)

    if isinstance(spectra, pd.DataFrame):
        # DataFrame.apply defaults to axis=0, which would filter each column
        # across rows (across samples). Filter the underlying array along
        # axis=1 instead so both branches smooth within a spectrum.
        smoothed = savgol_filter(
            spectra.to_numpy(dtype=float), window_length, polyorder, axis=1
        )
        return pd.DataFrame(smoothed, index=spectra.index, columns=spectra.columns)

    raise ValueError("Invalid data type. Expected numpy array or pandas DataFrame.")


In [None]:
# Smooth the spectra with an 11-point window and a cubic polynomial.
spec_denoise = decrease_noise(
    spectra,
    polyorder=3,
    window_length=11,
)


In [None]:
# Absolute deviation of every denoised spectrum from the mean spectrum of its
# own Intervention group.
#
# Rows are selected and grouped by index label rather than by position, so the
# result stays correct when df.index is not 0..n-1 (e.g. after rows were dropped),
# and it keeps the sample order of spec_denoise instead of being regrouped by
# intervention. Rows with a missing Intervention are left out, as before.
intervention = df['Intervention']
labelled = intervention.notna()
labelled_spec = spec_denoise.loc[labelled]

# Per-group mean spectrum, broadcast back to one row per sample.
group_mean_spec = labelled_spec.groupby(intervention[labelled]).transform('mean')
sub_spec = (labelled_spec - group_mean_spec).abs()


In [None]:
import matplotlib.pyplot as plt
%matplotlib widget

def _plot_spectra(ppm_axis, spectra_frame, title):
    """Overlay every row of `spectra_frame` against `ppm_axis` on a new 10x5 figure.

    Parameters:
    - ppm_axis: array-like of x values, one per column of `spectra_frame`.
    - spectra_frame: pandas DataFrame, one trace is drawn per row.
    - title: str, figure title.
    """
    plt.figure(figsize=(10, 5))
    # Iterate over the frame actually being plotted so a frame with fewer rows
    # than the raw spectra cannot index out of bounds.
    for row in range(len(spectra_frame)):
        plt.plot(ppm_axis, spectra_frame.iloc[row, :])

    #invert x-axis
    plt.gca().invert_xaxis()
    plt.xlabel('ppm')
    plt.ylabel('Intensity')
    plt.title(title)
    plt.show()


# Denoised spectra, one trace per sample.
_plot_spectra(ppm, spec_denoise, 'Spectra of each sample')

# sub_spec holds |spectrum - group mean|, so it gets its own title instead of
# reusing the one from the figure above.
_plot_spectra(ppm, sub_spec, 'Absolute deviation from group mean spectrum')


In [None]:

# Import the required python packages including 
# the custom Chemometric Model objects
import numpy as np



from sklearn import preprocessing
import pandas as pd
import matplotlib.pyplot as plt

from pyChemometrics.ChemometricsPCA import ChemometricsPCA
from pyChemometrics.ChemometricsScaler import ChemometricsScaler

# Use to obtain same values as in the text.
# This seeds NumPy's global random state only; the train_test_split calls further down
# pass their own random_state and are not affected by it.
np.random.seed(350)


import plotly.express as px
import plotly.graph_objects as go

from sklearn import decomposition
from sklearn.preprocessing import scale
from pca_ellipse import confidence_ellipse

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.pipeline import Pipeline

In [None]:

# Inputs for the PCA cell below: feature matrix X and class labels Y.
# NOTE(review): the comment that used to sit here said "fill nan with 0", but nothing
# in this cell fills NaN values.
X = sub_spec
Y = df['Intervention']
# Integer category codes for Y via pandas Categorical (one code per distinct label,
# not just 0 and 1).
Y1 = pd.Categorical(Y).codes
# Column labels of X parsed as floats, used as the ppm axis for the loadings.
ppm = list(np.ravel(X.columns).astype(float))

In [None]:
scale__ = 'UV'        # scaling label shown in the plot title
scale_power_ = 1      # exponent handed to ChemometricsScaler (presumably 1 = unit variance; confirm)

# Scale the feature matrix, then fit a two-component PCA on the scaled data.
model_scaler = ChemometricsScaler(scale_power=scale_power_)
model_scaler.fit(X)
model_X = model_scaler.transform(X)

pca_model = decomposition.PCA(n_components=2)
pca_model.fit(model_X)

scores_ = pca_model.transform(model_X)
# Keep X's row labels on the scores and join Y on those labels. Building the scores
# with a fresh 0..n-1 index and concatenating with Y paired scores and labels by
# unrelated indices whenever X's index was not 0..n-1 or X was in a different row
# order than Y.
df_scores_ = pd.DataFrame(scores_, columns=['PC1', 'PC2'], index=X.index)
df2_scores_ = df_scores_.join(Y)

#save PCA score to csv
#df2_scores_.to_csv('{}/PCA_scores_{}.csv'.format(Scores_save, name[i]))

loadings_ = pca_model.components_.T
df_loadings_ = pd.DataFrame(loadings_, columns=['PC1', 'PC2'], index=np.ravel(ppm))
#df_loadings_.to_csv(Loading_save + '/Loading_scores ' + plot_name + '.csv')

# Explained-variance table. A leading zero row is inserted so that rows 1 and 2
# of the table correspond to PC1 and PC2.
explained_variance_ = np.insert(pca_model.explained_variance_ratio_, 0, 0)
cumulative_variance_ = np.cumsum(np.round(explained_variance_, decimals=3))

pc_df_ = pd.DataFrame(['', 'PC1', 'PC2'], columns=['PC'])
explained_variance_df_ = pd.DataFrame(explained_variance_, columns=['Explained Variance'])
cumulative_variance_df_ = pd.DataFrame(cumulative_variance_, columns=['Cumulative Variance'])

df_explained_variance_ = pd.concat([pc_df_, explained_variance_df_, cumulative_variance_df_], axis=1)
#df_explained_variance_.to_csv(R2_save + '/R2 ' + plot_name + '.csv')

# Reconstruction score on a 30 % subset of X.
# NOTE(review): the scaler and the PCA above were fitted on all of X, including
# these "test" rows, so this value is not an independent cross-validated Q2.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=357)
X_test = model_scaler.transform(X_test)
X_test_pca = pca_model.transform(X_test)

# Inverse transform the test set from the PCA space
X_test_reconstructed = pca_model.inverse_transform(X_test_pca)

# Calculate Q2 score for the test set
q2_test = r2_score(X_test, X_test_reconstructed)

# PCA plot
# Point labels come from the scores frame itself so they follow its row order.
pca_label = df2_scores_.index

pc1_r2x = np.round(df_explained_variance_.iloc[1, 1] * 100, decimals=2)
pc2_r2x = np.round(df_explained_variance_.iloc[2, 1] * 100, decimals=2)

fig = px.scatter(df2_scores_, x='PC1', y='PC2',
                color='Intervention',
                color_discrete_map={
                                    "UCMS": "#E91E63",
                                    "TKM powder 150 mg/kg": "#FF9800",
                                    "Imipramine": "#FFEB3B",
                                    "TKM powder 37.5 mg/kg": "#9C27B0",
                                    "Vitamin E": "#03A9F4",
                                    "Diazepam": "#4CAF50",
                                    "TKM powder 600 mg/kg": "#B30000",
                                    "0.5% SCMC": "#3F51B5"
                                    },
                # the bold tag is now closed with </b>
                title='<b>PCA Scores Plot ({} Scaling)</b>'.format(scale__),
                height=900, width=1300,
                labels={"PC1": "PC1 R<sup>2</sup>X: {} %".format(pc1_r2x),
                        "PC2": "PC2 R<sup>2</sup>X: {} %".format(pc2_r2x)},
                text=pca_label)

#set data point fill alpha with boarder in each color
fig.update_traces(marker=dict(size=35, opacity=0.7, line=dict(width=2, color='DarkSlateGrey')))

# Cumulative R2X and Q2X annotations, stacked in the lower-right corner of the paper.
r2x_cum_text = '<b>R<sup>2</sup>X (Cum): {}%</b>'.format(
    np.round(df_explained_variance_.iloc[2, 2] * 100, decimals=2))
q2x_cum_text = '<b>Q<sup>2</sup>X (Cum): {}%</b>'.format(
    np.round(q2_test * 100, decimals=2))

for annotation_y, annotation_text in ((0.05, r2x_cum_text), (0.01, q2x_cum_text)):
    fig.add_annotation(dict(font=dict(color="black", size=20),
                            x=1.0,
                            y=annotation_y,
                            showarrow=False,
                            text=annotation_text,
                            textangle=0,
                            xref="paper",
                            yref="paper"),
                       # set alignment of text to left side of entry
                       align="left")

# Ellipse path comes from the project-local confidence_ellipse helper.
fig.add_shape(type='path',
            path=confidence_ellipse(df2_scores_['PC1'], df2_scores_['PC2']))

fig.update_xaxes(zeroline=True, zerolinewidth=2, zerolinecolor='Black')
fig.update_yaxes(zeroline=True, zerolinewidth=2, zerolinecolor='Black')
fig.update_xaxes(showline=True, linewidth=2, linecolor='black')
fig.update_yaxes(showline=True, linewidth=2, linecolor='black')
fig.update_layout(
    title={
        'y':1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    font=dict(size=20))
fig.update_layout(paper_bgcolor='rgba(0,0,0,0)',plot_bgcolor='rgba(0,0,0,0)')

#fig.show()
#fig.write_image(PNG_save + "/PCA " + plot_name + ".png")
#fig.write_html(HTML_save + "/PCA " + plot_name + ".html")



In [None]:

# Inputs for the PCA cell below: feature matrix X and class labels Y.
# NOTE(review): the comment that used to sit here said "fill nan with 0", but nothing
# in this cell fills NaN values.
X = spec_denoise
Y = df['Intervention']
# Integer category codes for Y via pandas Categorical (one code per distinct label,
# not just 0 and 1).
Y1 = pd.Categorical(Y).codes
# Column labels of X parsed as floats, used as the ppm axis for the loadings.
ppm = list(np.ravel(X.columns).astype(float))

In [None]:
scale__ = 'UV'        # scaling label shown in the plot title
scale_power_ = 1      # exponent handed to ChemometricsScaler (presumably 1 = unit variance; confirm)

# Mean Centering (MC):
#scaling_object_mc = ChemometricsScaler(scale_power=0)

# Pareto scaling (Par):
# scaling_object_par = ChemometricsScaler(scale_power=0.5)

# Scale the feature matrix, then fit a two-component PCA on the scaled data.
model_scaler = ChemometricsScaler(scale_power=scale_power_)
model_scaler.fit(X)
model_X = model_scaler.transform(X)

pca_model = decomposition.PCA(n_components=2)
pca_model.fit(model_X)

scores_ = pca_model.transform(model_X)
# Keep X's row labels on the scores and join Y on those labels. Building the scores
# with a fresh 0..n-1 index and concatenating with Y paired scores and labels by
# unrelated indices whenever X's index was not 0..n-1.
df_scores_ = pd.DataFrame(scores_, columns=['PC1', 'PC2'], index=X.index)
df2_scores_ = df_scores_.join(Y)

#save PCA score to csv
#df2_scores_.to_csv('{}/PCA_scores_{}.csv'.format(Scores_save, name[i]))

loadings_ = pca_model.components_.T
df_loadings_ = pd.DataFrame(loadings_, columns=['PC1', 'PC2'], index=np.ravel(ppm))
#df_loadings_.to_csv(Loading_save + '/Loading_scores ' + plot_name + '.csv')

# Explained-variance table. A leading zero row is inserted so that rows 1 and 2
# of the table correspond to PC1 and PC2.
explained_variance_ = np.insert(pca_model.explained_variance_ratio_, 0, 0)
cumulative_variance_ = np.cumsum(np.round(explained_variance_, decimals=3))

pc_df_ = pd.DataFrame(['', 'PC1', 'PC2'], columns=['PC'])
explained_variance_df_ = pd.DataFrame(explained_variance_, columns=['Explained Variance'])
cumulative_variance_df_ = pd.DataFrame(cumulative_variance_, columns=['Cumulative Variance'])

df_explained_variance_ = pd.concat([pc_df_, explained_variance_df_, cumulative_variance_df_], axis=1)
#df_explained_variance_.to_csv(R2_save + '/R2 ' + plot_name + '.csv')

# Reconstruction score on a 30 % subset of X.
# NOTE(review): the scaler and the PCA above were fitted on all of X, including
# these "test" rows, so this value is not an independent cross-validated Q2.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=357)
X_test = model_scaler.transform(X_test)
X_test_pca = pca_model.transform(X_test)

# Inverse transform the test set from the PCA space
X_test_reconstructed = pca_model.inverse_transform(X_test_pca)

# Calculate Q2 score for the test set
q2_test = r2_score(X_test, X_test_reconstructed)

# PCA plot
pca_label = df2_scores_.index

pc1_r2x = np.round(df_explained_variance_.iloc[1, 1] * 100, decimals=2)
pc2_r2x = np.round(df_explained_variance_.iloc[2, 1] * 100, decimals=2)

fig = px.scatter(df2_scores_, x='PC1', y='PC2',
                color='Intervention',
                color_discrete_map={
                                    "UCMS": "#E91E63",
                                    "TKM powder 150 mg/kg": "#FF9800",
                                    "Imipramine": "#FFEB3B",
                                    "TKM powder 37.5 mg/kg": "#9C27B0",
                                    "Vitamin E": "#03A9F4",
                                    "Diazepam": "#4CAF50",
                                    "TKM powder 600 mg/kg": "#B30000",
                                    "0.5% SCMC": "#3F51B5"
                                    },
                # the bold tag is now closed with </b>
                title='<b>PCA Scores Plot ({} Scaling)</b>'.format(scale__),
                height=900, width=1300,
                labels={"PC1": "PC1 R<sup>2</sup>X: {} %".format(pc1_r2x),
                        "PC2": "PC2 R<sup>2</sup>X: {} %".format(pc2_r2x)},
                # Cage labels are looked up by row label so each point gets its own cage.
                text=df['Cage'].reindex(pca_label))

#set data point fill alpha with boarder in each color
fig.update_traces(marker=dict(size=35, opacity=0.7, line=dict(width=2, color='DarkSlateGrey')))

# Cumulative R2X and Q2X annotations, stacked in the lower-right corner of the paper.
r2x_cum_text = '<b>R<sup>2</sup>X (Cum): {}%</b>'.format(
    np.round(df_explained_variance_.iloc[2, 2] * 100, decimals=2))
q2x_cum_text = '<b>Q<sup>2</sup>X (Cum): {}%</b>'.format(
    np.round(q2_test * 100, decimals=2))

for annotation_y, annotation_text in ((0.05, r2x_cum_text), (0.01, q2x_cum_text)):
    fig.add_annotation(dict(font=dict(color="black", size=20),
                            x=1.0,
                            y=annotation_y,
                            showarrow=False,
                            text=annotation_text,
                            textangle=0,
                            xref="paper",
                            yref="paper"),
                       # set alignment of text to left side of entry
                       align="left")

# Ellipse path comes from the project-local confidence_ellipse helper.
fig.add_shape(type='path',
            path=confidence_ellipse(df2_scores_['PC1'], df2_scores_['PC2']))

fig.update_xaxes(zeroline=True, zerolinewidth=2, zerolinecolor='Black')
fig.update_yaxes(zeroline=True, zerolinewidth=2, zerolinecolor='Black')
fig.update_xaxes(showline=True, linewidth=2, linecolor='black')
fig.update_yaxes(showline=True, linewidth=2, linecolor='black')
fig.update_layout(
    title={
        'y':1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    font=dict(size=20))
fig.update_layout(paper_bgcolor='rgba(0,0,0,0)',plot_bgcolor='rgba(0,0,0,0)')

#fig.show()
#fig.write_image(PNG_save + "/PCA " + plot_name + ".png")
#fig.write_html(HTML_save + "/PCA " + plot_name + ".html")



In [1]:
import pandas as pd

In [2]:
# Load the two study datasets (KKUPC6602013 and KKUPC6602014) from their CSV exports.
dataset_6602013_csv = '/Users/aeiwz/Library/CloudStorage/OneDrive-KhonKaenUniversity/KKUPC/Project/Alpha/KKUPC6602013/Re-analyse/Raw data/Dataset/KKUPC6602013_dataset.csv'
dataset_6602014_csv = '/Users/aeiwz/Library/CloudStorage/OneDrive-KhonKaenUniversity/KKUPC/Project/Alpha/KKUPC6602014/Dataset/KKUPC6602014_dataset.csv'
df1 = pd.read_csv(dataset_6602013_csv)
df2 = pd.read_csv(dataset_6602014_csv)

  df1 = pd.read_csv('/Users/aeiwz/Library/CloudStorage/OneDrive-KhonKaenUniversity/KKUPC/Project/Alpha/KKUPC6602013/Re-analyse/Raw data/Dataset/KKUPC6602013_dataset.csv')
  df2 = pd.read_csv('/Users/aeiwz/Library/CloudStorage/OneDrive-KhonKaenUniversity/KKUPC/Project/Alpha/KKUPC6602014/Dataset/KKUPC6602014_dataset.csv')


In [45]:
# Work on the KKUPC6602013 frame (same object as df1, not a copy).
df = df1
# Remember the full set of original column labels so they can be restored later.
feature_ = df.columns
# The first 28 columns are treated as sample metadata, the remainder as spectra.
metadata = df.iloc[:, :28]
spectra = df.iloc[:, 28:]
# Spectral column labels parsed as floats give the ppm axis.
ppm = spectra.columns.to_numpy().astype(float)


In [6]:
def preprocessing_data(spectra, feature_name, metadata, window_length=11, polyorder=2):
    """
    Denoise the spectra and join them back onto their sample metadata.

    Parameters:
    - spectra: numpy array or pandas DataFrame
        The spectra data to be processed. Rows are assumed to correspond
        one-to-one, in order, with the rows of `metadata`.
    - feature_name: list or array-like
        Column labels for the denoised spectra, one per spectral column.
    - metadata: pandas DataFrame
        Per-sample metadata; its index is reused for the denoised spectra.
    - window_length: int, optional (default=11)
        Forwarded to Denoise.decrease_noise.
    - polyorder: int, optional (default=2)
        Forwarded to Denoise.decrease_noise.

    Returns:
    - combind: pandas DataFrame
        The metadata columns followed by the denoised spectra columns, the
        latter labelled with `feature_name`.
    """
    from denoise_spec import Denoise
    import pandas as pd
    import numpy as np

    denoised = Denoise.decrease_noise(spectra, window_length=window_length, polyorder=polyorder)

    # Build the frame from the raw values so the labels are assigned, not looked up,
    # and reuse metadata's index so the column-wise concat pairs each spectrum with
    # its own metadata row.
    denoised = pd.DataFrame(np.asarray(denoised), index=metadata.index, columns=feature_name)

    # No ignore_index here: with axis=1 it would replace every column label
    # (metadata names and feature names alike) with 0..n-1.
    combind = pd.concat([metadata, denoised], axis=1)

    return combind
    

In [43]:
# Denoise with a 17-point window and a cubic polynomial, then re-attach the metadata.
data = preprocessing_data(
    spectra,
    ppm,
    metadata,
    polyorder=3,
    window_length=17,
)
# Put back the original column labels of df (metadata names followed by the ppm labels).
data.columns = feature_

In [44]:
# Write the preprocessed dataset next to the raw one, without the row index.
preprocessed_csv = '/Users/aeiwz/Library/CloudStorage/OneDrive-KhonKaenUniversity/KKUPC/Project/Alpha/KKUPC6602013/Re-analyse/Raw data/Dataset/KKUPC6602013_dataset_preprocessed.csv'
data.to_csv(preprocessed_csv, index=False)