In [39]:
import pandas as pd
import numpy as np
import scipy
import plotly.graph_objects as go
from plotly.graph_objs import FigureWidget
from plotly.callbacks import Points, InputDeviceState
from ipywidgets import HBox, VBox, Button, interact
from functools import partial
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

## Neocis data##

In [40]:
df = pd.read_csv('https://raw.githubusercontent.com/bevi-rosso/Neocis/master/Neocis.csv', sep=';')
X = np.array(df.index)

In [41]:
@interact(variabile=df.columns)
def sel(variabile):
    fig=go.Figure()
    fig.add_trace(go.Scatter(x=np.arange(len(df.index)),y=df[variabile], mode='markers+lines', marker_color = 'green'))
    fig.show()

interactive(children=(Dropdown(description='variabile', options=('Tingresso', 'Solvente', 'Nd_on_PBu', 'Al_on_…

## PCA ##

In [42]:
# PCA
pca = PCA()
xs = scale(df)
x_red = pca.fit_transform(xs)
V = pca.components_
nPC = np.arange(len(V)) + 1
dfV = pd.DataFrame(V, columns=[df.columns], index=[nPC])
EV_cum = pd.DataFrame(np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100),
                  columns =['% Explained Variance'], index=[nPC])

In [43]:
print ('PCs = ', len(V))
dfV

PCs =  7


Unnamed: 0,Tingresso,Solvente,Nd_on_PBu,Al_on_Nd,Sty_on_Bde,Mooney,VEI
1,-0.358783,-0.418246,0.329424,-0.414178,-0.363168,0.350172,0.402206
2,0.449975,-0.174692,-0.563403,-0.084179,-0.372736,0.516521,-0.191753
3,0.408843,0.140916,0.443297,-0.407815,0.50385,0.408207,-0.172243
4,-0.138822,-0.080493,-0.410253,0.182902,0.58702,0.3129,0.574447
5,0.668456,-0.284311,0.29085,0.317304,-0.121018,-0.221395,0.472649
6,0.010482,0.799333,0.124231,0.161756,-0.340413,0.301865,0.335195
7,-0.188228,-0.220116,0.33047,0.70339,0.013905,0.453771,-0.325653


In [44]:
EV_cum.T

Unnamed: 0,1,2,3,4,5,6,7
% Explained Variance,73.34,87.23,92.85,96.04,98.3,99.38,100.01


In [45]:
# Calculate ellipse bounds and plot with scores
theta = np.concatenate((np.linspace(-np.pi, np.pi, 50), np.linspace(np.pi, -np.pi, 50)))
circle = np.array((np.cos(theta), np.sin(theta)))
sigma = np.cov(np.array((x_red[:, 0], x_red[:, 1])))
ed = np.sqrt(scipy.stats.chi2.ppf(0.95, 2))
ell = np.transpose(circle).dot(np.linalg.cholesky(sigma) * ed)
a, b = np.max(ell[: ,0]), np.max(ell[: ,1]) #95% ellipse bounds
t = np.linspace(0, 2 * np.pi, 100)

In [46]:
# Score plot
col = [x for x in range(len(X))]
xtit = 'PC1 - explained variance = '+str(np.round(100*pca.explained_variance_ratio_[0],decimals=2))+'%'
ytit = 'PC2 - explained variance = '+str(np.round(100*pca.explained_variance_ratio_[1],decimals=2))+'%'
fig=go.Figure()
fig.add_trace(go.Scatter(x=x_red[:,0], y=x_red[:,1], mode='markers', marker=dict(symbol=[200], 
             color=col, line_width=2, size=10, colorscale='Viridis', showscale=True)))
fig.add_trace(go.Scatter(x=a * np.cos(t),y=b * np.sin(t),mode='lines',
                        line=dict(color='lightgreen', width=2, dash='dash')))
fig.update_layout(height=600, width=800, title='PCA - Score plot', 
                  xaxis_title=xtit,yaxis_title=ytit, showlegend=False, xaxis_zeroline=True, yaxis_zeroline=True, 
                  xaxis_zerolinecolor='blue', yaxis_zerolinecolor='blue')

In [47]:
score=pd.DataFrame(x_red)
index = np.linspace(1,len(V),num=len(V))
index = index.astype('int')
index = index.astype('str')
score.columns=index
@interact(PCa=score.columns, PCb=score.columns)
def sel(PCa, PCb):
    fig=go.Figure()
    fig.add_trace(go.Scatter(x=np.arange(len(df.index)),y=score[PCa], name=PCa, mode='lines', marker_color = 'green'))
    fig.add_trace(go.Scatter(x=np.arange(len(df.index)),y=score[PCb], name=PCb, mode='lines', marker_color = 'red'))
    fig.update_layout(height=600, width=800, title='PCA - components plot',
                     xaxis_title='n° samples', yaxis_title='PCs',
                     xaxis_zeroline=True, yaxis_zerolinecolor='blue')
    fig.show()

interactive(children=(Dropdown(description='PCa', options=('1', '2', '3', '4', '5', '6', '7'), value='1'), Dro…

In [48]:
arr=np.array(dfV)
min(arr[1,:])-0.4

-0.9634028481257921

In [49]:
# Loadings plot
arr = np.array(dfV)
label = np.array(df.columns)
plot = go.Scatter(x=arr[0,:], y=arr[1,:], mode='markers+text', marker=dict(symbol=[201],color='darkred', size=10), 
              text = label, textposition="middle left")
fig=go.Figure()
fig.add_trace(plot)
fig.update_layout(height=600, width=800, title='PCA - Loadings plot', xaxis_title=xtit,yaxis_title=ytit, 
                  xaxis_zeroline=True, yaxis_zeroline=True, xaxis_zerolinecolor='blue', yaxis_zerolinecolor='blue')
fig.update_xaxes(range=[min(arr[0,:])-0.2, max(arr[0,:])+0.2])
fig.update_yaxes(range=[min(arr[1,:])-0.2, max(arr[1,:])+0.2])

In [50]:
N = len(df)
f1 = FigureWidget(**{
    'data': [{'marker': {'cmax': 2, 'cmin': 0, 'color': np.zeros(N), 'size': 8},
              'mode': 'markers', 'type': 'scatter', 'x': x_red[:,0], 'y': x_red[:,1]}],
    'layout': {'dragmode': 'lasso', 'xaxis': {'title': 'PC1'}, 'yaxis': {'title': 'PC2'}}})
scatt1 = f1.data[0]

In [51]:
f2 = FigureWidget(**{
    'data': [{'marker': {'cmax': 2, 'cmin': 0, 'color': np.zeros(N), 'size': 8},
              'mode': 'markers+lines', 'type': 'scatter', 'x': X, 'y': x_red[:,0]}],
    'layout': {'dragmode': 'lasso', 'xaxis': {'title': 'n°sample'}, 'yaxis': {'title': 'PC1'}}})
scatt2 = f2.data[0]

In [52]:
f3 = FigureWidget(**{
    'data': [{'marker': {'cmax': 2, 'cmin': 0, 'color': np.zeros(N), 'size': 8},
              'mode': 'markers+lines', 'type': 'scatter', 'x': X, 'y': x_red[:,1]}],
    'layout': {'dragmode': 'lasso', 'xaxis': {'title': 'n°sample'}, 'yaxis': {'title': 'PC2'}}})
scatt3 = f3.data[0]

In [53]:
# Configure brush on both plots to update both plots
def brush(trace, points, state):
    inds = np.array(points.point_inds)
    if inds.size:
        selected = scatt1.marker.color.copy()
        selected[inds] = 1
        scatt1.marker.color = selected
        scatt2.marker.color = selected
        scatt3.marker.color = selected
    
scatt1.on_selection(brush)
scatt2.on_selection(brush)
scatt3.on_selection(brush)

In [54]:
# Reset brush
def reset_brush(btn):
    selected = np.zeros(N)
    scatt1.marker.color = selected
    scatt2.marker.color = selected
    scatt3.marker.color = selected
    
# Create reset button
button = Button(description="RESET")
button.on_click(reset_brush)

In [55]:
upper=HBox([f1, button])
lower=HBox([f2,f3])
dashboard = VBox([upper, lower])
dashboard

VBox(children=(HBox(children=(FigureWidget({
    'data': [{'marker': {'cmax': 2,
                         'cmi…

## PLS ##

In [56]:
# Separa x e y
x = df.drop(['Mooney','VEI'],axis=1)         # elimina colonne y
y = df.Mooney
# PLS (https://scikit-learn.org/stable/modules/generated/sklearn.cross_decomposition.PLSRegression.html)
pls2 = PLSRegression(n_components=5)
pls2.fit(x, y)
Y_pred=pd.DataFrame(pls2.predict(x))

In [57]:
# Loadings plot

label_x = np.array(x.columns)
plot_x = go.Scatter(x=pls2.x_loadings_[:,0], y=pls2.x_loadings_[:,1], 
                  mode='markers+text', marker=dict(symbol=[201],color='darkgreen', size=10), 
                  text = label_x, textposition="top left")
plot_y = go.Scatter(x=pls2.y_loadings_[:,0], y=pls2.y_loadings_[:,1], 
                  mode='markers+text', marker=dict(symbol=[201],color='darkred', size=10), 
                  text = 'Mooney', textposition="top left")

fig=go.Figure()
fig.add_trace(plot_x)
fig.add_trace(plot_y)
fig.update_layout(height=600, width=800, title='PLS - Loadings plot', xaxis_title='PC1',yaxis_title='PC2', 
                  xaxis_zeroline=True, yaxis_zeroline=True, xaxis_zerolinecolor='blue', yaxis_zerolinecolor='blue',
                  showlegend=False)
fig.update_xaxes(range=[min(pls2.x_loadings_[:,0])-0.2, max(pls2.x_loadings_[:,1])+0.2])
fig.update_yaxes(range=[min(pls2.x_loadings_[:,0])-0.2, max(pls2.x_loadings_[:,1])+0.2])

In [58]:
# Create linear regression object
y_exp=np.array(y)                 # Dataframe to array
y_exp=y_exp[:,np.newaxis]         # Add 2nd dimension
y_pls=np.array(Y_pred[0])         # Dataframe to array

regr = linear_model.LinearRegression()
regr.fit(y_exp, y_pls)
y_regr = regr.predict(y_exp)

rmse=mean_squared_error(y_exp,y_pls,squared=False)
r2=r2_score(y_exp,y_pls)

In [59]:
# Regression plot
title_pls= 'PLS regression: r^2 = '+str(np.round(r2, decimals=5))+','+' RMSE = '+str(np.round(rmse, decimals=5))
fig=go.Figure()
fig.add_trace(go.Scatter(x=np.array(y), y=y_pls, mode='markers', marker=dict(symbol=[200], 
             color=col, line_width=2, size=10, colorscale='Viridis', showscale=True)))
fig.add_trace(go.Scatter(x=np.array(y), y=y_regr, mode='lines'))
fig.update_layout(height=600, width=800, title = title_pls, 
                  xaxis_title='Mooney',yaxis_title='Mooney preditecd', 
                  showlegend=False, xaxis_zeroline=True, yaxis_zeroline=True, 
                  xaxis_zerolinecolor='blue', yaxis_zerolinecolor='blue')