<a href="https://colab.research.google.com/github/ajayaram92/Paper-Testing/blob/main/Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Modules

In [1]:
import requests
import pandas as pd
import numpy as np
import tensorflow as tf
from scipy import stats
from scipy.signal import savgol_filter

from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score

Getting Data from GitHub

In [2]:
def get_data(url):
  df = pd.read_csv(url)
  df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
  return df

Pre Processing function for Sav Gol Fiter and Standardising

In [32]:
url="https://raw.githubusercontent.com/ajayaram92/Paper-Testing/main/ABv1.csv"
urltest = "https://raw.githubusercontent.com/ajayaram92/Paper-Testing/4e49e5fde3d27b7a24d8fa9cd3b5e76ca8b9db3f/peach_spectra_brix.csv"
df = get_data(url)
testdf = get_data(urltest)
print(len(testdf.columns))

print(get_data(url))

601
      LATITUDE  LONGITUDE    OC   NDVI_v1    s2_1    s2_2    s2_3    s2_4  \
0    12.868275  75.239625  0.51  0.108786   730.0  1022.0  1456.0  2533.0   
1    12.853780  74.972660  0.79  0.138859  1214.5  1320.0  1963.0  2400.0   
2    12.853512  74.974800  1.41  0.150655  1202.5  1468.0  2082.0  2478.0   
3    12.890035  75.230001  0.58  0.093620   740.0  1488.0  2016.0  2447.0   
4    12.853595  74.972970  0.46  0.147096  1174.0  1447.0  2029.0  2357.0   
..         ...        ...   ...       ...     ...     ...     ...     ...   
156        NaN        NaN   NaN       NaN     NaN     NaN     NaN     NaN   
157        NaN        NaN   NaN       NaN     NaN     NaN     NaN     NaN   
158        NaN        NaN   NaN       NaN     NaN     NaN     NaN     NaN   
159        NaN        NaN   NaN       NaN     NaN     NaN     NaN     NaN   
160        NaN        NaN   NaN       NaN     NaN     NaN     NaN     NaN   

       s2_5    s2_6  ...     _mean   _median    _stdev      _min      _

In [7]:
def pre_processing(data, sav_gol=True, sav_win=11, sav_pol=2, std=True):
  
  new_df =  data.drop(["LATITUDE", "LONGITUDE", "OC"], axis =1)  # removing lat long and oc for Sav gol
  if sav_gol==True:   # checking if sav_gol is required
    new_df = pd.DataFrame(savgol_filter(new_df, sav_win, sav_pol, axis=1))
  new_df.columns = data.columns[3:]
  
  new_df["OC"] = data["OC"]
  if std==True:       # checking if standardising is required
      mean = new_df.mean(axis = 0)
      std = new_df.std(axis = 0)
      new_df = (new_df-mean)/std
  
  return (new_df)

In [29]:
pro_df = pre_processing(df)

In [None]:
print(df.mean(axis=0))
print(pre_processing(df))

LATITUDE       12.920576
LONGITUDE      74.991137
OC              0.962673
443.9         750.347926
496.6        1041.041475
560          1412.092166
664.5        1809.993088
703.9        2086.711982
740.2        2603.760369
782.5        2900.937788
835.1        2924.559908
864.8        3120.755760
945          3080.288018
1613.7       3659.124424
2202.4       2615.682028
dtype: float64
        443.9     496.6       560     664.5     703.9     740.2     782.5  \
0   -1.359204 -1.505741 -1.456658 -1.392360 -1.384494 -1.442043 -1.506616   
1   -1.833071 -0.889543 -0.080230  0.383639  0.595658  0.638761  0.730407   
2   -1.221219 -1.173820 -0.987467 -0.828131 -0.733301 -0.693434 -0.484871   
3   -0.572184 -0.335736 -0.134436 -0.031155 -0.004799 -0.035738  0.063719   
4   -1.789664 -1.804455 -1.604452 -1.428915 -1.344719 -1.346818 -1.185321   
..        ...       ...       ...       ...       ...       ...       ...   
212 -0.209428  0.756358  1.418258  1.772530  1.971647  2.097676  2.2404

In [26]:
def PLSR(data):
    y = data["OC"]
    X = data.drop("OC", axis=1)
    n_comp= len(data.columns)
    if n_comp > 100:
      n_comp = 20
    mse = []
    component = np.arange(1, n_comp)
 
    for i in component:
        pls = PLSRegression(n_components=i)
 
        # Cross-validation
        y_cv = cross_val_predict(pls, X, y, cv=10)
        mse.append(r2_score(y, y_cv))



    # Calculate minimum in MSE
    print(mse)
    msemin = np.argmax(mse)
    print("number of components used", msemin+1)
 
    # Define PLS object with optimal number of components
    pls_opt = PLSRegression(n_components=msemin+1)
 
    # Fir to the entire dataset
    pls_opt.fit(X, y)
    y_c = pls_opt.predict(X)
 
    # Cross-validation
    y_cv = cross_val_predict(pls_opt, X, y, cv=10)
 
    # Calculate scores for calibration and cross-validation
    score_c = r2_score(y, y_c)
    score_cv = r2_score(y, y_cv)
 
    # Calculate mean squared error for calibration and cross validation
    mse_c = mean_squared_error(y, y_c)
    mse_cv = mean_squared_error(y, y_cv)
 
    print('R2 calib: %5.3f'  % score_c)
    print('R2 CV: %5.3f'  % score_cv)
    print('MSE calib: %5.3f' % mse_c)
    print('MSE CV: %5.3f' % mse_cv)
 
    # Plot regression and figures of merit
    rangey = max(y) - min(y)
    rangex = max(y_c) - min(y_c)
    #z = np.polyfit(y, y_c, 1)

    return score_cv, mse_cv

In [30]:
print(PLSR(df.drop(["LONGITUDE","LATITUDE"], axis=1)))
print(PLSR(testdf))

ValueError: ignored