In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neural_network import MLPRegressor, MLPClassifier

import warnings
warnings.filterwarnings(action='ignore')

In [3]:
data = pd.read_csv('/content/PhysicoChemical_Data - Sheet1.csv')

In [4]:
data

Unnamed: 0,S/N,Sample_Identity,Ca2+,Mg2+,Na+,K+,Al3+,H+,Cl-,SO42-,NO3-,CO32-,TOC(%),Carbohydrate(ug/g)
0,1,Station1,1.9,0.6,0.9,0.3,1.2,0.7,0.4,0.8,0.2,0.0,14.39,5.36
1,2,Station2,2.1,0.7,0.9,0.4,1.1,0.6,0.2,0.5,0.1,0.0,15.54,2.43
2,3,Station3,1.9,0.5,0.8,0.3,0.9,0.5,0.3,0.7,0.1,0.1,14.86,4.3
3,4,Station4,1.8,0.5,0.7,0.3,1.0,0.6,0.3,0.7,0.2,0.1,16.74,6.14
4,5,Station5,2.0,0.8,0.8,0.4,1.2,0.7,0.5,0.9,0.4,0.2,15.65,7.56
5,6,Station6,1.9,0.6,0.8,0.4,1.1,0.7,0.6,0.9,0.3,0.0,20.92,4.95
6,7,Station7,2.1,0.7,0.8,0.3,1.2,0.7,0.5,0.8,0.2,0.0,18.77,6.39
7,8,Station8,2.2,0.6,0.8,0.3,1.0,0.6,0.3,0.7,0.3,0.0,18.25,4.41
8,9,Station9,2.0,0.6,0.8,0.2,0.8,0.7,0.3,0.9,0.2,0.0,15.92,4.16
9,10,Station10,1.0,0.5,0.7,0.4,1.1,0.6,0.4,0.8,0.3,0.1,16.48,2.84


# Preprocessing

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   S/N                 10 non-null     int64  
 1   Sample_Identity     10 non-null     object 
 2   Ca2+                10 non-null     float64
 3   Mg2+                10 non-null     float64
 4   Na+                 10 non-null     float64
 5   K+                  10 non-null     float64
 6   Al3+                10 non-null     float64
 7   H+                  10 non-null     float64
 8   Cl-                 10 non-null     float64
 9   SO42-               10 non-null     float64
 10  NO3-                10 non-null     float64
 11  CO32-               10 non-null     float64
 12  TOC(%)              10 non-null     float64
 13  Carbohydrate(ug/g)  10 non-null     float64
dtypes: float64(12), int64(1), object(1)
memory usage: 1.2+ KB


In [16]:
def preprocess_inputs(df, task='regression'):
    df = df.copy()
    
    y = df['TOC(%)']
    
    X = df.drop(['TOC(%)', 'Sample_Identity', 'S/N'], axis=1)
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
    # Scale feature data (X)
    scaler = StandardScaler()
    scaler.fit(X_train)
    
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
    
    return X_train, X_test, y_train, y_test

In [17]:
X_train, X_test, y_train, y_test = preprocess_inputs(data, task='regression')

In [18]:
X_train

Unnamed: 0,Ca2+,Mg2+,Na+,K+,Al3+,H+,Cl-,SO42-,NO3-,CO32-,Carbohydrate(ug/g)
0,0.114708,1.946657,-0.223607,1.020621,1.104315,0.866025,1.006231,0.928279,1.739253,2.157277,1.71149
1,-0.688247,-0.324443,1.341641,-0.408248,1.104315,0.866025,0.223607,0.206284,-0.474342,-0.588348,0.239857
2,-1.491202,-1.459993,-1.788854,-0.408248,-0.441726,-1.154701,-0.559017,-0.515711,-0.474342,0.784465,0.761618
3,0.917663,0.811107,1.341641,1.020621,0.331295,-1.154701,-1.341641,-1.9597,-1.581139,-0.588348,-1.72009
4,1.720618,-0.324443,-0.223607,-0.408248,-0.441726,-1.154701,-0.559017,-0.515711,0.632456,-0.588348,-0.395621
5,0.114708,-0.324443,-0.223607,-1.837117,-1.987767,0.866025,-0.559017,0.928279,-0.474342,-0.588348,-0.562852
6,-0.688247,-0.324443,-0.223607,1.020621,0.331295,0.866025,1.788854,0.928279,0.632456,-0.588348,-0.034402


In [19]:
y_train

4    15.65
0    14.39
3    16.74
1    15.54
7    18.25
8    15.92
5    20.92
Name: TOC(%), dtype: float64

In [20]:
lin_reg_model = LinearRegression()
lin_reg_model.fit(X_train, y_train)

print("Linear Regression R^2: {:.5f}".format(lin_reg_model.score(X_test, y_test)))

Linear Regression R^2: -1.75402


In [21]:
nn_reg_model = MLPRegressor(hidden_layer_sizes=(16, 16))
nn_reg_model.fit(X_train, y_train)

print("NN Regression R^2: {:.5f}".format(nn_reg_model.score(X_test, y_test)))

NN Regression R^2: -21.48895


In [43]:
test_data = np.array([[0.1178708,	4.946657,	-0.223607,	1.020621,	1.104315,	0.866025,	1.006231,	0.928279,	1.739253,	2.157277, 1.711490]])

In [44]:
# Make predictions using the testing set
toc_y_pred = lin_reg_model.predict(test_data)

In [45]:
toc_y_pred

array([14.04008887])