In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd

data = pd.read_excel('nanoplastic_data.xlsx')
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=42)

knn = KNeighborsRegressor()

param_grid = {
    'n_neighbors': [3, 5, 7, 9], 
    'weights': ['uniform', 'distance'],  
    'metric': ['euclidean', 'manhattan']  
}


grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)


grid_search.fit(Xtrain, Ytrain)

print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", -grid_search.best_score_)

y_pred = grid_search.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", mse)


In [5]:
from sklearn.model_selection import cross_val_score,cross_val_predict,train_test_split
from sklearn.neighbors import KNeighborsRegressor
import pandas as pd
X = pd.read_excel('nanoplastic_std_x0.xlsx')
y = pd.read_excel('nanoplastic_std_y.xlsx')['Cell viability']

Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=60)

knn = KNeighborsRegressor()
CV_score = cross_val_score(knn, Xtrain, Ytrain, cv=5).mean()
regressor = knn.fit(Xtrain, Ytrain)
score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)

5cv: 0.31782682176148896 TEST: 0.483842333209731


In [6]:
for i in range(500):
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=i)
    rfc = KNeighborsRegressor()
    CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    regressor = rfc.fit(Xtrain, Ytrain)
    score_test = regressor.score(Xtest,Ytest)
#     if CV < CV_score and test < score_test:
#         CV = CV_score
#         test = score_test
    print("5cv:",CV_score,"TEST:",score_test)
    if CV_score>0.40 and score_test>0.4:
        print("5cv:",CV_score,"TEST:",score_test,"random_state:",i)

5cv: 0.3560757159271544 TEST: 0.35072861841864333
5cv: 0.41372601755489447 TEST: 0.3672831948156128
5cv: 0.3913212040604427 TEST: 0.2697743176016628
5cv: 0.38991296968027944 TEST: 0.33064753332615626
5cv: 0.3775159841047693 TEST: 0.41185643219689405
5cv: 0.34397120384772367 TEST: 0.4567302912138327
5cv: 0.3898543309015642 TEST: 0.301103065516484
5cv: 0.3397536258871468 TEST: 0.4995598455539606
5cv: 0.35556341613738096 TEST: 0.46318938000293564
5cv: 0.3796068343718466 TEST: 0.39703437953812426
5cv: 0.3167422498417346 TEST: 0.39790337646883933
5cv: 0.37695414823774415 TEST: 0.43400510816287274
5cv: 0.3860450757920155 TEST: 0.36560779596824855
5cv: 0.39203819296305653 TEST: 0.3318240035486756
5cv: 0.3894683803212602 TEST: 0.33894927447573353
5cv: 0.3104028506854228 TEST: 0.5189413457575571
5cv: 0.32846194247667193 TEST: 0.47107639241704746
5cv: 0.3538969261303707 TEST: 0.3695966121051568
5cv: 0.3889549621510307 TEST: 0.393554526388585
5cv: 0.3561858888017295 TEST: 0.32952817347461694
5cv:

In [8]:
import numpy as np
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error 
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=296)

rfc = KNeighborsRegressor(n_neighbors=5)
CV_score = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
regressor = rfc.fit(Xtrain, Ytrain)
CV_predictions = cross_val_predict(rfc, Xtrain,Ytrain,cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))


score_test = regressor.score(Xtest,Ytest)
print("5cv:",CV_score,"TEST:",score_test)
print("rmse_5CV",rmse)

5cv: 0.42583776868336753 TEST: 0.3299121068043256
rmse_5CV 18.15711196752332


In [11]:
score_5cv_all = []
for i in range(0, 12, 1):
    rfc =KNeighborsRegressor(n_neighbors=i+1)
    score_5cv =cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass


score_max_5cv = max(score_5cv_all)
n_neighbors_5cv = range(0, 12,1)[score_5cv_all.index(max(score_5cv_all))]+1

print("Best_5cv score:{}".format(score_max_5cv),
      "n_neighbors_5cv:{}".format(n_neighbors_5cv))


score_5cv_all = []
for i in ['uniform', 'distance']:
    rfc = KNeighborsRegressor(weights=i
                                , n_neighbors=n_neighbors_5cv)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
weights_5cv = ['uniform', 'distance'][score_5cv_all.index(score_max_5cv)]

print("Best_5cv score:{}".format(score_max_5cv),
      "weights_5cv:{}".format(weights_5cv))

score_5cv_all = []
for i in ['brute', 'kd_tree','auto', 'ball_tree']:
    rfc = KNeighborsRegressor(algorithm=i
                                , weights=weights_5cv
                                , n_neighbors=n_neighbors_5cv)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
algorithm_5cv = ['brute', 'kd_tree','auto', 'ball_tree'][score_5cv_all.index(score_max_5cv)]

print("Best_5cv score:{}".format(score_max_5cv),
      "algorithm_5cv:{}".format(algorithm_5cv))

score_5cv_all = []
for i in range(0, 1000, 1):
    rfc = KNeighborsRegressor(leaf_size=i+1
                                , algorithm=algorithm_5cv
                                , weights=weights_5cv
                                , n_neighbors=n_neighbors_5cv)
    score_5cv = cross_val_score(rfc, Xtrain, Ytrain, cv=5).mean()
    score_5cv_all.append(score_5cv)
    pass

score_max_5cv = max(score_5cv_all)
leaf_size_5cv = range(10, 1000, 1)[score_5cv_all.index(score_max_5cv)]+1

print("Best_5cv score:{}".format(score_max_5cv),
      "leaf_size_5cv:{}".format(leaf_size_5cv))


Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.2,random_state=82)

knn = KNeighborsRegressor(leaf_size=leaf_size_5cv
                                , algorithm=algorithm_5cv
                                , weights=weights_5cv
                                , n_neighbors=n_neighbors_5cv)

CV_score = cross_val_score(knn, Xtrain, Ytrain, cv=5).mean()
CV_predictions = cross_val_predict(knn, Xtrain,Ytrain,cv=5)
rmse = np.sqrt(mean_squared_error(Ytrain,CV_predictions))
mae = mean_absolute_error(Ytrain,CV_predictions)
print("r2_5cv:",CV_score,"rmse_5CV",rmse,"MAE_5CV",mae)
expvspred_5cv = {'Exp': Ytrain, 'Pred':CV_predictions}
pd.DataFrame(expvspred_5cv).to_excel('KNN_5fcv_pred.xlsx')

'''
Test set validation

'''

knn = KNeighborsRegressor(leaf_size=leaf_size_5cv
                                , algorithm=algorithm_5cv
                                , weights=weights_5cv
                                , n_neighbors=n_neighbors_5cv)
regressor = knn.fit(Xtrain, Ytrain)
test_predictions = regressor.predict(Xtest)
score_test = regressor.score(Xtest,Ytest)
test_rmse = np.sqrt(mean_squared_error(Ytest,test_predictions))
test_mae = mean_absolute_error(Ytest,test_predictions)
print("test:",score_test)
print("rmse_test",test_rmse)
print("mae_test",test_mae)
expvspred_test = {'Exp':Ytest,'Pred':test_predictions}
pd.DataFrame(expvspred_test).to_excel('KNN_test_pred.xlsx')

Best_5cv score:0.45279878033598564 n_neighbors_5cv:2
Best_5cv score:0.45279878033598564 weights_5cv:uniform
Best_5cv score:0.45456236746223444 algorithm_5cv:ball_tree
Best_5cv score:0.45456236746223444 leaf_size_5cv:11
r2_5cv: 0.39944634538537904 rmse_5CV 18.339917415434297 MAE_5CV 10.878557198521046
test: 0.614859165970695
rmse_test 14.430788918020689
mae_test 8.978184147727273
