In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score,balanced_accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold

df = pd.read_csv('qsar_oral_toxicity.csv', delimiter=';')
df = df.replace('negative', 0)
df = df.replace('positive', 1)
x, y = df.iloc[:, :-1], df.iloc[:, [-1]]
print(x.shape)

(8991, 1024)


In [2]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [3]:
# place for feature selection such as using selectkbest or RFECV


In [4]:
# initialize models
logreg = LogisticRegression(max_iter=10000)
svm = SVC()
mlpc = MLPClassifier()
estimators =[('logreg',logreg),('svm',svm),('mlpc',mlpc)]
Stackedclf = StackingClassifier(estimators=estimators, final_estimator=DecisionTreeClassifier())

In [5]:
#Grid search parameters
svm_param_grid = {'C': [0.01, 1, 10, ], 'gamma': [1,0.1, 0.01],'kernel': [ 'sigmoid','linear']}
svmGSCV =  GridSearchCV(SVC(),svm_param_grid,refit=True,verbose=2, scoring ="balanced_accuracy")

In [6]:
# add the models to this list to automate evaluation
models = [logreg,mlpc, svm, Stackedclf]

In [None]:
# cross validate and evaluation loop
for model in models:
    scores = cross_val_score(model, X_train, y_train.values.ravel(), cv=5, scoring='balanced_accuracy',error_score='raise')
    print("model balanced_accuracy: ", scores.mean())

model balanced_accuracy:  0.6829191289333494
model balanced_accuracy:  0.7300457427591851
model balanced_accuracy:  0.6425780070868857


In [None]:
kf = KFold(n_splits=5)
scores = []
for i, j in kf.split(x, y):
    X_train, y_train = x.iloc[i], y.iloc[i]
    X_test, y_test = x.iloc[j], y.iloc[j]
    scores.append((balanced_accuracy_score(y_test,pls_da(X_train,y_train, X_test) )))
print(sum(scores)/len(scores))

In [None]:
#fit best estimator for gridsearch if necessary 


In [None]:
# fit using test data for final evaluation
for model in models:
    model.fit(X_train,y_train.values.ravel())
    y_pred = model.predict(X_test)
    print(balanced_accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test,y_pred))
