<a href="https://colab.research.google.com/github/adichat08/Support-Vector-Classifier-for-Predicting-Survival-Likelihood-of-Hepatitis-Patients/blob/main/Initial_Model_Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



---

# Initial Model Testing
Note: All preprocessing is done in a cross-validation loop. This is to avoid data leakage.

--------------------------------------------
In this section, four different models will be trained and tested on training data using three-fold cross validation.

The chosen models are:
*   A linear model (LogisticRegression)
*   A gradient boosting machine (XGBClassifier)
*   A multilayer perceptron (MLPClassifier)
*   A support vector machine (SVC)





In [None]:
# Looking at scikit-learn's available scoring metrics.
# `sklearn.metrics.SCORERS` was deprecated in scikit-learn 1.1 and removed in 1.3;
# `get_scorer_names()` is the supported replacement. Older installations that do
# not provide it fall back to the SCORERS dictionary.
# NOTE(review): the import cell is not visible from here - if it imports SCORERS
# directly, that import needs the same treatment on scikit-learn >= 1.3.
try:
    from sklearn.metrics import get_scorer_names
    scorer_names = sorted(get_scorer_names())
except ImportError:
    scorer_names = sorted(SCORERS.keys())
print("Scoring Metrics:\n{}".format(scorer_names))

Scoring Metrics:
['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'max_error', 'mutual_info_score', 'neg_brier_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_gamma_deviance', 'neg_mean_poisson_deviance', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_root_mean_squared_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'rand_score', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'roc_auc_ovo', 'roc_auc_ovo_weighted', 'roc_auc_ovr', 'roc_auc_ovr_weighted', 'top_k_accuracy', 'v_m

In [None]:
# Column transformer that applies StandardScaler() to the numerical columns.
# NOTE(review): `remainder` is left at its default, which in scikit-learn drops
# every column that is not listed below - confirm the models are meant to be
# trained on these five features only.
ct = ColumnTransformer(
    transformers=[
        (
            "scaling",
            StandardScaler(),
            ['AGE', 'BILIRUBIN', 'ALK PHOSPHATE', 'SGOT', 'ALBUMIN'],
        ),
    ],
)

In [None]:
# Pipeline that preprocesses the data inside the cross-validation loop, so the
# scaler is re-fit on the training portion of every fold (avoids data leakage).
log_pipe_cv = Pipeline([('preprocessing', ct), ('log', LogisticRegression(random_state=42))])
# Shuffled three-fold splitter; the fixed seed gives every call below the same folds.
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

# Evaluate the linear model (LogisticRegression) on the chosen metrics.
# Each metric is cross-validated exactly once and the resulting array is reused
# for both the per-fold printout and the average.
log_accuracy = cross_val_score(log_pipe_cv, X_train, y_train, cv=kfold)
log_auc = cross_val_score(log_pipe_cv, X_train, y_train, cv=kfold, scoring='roc_auc')
log_f1 = cross_val_score(log_pipe_cv, X_train, y_train, cv=kfold, scoring='f1')

print("Cross-validation scores:\n{}".format(log_accuracy))
print('Average score:\n{}'.format(log_accuracy.mean()))
print("Cross-validation AUC:\n{}".format(log_auc))
print('Average AUC:\n{}'.format(log_auc.mean()))
print('F1 average:\n{}'.format(log_f1.mean()))

Cross-validation scores:
[0.8974359  0.87179487 0.8974359 ]
Average score:
0.8888888888888888
Cross-validation AUC:
[0.80803571 0.89112903 0.83482143]
Average AUC:
0.8446620583717358
F1 average:
0.6495726495726495


In [None]:
# Repeating the process with the gradient boosting model (XGBClassifier).
# The scaler lives inside the pipeline so it is re-fit on each training fold.
xgb_pipe_cv = Pipeline([('preprocessing', ct), ('xgb', XGBClassifier(random_state=42))])
# Shuffled three-fold splitter; the fixed seed gives every call below the same folds.
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

# Each metric is cross-validated exactly once and the resulting array is reused
# for both the per-fold printout and the average.
xgb_accuracy = cross_val_score(xgb_pipe_cv, X_train, y_train, cv=kfold)
xgb_auc = cross_val_score(xgb_pipe_cv, X_train, y_train, cv=kfold, scoring='roc_auc')
xgb_f1 = cross_val_score(xgb_pipe_cv, X_train, y_train, cv=kfold, scoring='f1')

print("Cross-validation scores:\n{}".format(xgb_accuracy))
print('Average score:\n{}'.format(xgb_accuracy.mean()))
print("Cross-validation AUC:\n{}".format(xgb_auc))
print('Average AUC:\n{}'.format(xgb_auc.mean()))
print('F1 average:\n{}'.format(xgb_f1.mean()))

Cross-validation scores:
[0.82051282 0.87179487 0.84615385]
Average score:
0.8461538461538461
Cross-validation AUC:
[0.87946429 0.87903226 0.84375   ]
Average AUC:
0.8674155145929339
F1 average:
0.5256410256410257


In [None]:
# Repeating the process with the multilayer perceptron (MLPClassifier).
# The scaler lives inside the pipeline so it is re-fit on each training fold.
mlp_pipe_cv = Pipeline([('preprocessing', ct), ('mlp', MLPClassifier(max_iter=2000, random_state=42))])
# Shuffled three-fold splitter; the fixed seed gives every call below the same folds.
kfold = KFold(n_splits=3, shuffle=True, random_state=42)
# NOTE(review): this standalone estimator is not used anywhere in this cell and its
# max_iter (1000) differs from the one inside the pipeline (2000). It is kept only
# in case a later cell refers to `mlp` - confirm and remove if unused.
mlp = MLPClassifier(max_iter=1000, random_state=42)

# Each metric is cross-validated exactly once and the resulting array is reused
# for both the per-fold printout and the average.
mlp_accuracy = cross_val_score(mlp_pipe_cv, X_train, y_train, cv=kfold)
mlp_auc = cross_val_score(mlp_pipe_cv, X_train, y_train, cv=kfold, scoring='roc_auc')
mlp_f1 = cross_val_score(mlp_pipe_cv, X_train, y_train, cv=kfold, scoring='f1')

print("Cross-validation scores:\n{}".format(mlp_accuracy))
print('Average score:\n{}'.format(mlp_accuracy.mean()))
print("Cross-validation AUC:\n{}".format(mlp_auc))
print('Average AUC:\n{}'.format(mlp_auc.mean()))
print('F1 average:\n{}'.format(mlp_f1.mean()))

Cross-validation scores:
[0.92307692 0.87179487 0.92307692]
Average score:
0.905982905982906
Cross-validation AUC:
[0.75       0.83064516 0.85267857]
Average AUC:
0.8111079109062981
F1 average:
0.7039627039627039


In [None]:
# Repeating the process with the support vector machine (SVC).
# The scaler lives inside the pipeline so it is re-fit on each training fold.
svc_pipe_cv = Pipeline([('preprocessing', ct), ('svm', SVC(random_state=42))])
# Shuffled three-fold splitter; the fixed seed gives every call below the same folds.
kfold = KFold(n_splits=3, shuffle=True, random_state=42)
# NOTE(review): this standalone estimator is not used anywhere in this cell. It is
# kept only in case a later cell refers to `svc` - confirm and remove if unused.
svc = SVC(random_state=42)

# Each metric is cross-validated exactly once and the resulting array is reused
# for both the per-fold printout and the average.
svc_accuracy = cross_val_score(svc_pipe_cv, X_train, y_train, cv=kfold)
svc_auc = cross_val_score(svc_pipe_cv, X_train, y_train, cv=kfold, scoring='roc_auc')
svc_f1 = cross_val_score(svc_pipe_cv, X_train, y_train, cv=kfold, scoring='f1')

print("Cross-validation scores:\n{}".format(svc_accuracy))
print('Average score:\n{}'.format(svc_accuracy.mean()))
print("Cross-validation AUC:\n{}".format(svc_auc))
print('Average AUC:\n{}'.format(svc_auc.mean()))
print('F1 average:\n{}'.format(svc_f1.mean()))

Cross-validation scores:
[0.87179487 0.8974359  0.8974359 ]
Average score:
0.8888888888888888
Cross-validation AUC:
[0.81696429 0.91935484 0.82589286]
Average AUC:
0.8540706605222734
F1 average:
0.604040404040404


The neural network produced the greatest accuracy, closely followed by the linear model and the SVC. The gradient boosting classifier had the lowest accuracy.

When it came to AUC, the gradient boosting classifier showed the best performance. Next, the SVC returned the second best score, followed by the linear model. The neural network returned the lowest AUC score.

The best f1-score came from the neural network. The linear model returned an f1 score that was slightly lower than that of the neural network, and the SVC came in third. The gradient boosting classifier had the lowest f1-score.

Based on this initial cross-validation, the neural network and the linear model appear to be performing, on average, better than the SVC and the gradient boosting classifier.

Linear models, as implied by the name, usually model a linear relationship between features and the target. One thing that can make this linear relationship more complex is binning of certain features in the dataset. Essentially, binning will split each feature up into multiple new features, allowing the model to make a representation for each one of those new features. In this case, it is likely to add some complexity to the model, allowing it to consider many more factors about the input data when making decisions.

Adding polynomial features, or powers of the original feature, can also, in many cases, help improve performance by changing the linear model's representation of a feature from a line to a curve. This will create a more complex model.

In [None]:
# Scale the numerical columns with the column transformer, then discretise every
# scaled feature into (at most) five quantile bins with KBinsDiscretizer.
# NOTE(review): both the scaler and the bin edges are fit on all of X_train here,
# so a cross-validation run directly on X_binned has already seen statistics from
# its own validation folds.
kb = KBinsDiscretizer(n_bins=5, strategy='quantile')
X_binned = kb.fit_transform(ct.fit_transform(X_train))

  'decreasing the number of bins.' % jj)


In [None]:
# Running a cross validation with the linear model on the binned data.
# Scaling and binning are placed inside the pipeline and applied to X_train, so
# both are re-fit on the training portion of each fold. Cross-validating on the
# pre-computed X_binned would let the scaler statistics and the quantile bin
# edges see the validation folds (data leakage).
kfold = KFold(n_splits=3, shuffle=True, random_state=42)
log = LogisticRegression(random_state=42)
log_binned_pipe_cv = Pipeline([
    ('preprocessing', ct),
    ('binning', KBinsDiscretizer(n_bins=5, strategy='quantile')),
    ('log', log),
])

# Each metric is cross-validated exactly once and the resulting array is reused
# for both the per-fold printout and the average.
binned_accuracy = cross_val_score(log_binned_pipe_cv, X_train, y_train, cv=kfold)
binned_auc = cross_val_score(log_binned_pipe_cv, X_train, y_train, cv=kfold, scoring='roc_auc')
binned_f1 = cross_val_score(log_binned_pipe_cv, X_train, y_train, cv=kfold, scoring='f1')

print("Cross-validation scores:\n{}".format(binned_accuracy))
print('Average score:\n{}'.format(binned_accuracy.mean()))
print("Cross-validation AUC:\n{}".format(binned_auc))
print('Average AUC:\n{}'.format(binned_auc.mean()))
print('F1 average:\n{}'.format(binned_f1.mean()))

Cross-validation scores:
[0.82051282 0.92307692 0.79487179]
Average score:
0.8461538461538461
Cross-validation AUC:
[0.875      0.86290323 0.78571429]
Average AUC:
0.841205837173579
F1 average:
0.5204795204795205


In [None]:
# Expanding the binned training data with polynomial features up to degree 5
# (no constant bias column).
# NOTE(review): the input here is X_binned, not the original features - confirm
# that stacking the polynomial expansion on top of the binning is intended.
poly = PolynomialFeatures(degree=5, include_bias=False)
X_poly = poly.fit_transform(X_binned)

In [None]:
# Running a cross validation with the linear model on the data holding the
# polynomial features.
# The whole feature construction (scale -> bin -> degree-5 polynomial expansion)
# is placed inside the pipeline and applied to X_train, so the fitted steps are
# re-fit on the training portion of each fold. Cross-validating on the
# pre-computed X_poly would let the scaler statistics and the quantile bin edges
# see the validation folds (data leakage).
# NOTE(review): this keeps the same construction that produced X_poly, i.e. the
# polynomial expansion of the binned data - confirm that is the intended input
# rather than the original features.
kfold = KFold(n_splits=3, shuffle=True, random_state=42)
log = LogisticRegression(random_state=42, max_iter=1000)
log_poly_pipe_cv = Pipeline([
    ('preprocessing', ct),
    ('binning', KBinsDiscretizer(n_bins=5, strategy='quantile')),
    ('poly', PolynomialFeatures(degree=5, include_bias=False)),
    ('log', log),
])

# Each metric is cross-validated exactly once and the resulting array is reused
# for both the per-fold printout and the average.
poly_accuracy = cross_val_score(log_poly_pipe_cv, X_train, y_train, cv=kfold)
poly_auc = cross_val_score(log_poly_pipe_cv, X_train, y_train, cv=kfold, scoring='roc_auc')
poly_f1 = cross_val_score(log_poly_pipe_cv, X_train, y_train, cv=kfold, scoring='f1')

print("Cross-validation scores:\n{}".format(poly_accuracy))
print('Average score:\n{}'.format(poly_accuracy.mean()))
print("Cross-validation AUC:\n{}".format(poly_auc))
print('Average AUC:\n{}'.format(poly_auc.mean()))
print('F1 average:\n{}'.format(poly_f1.mean()))

Cross-validation scores:
[0.84615385 0.82051282 0.84615385]
Average score:
0.8376068376068376
Cross-validation AUC:
[0.79017857 0.84274194 0.81696429]
Average AUC:
0.8166282642089094
F1 average:
0.4444444444444445


Neither binning nor adding polynomials of the original features appears to improve the model's performance on any of the key metrics. This could be because LogisticRegression doesn't simply draw a line for its feature representations, and is likely to build more complex models than, say, LinearRegression. It's also possible that there is a fairly linear relationship between the features and the target.