### Import libraries:

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, matthews_corrcoef

### Load data:

In [2]:
df = pd.read_csv('fetal_health.csv', header=0)
display(df.head())

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [3]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 22 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   baseline value                                          2126 non-null   float64
 1   accelerations                                           2126 non-null   float64
 2   fetal_movement                                          2126 non-null   float64
 3   uterine_contractions                                    2126 non-null   float64
 4   light_decelerations                                     2126 non-null   float64
 5   severe_decelerations                                    2126 non-null   float64
 6   prolongued_decelerations                                2126 non-null   float64
 7   abnormal_short_term_variability                         2126 non-null   float64
 8   mean_value_of_short_term_variability  

In [4]:
print(df['fetal_health'].value_counts())

1.0    1655
2.0     295
3.0     176
Name: fetal_health, dtype: int64


#### Get X and y:

In [5]:
X = df.loc[:, :'histogram_tendency']
y = df['fetal_health']
print(X.shape)
print(y[:5])

(2126, 21)
0    2.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: fetal_health, dtype: float64


#### How to proceed:

We will proceed from getting features which contribute better to the model, then conduct random search for the best parameters for two models: GradientBoostingClassifier and SVC. The metric used here is balanced accuracy and F1-score.

### Feature extraction:

#### ANOVA:

In [6]:
selector = SelectKBest(score_func= f_classif, k=10).fit(X,y)
anova_selected_features = X.columns[selector.get_support()]

In [7]:
print(anova_selected_features)

Index(['baseline value', 'accelerations', 'prolongued_decelerations',
       'abnormal_short_term_variability',
       'mean_value_of_short_term_variability',
       'percentage_of_time_with_abnormal_long_term_variability',
       'histogram_mode', 'histogram_mean', 'histogram_median',
       'histogram_variance'],
      dtype='object')


In [8]:
X_chosen = X[anova_selected_features]
display(X_chosen.head())

Unnamed: 0,baseline value,accelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,histogram_mode,histogram_mean,histogram_median,histogram_variance
0,120.0,0.0,0.0,73.0,0.5,43.0,120.0,137.0,121.0,73.0
1,132.0,0.006,0.0,17.0,2.1,0.0,141.0,136.0,140.0,12.0
2,133.0,0.003,0.0,16.0,2.1,0.0,141.0,135.0,138.0,13.0
3,134.0,0.003,0.0,16.0,2.4,0.0,137.0,134.0,137.0,13.0
4,132.0,0.007,0.0,16.0,2.4,0.0,137.0,136.0,138.0,11.0


### Make pipeline and set parameters for random search:

#### Pipelines:

In [9]:
pipe_svc = Pipeline([('scaler', MinMaxScaler(feature_range=(0, 1), copy=True)),
                 ('svc', SVC(kernel='rbf'))])

In [10]:
pipe_gbc = Pipeline([('scaler', MinMaxScaler(feature_range=(0, 1), copy=True)),
                 ('gbc', GradientBoostingClassifier())])

#### Hyper parameters:

In [18]:
C_grid = [1e0, 1e1, 1e2, 1e3, 1e4]
g_grid = ['scale', 1e-2, 1e-1, 1e0]

# Hyper paramet settings for grid search
param_grid_svc = {
    'svc__C': C_grid,
    'svc__gamma' : g_grid,
}

In [19]:
param_grid_gbc = {
    'gbc__learning_rate':[0.1,0.05,0.005],     
    'gbc__n_estimators':[100,700,1500]}  

#### Preparation of objects for cross validation

In [13]:
grid_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=11)  # for grid search
gen_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=61)  # for estimation of generalization performance

#### Define the random search for hyper parameters

In [14]:
rs_svc = GridSearchCV(pipe_svc, param_grid_svc, cv = grid_cv, scoring="balanced_accuracy").fit(X_chosen, y)

In [25]:
rs_gbc = RandomizedSearchCV(pipe_gbc, param_grid_gbc, n_iter=7, cv = grid_cv, scoring="balanced_accuracy").fit(X_chosen, y)

#### Estimation of generalization performance:

In [26]:
nested_score_svc = cross_val_score(rs_svc, X=X_chosen, y=y, cv=gen_cv, scoring='balanced_accuracy')
print(nested_score_svc.mean())

0.8769224927021638


In [27]:
nested_score_gbc = cross_val_score(rs_gbc, X=X_chosen, y=y, cv=gen_cv, scoring='balanced_accuracy')
print(nested_score_gbc.mean())

0.9074207554405362


#### Best hyperparameters:

In [29]:
print(rs_svc.best_params_)
print(rs_gbc.best_params_)

{'svc__C': 1000.0, 'svc__gamma': 'scale'}
{'gbc__n_estimators': 700, 'gbc__learning_rate': 0.1}


Gradient Boost Classifier with n_estimators=700 and learning_rate=0.1 works better than SVC in this dataset.

### Check the result with best model attained:

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=11)

results = pipe_gbc.fit(X_train, y_train)
y_pred = results.predict(X_test)

In [41]:
display(pd.crosstab(y_pred, y_test))

fetal_health,1.0,2.0,3.0
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,397,17,2
2.0,16,55,4
3.0,1,2,38


#### Check F1-score and Matthews correlation coefficient

In [42]:
print('F1-score:', f1_score(y_pred, y_test, average='weighted'))
print('Matthews correlation coefficient:', matthews_corrcoef(y_pred, y_test))

F1-score: 0.9210236420154406
Matthews correlation coefficient: 0.7840784249337324
