Загрузка данных

In [2]:
import pandas as pd

# Read the dataset from processed_data.csv (path is relative to the current working directory).
data = pd.read_csv('processed_data.csv')

In [None]:
# Print the schema summary: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1941 entries, 0 to 1940
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                1941 non-null   float64
 1   Salt_Intake        1941 non-null   float64
 2   Stress_Score       1941 non-null   float64
 3   Sleep_Duration     1941 non-null   float64
 4   BMI                1941 non-null   float64
 5   Family_History     1941 non-null   int64  
 6   BP_History_Normal  1941 non-null   int64  
 7   Smoking_Status     1941 non-null   int64  
 8   Has_Hypertension   1941 non-null   int64  
dtypes: float64(5), int64(4)
memory usage: 136.6 KB


In [None]:
# Show the first 10 rows as plain text; to_string() renders every column without truncation.
print(data.head(10).to_string())

        Age  Salt_Intake  Stress_Score  Sleep_Duration       BMI  Family_History  BP_History_Normal  Smoking_Status  Has_Hypertension
0  0.772727    -0.274219           0.9       -0.040121 -0.057862               1                  1               0                 1
1  0.212121     1.649547           1.0       -0.702885 -0.607283               0                  1               0                 0
2  0.909091     0.505686           0.3        0.423814 -1.683232               0                  1               0                 0
3  0.303030     0.765654           1.0       -1.498202 -0.904886               0                  0               0                 1
4  0.348485     0.661667           0.1       -0.437779 -2.255546               0                  0               0                 0
5  0.030303     1.181604           0.3       -0.835438 -0.950671               1                  0               0                 1
6  0.318182     0.193724           0.0        0.887749  0.3542

**Создание нейросетевого классификатора**

---




In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, f1_score, accuracy_score, roc_auc_score, recall_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score
from itertools import permutations
import numpy as np

In [6]:
# Separate the target column from the predictor columns.
y = data['Has_Hypertension']
x = data.drop(columns=['Has_Hypertension'])

# Hold out 30% of the rows for testing. Stratifying on the target keeps the
# class ratio equal in both parts; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
print(f"train: {x_train.shape[0]}\ntest: {x_test.shape[0]}")

train: 1358
test: 583


In [None]:
# Base estimator handed to the hyper-parameter search. hidden_layer_sizes and
# alpha are intentionally left unset here: the search supplies them.
mlp_bin_classifier = MLPClassifier(
    solver='adam',
    activation='relu',
    batch_size='auto',
    learning_rate_init=0.001,
    max_iter=2000,
    n_iter_no_change=5,
    random_state=42,
)

In [None]:
# Search space for the randomized search:
#   * hidden_layer_sizes - every ordered triple of distinct layer widths taken
#     from 32, 48, ..., 1024 (step 16);
#   * alpha - L2 penalty candidates.
classifier_params = {
    'hidden_layer_sizes': list(
        permutations(range(32, 1025, 16), 3)
    ),
    'alpha': [
        0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008,
        0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009,
    ],
}

In [None]:
# Randomized search over the architecture / regularisation space, scored by
# precision with 4-fold stratified cross-validation.
#   * random_state pins which 35 candidates are sampled, so a re-run of the
#     notebook evaluates the same configurations.
#   * refit=False: no final estimator is fitted, so only cv_results_ is
#     available afterwards (best_estimator_ is not).
best_bin_classifier = RandomizedSearchCV(
    mlp_bin_classifier,
    classifier_params,
    cv = StratifiedKFold(n_splits=4),
    scoring='precision',
    n_iter=35,
    n_jobs=2,
    refit=False,
    random_state=42,
    verbose=1
)

In [None]:
# Run the search on the training split only; the test split is not passed to the search.
best_bin_classifier.fit(x_train, y_train)

Fitting 4 folds for each of 35 candidates, totalling 140 fits


In [None]:
# Re-train every candidate whose mean cross-validated precision exceeds 0.76
# (this time with early stopping) and print its precision on the test split.
cv_results = best_bin_classifier.cv_results_

for model_num, mean_score in enumerate(cv_results['mean_test_score']):
    # Guard clause: candidates at or below the threshold are skipped.
    if not mean_score > 0.76:
        continue

    candidate = cv_results['params'][model_num]
    hls = candidate['hidden_layer_sizes']
    a = candidate['alpha']

    test_classifier = MLPClassifier(
        hidden_layer_sizes=hls,
        alpha=a,
        solver='adam',
        activation='relu',
        batch_size='auto',
        learning_rate_init=0.001,
        max_iter=2000,
        early_stopping=True,
        validation_fraction=0.3,
        n_iter_no_change=10,
        random_state=42,
        verbose=0,
    )
    test_classifier.fit(x_train, y_train)

    y_pred = test_classifier.predict(x_test)

    # Output columns: layer sizes, alpha, test precision.
    print(f"({hls[0]:>3}, {hls[1]:>3}, {hls[2]:>3}), {a} {round(precision_score(y_test, y_pred), 3):>12}")

(128, 176, 320), 0.006        0.776
(128, 960, 848), 0.0002        0.743
(672, 1024, 720), 0.0003        0.746
(768, 672, 160), 0.008        0.765
(368, 640, 512), 0.0005        0.719
(864, 976, 480), 0.004        0.732
(208, 432, 704), 0.0009        0.762
( 48, 800, 400), 0.003        0.754
(720, 160, 240), 0.007        0.763


In [7]:
# Hard-coded hyper-parameters of the candidate networks, keyed by model name.
# Each value is [hidden_layer_sizes (three layer widths), alpha]; the entries
# are consumed below as best_params[name][0] and best_params[name][1].
# NOTE(review): presumably collected from earlier search runs - the values are
# not read from best_bin_classifier here; confirm their provenance.
best_params = {
    'model_0': [(448, 144, 160), 0.005],
    'model_1': [(48, 896, 176), 0.0001],
    'model_2': [(48, 864, 912), 0.001],
    'model_3': [(464, 896,  96), 0.0004],
    'model_4': [(1008, 384, 816), 0.007],
    'model_5': [(848, 864, 1008), 0.003],
    'model_6': [(848, 320, 416), 0.001],
    'model_7': [(912, 928, 512), 0.0008],
    'model_8': [(880, 160, 688), 0.0009],
    'model_9': [(400, 656, 992), 0.007],
    'model_10': [(304, 320, 1008), 0.008],
    'model_11': [(304, 336, 880), 0.0005],
    'model_12': [(480, 528, 832), 0.0003],
    'model_13': [(32, 752,  64), 0.008],
    'model_14': [(128, 176, 624), 0.0006],
    'model_15': [(208, 352, 544), 0.0004],
    'model_16': [(752, 976, 1008), 0.002],
    'model_17': [(256, 160, 336), 0.002],
    'model_18': [(96, 832, 592), 0.0005],
    'model_19': [(64, 464, 384), 0.0008],
    'model_20': [(272, 704, 432), 0.0003],
    'model_21': [(96, 864, 1008), 0.0008],
    'model_22': [(64, 592,  80), 0.005],
    'model_23': [(592, 320, 448), 0.0001],
    'model_24': [(960, 112, 928), 0.004],
    'model_25': [(112, 624, 160), 0.0001],
    'model_26': [(64,  48, 224), 0.0007],
    'model_27': [(240, 432, 272), 0.005],
    'model_28': [(384,  32, 992), 0.0007],
    'model_29': [(256, 656, 960), 0.003],
    'model_30': [(832, 464, 736), 0.006],
    'model_31': [(1008, 320, 832), 0.0004],
    'model_32': [(528, 512, 112), 0.001],
    'model_33': [(128, 176, 320), 0.006],
    'model_34': [(768, 672, 160), 0.008],
}

**Исследование устойчивости найденных моделей к шумам в данных**

---



In [None]:
from random import choice

In [None]:
# Corrupt n percent of the rows of a data set, touching m columns per row.
# df - data to corrupt
# n  - percentage of rows that receive noise
# m  - number of columns corrupted within each selected row
# Returns the corrupted copy of the data set.
def make_data_noise(df: pd.DataFrame, n: float, m: int, random_state=None) -> pd.DataFrame:
  """Return a copy of ``df`` in which ``n`` percent of the rows are corrupted.

  For every selected row, ``m`` distinct columns are drawn at random:

  * values in integer columns are treated as binary flags and flipped
    (``value ^ 1``);
  * values in all other columns are moved by 1%-5% of their own value, with a
    random sign.

  Parameters
  ----------
  df : pd.DataFrame
      Source data. It is never modified; a copy is returned.
  n : float
      Percentage of rows to corrupt, within ``0..100``. The row count is
      rounded to the nearest integer.
  m : int
      Number of distinct columns to corrupt in each selected row, within
      ``0..df.shape[1]``.
  random_state : None, int or numpy.random.Generator, optional
      Seed (or generator) for all random draws. With the default ``None`` the
      result differs between calls; pass an int for reproducible noise.

  Returns
  -------
  pd.DataFrame
      Corrupted copy with the same index, columns and dtypes as ``df``.

  Raises
  ------
  ValueError
      If ``n`` or ``m`` is outside its valid range.
  """
  if not 0 <= n <= 100:
    raise ValueError(f"n must be a percentage between 0 and 100, got {n!r}")
  if not 0 <= m <= df.shape[1]:
    raise ValueError(f"m must be between 0 and the number of columns ({df.shape[1]}), got {m!r}")

  # A single local generator replaces the previous mix of random.choice and
  # np.random.choice, so one seed controls every draw.
  rng = np.random.default_rng(random_state)
  temp = df.copy()  # work on a copy so the caller's frame stays intact

  # Number of rows that must be corrupted.
  count_of_lines = int(round(temp.shape[0] * n / 100, 0))
  if count_of_lines == 0 or m == 0:
    return temp

  # Resolve the dtype test once per column instead of once per cell; any
  # integer width counts as a flag column, not only int64.
  is_flag_column = [pd.api.types.is_integer_dtype(dtype) for dtype in temp.dtypes]
  relative_steps = (0.01, 0.02, 0.03, 0.04, 0.05)

  # Rows and columns are addressed by position, so a non-unique index or
  # duplicated column labels cannot make one assignment hit several cells.
  for row in map(int, rng.choice(temp.shape[0], size=count_of_lines, replace=False)):
    for col in map(int, rng.choice(temp.shape[1], size=m, replace=False)):
      a = temp.iat[row, col]
      if is_flag_column[col]:
        # Categorical (0/1) values are switched to the opposite value.
        temp.iat[row, col] = a ^ 1
      else:
        # Numeric values are shifted by 1-5% of the original value.
        temp.iat[row, col] = a + rng.choice(relative_steps) * rng.choice((-1, 1)) * a

  return temp

In [8]:
# Fit one network per stored (hidden_layer_sizes, alpha) pair on the training
# split. All other settings are shared; 30% of the training rows are set aside
# internally for early stopping.
best_models = {
    name: MLPClassifier(
        hidden_layer_sizes=layers,
        alpha=penalty,
        solver='adam',
        activation='relu',
        batch_size='auto',
        learning_rate_init=0.001,
        max_iter=2000,
        early_stopping=True,
        validation_fraction=0.3,
        n_iter_no_change=10,
        random_state=42,
    ).fit(x_train, y_train)
    for name, (layers, penalty) in best_params.items()
}

In [9]:
# Summary table: one row per model with its three layer widths, alpha and the
# precision it reaches on the test split.
temp_data = pd.DataFrame(
    {
        'layer_1': [layers[0] for layers, _ in best_params.values()],
        'layer_2': [layers[1] for layers, _ in best_params.values()],
        'layer_3': [layers[2] for layers, _ in best_params.values()],
        'alpha': [penalty for _, penalty in best_params.values()],
        'precision': [
            round(precision_score(y_test, fitted.predict(x_test)), 3)
            for fitted in best_models.values()
        ],
    },
    index=best_params.keys(),
)

In [10]:
# Save the summary table as an Excel workbook in the current working directory.
temp_data.to_excel('best_models.xlsx')

In [11]:
# Print the full summary table as plain text (no row/column truncation).
print(temp_data.to_string())

          layer_1  layer_2  layer_3   alpha  precision
model_0       448      144      160  0.0050      0.770
model_1        48      896      176  0.0001      0.803
model_2        48      864      912  0.0010      0.799
model_3       464      896       96  0.0004      0.769
model_4      1008      384      816  0.0070      0.765
model_5       848      864     1008  0.0030      0.768
model_6       848      320      416  0.0010      0.762
model_7       912      928      512  0.0008      0.785
model_8       880      160      688  0.0009      0.775
model_9       400      656      992  0.0070      0.774
model_10      304      320     1008  0.0080      0.766
model_11      304      336      880  0.0005      0.775
model_12      480      528      832  0.0003      0.763
model_13       32      752       64  0.0080      0.772
model_14      128      176      624  0.0006      0.792
model_15      208      352      544  0.0004      0.778
model_16      752      976     1008  0.0020      0.794
model_17  

Зашумление 1 столбца

In [None]:
# Precision of every model on the clean test split and on test splits where
# 5%..30% of the rows have ONE corrupted column. Each noisy score is the mean
# over 50 independently corrupted copies.
score_columns = (
    'prec_orig',
    'prec_noise_5',
    'prec_noise_10',
    'prec_noise_15',
    'prec_noise_20',
    'prec_noise_25',
    'prec_noise_30',
)
models_prec_score_1 = {column: [] for column in score_columns}

for name, fitted in best_models.items():
    clean_precision = precision_score(y_test, fitted.predict(x_test))
    models_prec_score_1['prec_orig'].append(round(clean_precision, 3))

    for i in range(5, 31, 5):
        noisy_scores = []
        for _ in range(50):
            noisy_test = make_data_noise(x_test, i, 1)
            noisy_scores.append(precision_score(y_test, fitted.predict(noisy_test)))
        temp = np.array(noisy_scores)
        models_prec_score_1['prec_noise_' + str(i)].append(round(temp.mean(), 3))

In [None]:
# Turn the per-model score lists (1 noisy column) into a table indexed by model name.
models_prec_score_data = pd.DataFrame(models_prec_score_1, index=best_models.keys())

In [None]:
# max_diff = clean precision minus the lowest precision in the row.
# An existing 'max_diff' column is excluded from the row minimum, so running
# this cell again gives the same answer instead of feeding the previous
# max_diff back into the minimum. min(axis=1) replaces the transpose.
models_prec_score_data['max_diff'] = (
    models_prec_score_data['prec_orig']
    - models_prec_score_data.drop(columns='max_diff', errors='ignore').min(axis=1)
)

In [None]:
# Print the full robustness table (1 noisy column) as plain text.
print(models_prec_score_data.to_string())

          prec_orig  prec_noise_5  prec_noise_10  prec_noise_15  prec_noise_20  prec_noise_25  prec_noise_30  max_diff
model_0       0.770         0.769          0.767          0.764          0.761          0.759          0.755     0.015
model_1       0.803         0.801          0.797          0.793          0.793          0.789          0.785     0.018
model_2       0.799         0.795          0.792          0.791          0.787          0.784          0.780     0.019
model_3       0.769         0.766          0.763          0.761          0.758          0.755          0.752     0.017
model_4       0.765         0.762          0.761          0.758          0.757          0.753          0.752     0.013
model_5       0.768         0.764          0.761          0.759          0.753          0.751          0.746     0.022
model_6       0.762         0.760          0.756          0.756          0.751          0.750          0.748     0.014
model_7       0.785         0.781          0.780

In [None]:
# Persist the 1-column results; the file is read back in a later cell for ranking.
models_prec_score_data.to_csv('models_prec_score_1.csv')

Зашумление 2 столбцов

In [None]:
# Precision of every model on the clean test split and on test splits where
# 5%..30% of the rows have TWO corrupted columns. Each noisy score is the mean
# over 50 independently corrupted copies.
score_columns = (
    'prec_orig',
    'prec_noise_5',
    'prec_noise_10',
    'prec_noise_15',
    'prec_noise_20',
    'prec_noise_25',
    'prec_noise_30',
)
models_prec_score_2 = {column: [] for column in score_columns}

for name, fitted in best_models.items():
    clean_precision = precision_score(y_test, fitted.predict(x_test))
    models_prec_score_2['prec_orig'].append(round(clean_precision, 3))

    for i in range(5, 31, 5):
        noisy_scores = []
        for _ in range(50):
            noisy_test = make_data_noise(x_test, i, 2)
            noisy_scores.append(precision_score(y_test, fitted.predict(noisy_test)))
        temp = np.array(noisy_scores)
        models_prec_score_2['prec_noise_' + str(i)].append(round(temp.mean(), 3))

In [None]:
# Turn the per-model score lists (2 noisy columns) into a table indexed by model name.
models_prec_score_data = pd.DataFrame(models_prec_score_2, index=best_models.keys())

In [None]:
# max_diff = clean precision minus the lowest precision in the row.
# An existing 'max_diff' column is excluded from the row minimum, so running
# this cell again gives the same answer instead of feeding the previous
# max_diff back into the minimum. min(axis=1) replaces the transpose.
models_prec_score_data['max_diff'] = (
    models_prec_score_data['prec_orig']
    - models_prec_score_data.drop(columns='max_diff', errors='ignore').min(axis=1)
)

In [None]:
# Print the full robustness table (2 noisy columns) as plain text.
print(models_prec_score_data.to_string())

          prec_orig  prec_noise_5  prec_noise_10  prec_noise_15  prec_noise_20  prec_noise_25  prec_noise_30  max_diff
model_0       0.770         0.767          0.761          0.758          0.752          0.750          0.746     0.024
model_1       0.803         0.796          0.791          0.788          0.781          0.777          0.772     0.031
model_2       0.799         0.793          0.790          0.784          0.775          0.774          0.766     0.033
model_3       0.769         0.765          0.757          0.753          0.747          0.744          0.738     0.031
model_4       0.765         0.761          0.757          0.752          0.747          0.745          0.739     0.026
model_5       0.768         0.760          0.756          0.747          0.743          0.737          0.730     0.038
model_6       0.762         0.757          0.752          0.749          0.743          0.739          0.736     0.026
model_7       0.785         0.778          0.775

In [None]:
# Persist the 2-column results; the file is read back in a later cell for ranking.
models_prec_score_data.to_csv('models_prec_score_2.csv')

Зашумление 3 столбцов

In [None]:
# Precision of every model on the clean test split and on test splits where
# 5%..30% of the rows have THREE corrupted columns. Each noisy score is the
# mean over 50 independently corrupted copies.
score_columns = (
    'prec_orig',
    'prec_noise_5',
    'prec_noise_10',
    'prec_noise_15',
    'prec_noise_20',
    'prec_noise_25',
    'prec_noise_30',
)
models_prec_score_3 = {column: [] for column in score_columns}

for name, fitted in best_models.items():
    clean_precision = precision_score(y_test, fitted.predict(x_test))
    models_prec_score_3['prec_orig'].append(round(clean_precision, 3))

    for i in range(5, 31, 5):
        noisy_scores = []
        for _ in range(50):
            noisy_test = make_data_noise(x_test, i, 3)
            noisy_scores.append(precision_score(y_test, fitted.predict(noisy_test)))
        temp = np.array(noisy_scores)
        models_prec_score_3['prec_noise_' + str(i)].append(round(temp.mean(), 3))

In [None]:
# Turn the per-model score lists (3 noisy columns) into a table indexed by model name.
models_prec_score_data = pd.DataFrame(models_prec_score_3, index=best_models.keys())

In [None]:
# max_diff = clean precision minus the lowest precision in the row.
# An existing 'max_diff' column is excluded from the row minimum, so running
# this cell again gives the same answer instead of feeding the previous
# max_diff back into the minimum. min(axis=1) replaces the transpose.
models_prec_score_data['max_diff'] = (
    models_prec_score_data['prec_orig']
    - models_prec_score_data.drop(columns='max_diff', errors='ignore').min(axis=1)
)

In [None]:
# Print the full robustness table (3 noisy columns) as plain text.
print(models_prec_score_data.to_string())

          prec_orig  prec_noise_5  prec_noise_10  prec_noise_15  prec_noise_20  prec_noise_25  prec_noise_30  max_diff
model_0       0.770         0.763          0.757          0.753          0.746          0.736          0.733     0.037
model_1       0.803         0.794          0.786          0.777          0.771          0.761          0.755     0.048
model_2       0.799         0.792          0.785          0.775          0.768          0.759          0.754     0.045
model_3       0.769         0.761          0.752          0.747          0.736          0.732          0.724     0.045
model_4       0.765         0.759          0.753          0.744          0.740          0.735          0.727     0.038
model_5       0.768         0.759          0.751          0.741          0.733          0.726          0.715     0.053
model_6       0.762         0.756          0.748          0.741          0.735          0.728          0.723     0.039
model_7       0.785         0.776          0.770

In [None]:
# Persist the 3-column results; the file is read back in a later cell for ranking.
models_prec_score_data.to_csv('models_prec_score_3.csv')

In [None]:
# Reload the three saved result tables (1, 2 and 3 noisy columns); the first
# CSV column holds the model names and becomes the index.
models_prec_score_data_1, models_prec_score_data_2, models_prec_score_data_3 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'models_prec_score_1.csv',
        'models_prec_score_2.csv',
        'models_prec_score_3.csv',
    )
)

In [None]:
# Rank models for 1 noisy column: smallest precision drop first, ties broken by higher clean precision.
print(models_prec_score_data_1.sort_values(by=['max_diff', 'prec_orig'], ascending=[True, False]).to_string())

          prec_orig  prec_noise_5  prec_noise_10  prec_noise_15  prec_noise_20  prec_noise_25  prec_noise_30  max_diff
model_10      0.766         0.763          0.762          0.759          0.757          0.756          0.753     0.013
model_4       0.765         0.762          0.761          0.758          0.757          0.753          0.752     0.013
model_22      0.762         0.759          0.757          0.754          0.752          0.751          0.749     0.013
model_8       0.775         0.773          0.770          0.769          0.764          0.763          0.761     0.014
model_11      0.775         0.773          0.771          0.767          0.766          0.762          0.761     0.014
model_9       0.774         0.772          0.770          0.766          0.764          0.763          0.760     0.014
model_28      0.772         0.771          0.767          0.766          0.763          0.761          0.758     0.014
model_34      0.765         0.762          0.762

In [None]:
# Rank models for 2 noisy columns: smallest precision drop first, ties broken by higher clean precision.
print(models_prec_score_data_2.sort_values(by=['max_diff', 'prec_orig'], ascending=[True, False]).to_string())

          prec_orig  prec_noise_5  prec_noise_10  prec_noise_15  prec_noise_20  prec_noise_25  prec_noise_30  max_diff
model_0       0.770         0.767          0.761          0.758          0.752          0.750          0.746     0.024
model_8       0.775         0.771          0.765          0.760          0.756          0.753          0.749     0.026
model_4       0.765         0.761          0.757          0.752          0.747          0.745          0.739     0.026
model_6       0.762         0.757          0.752          0.749          0.743          0.739          0.736     0.026
model_10      0.766         0.762          0.758          0.752          0.747          0.744          0.739     0.027
model_34      0.765         0.761          0.755          0.752          0.748          0.743          0.738     0.027
model_11      0.775         0.771          0.766          0.762          0.757          0.753          0.747     0.028
model_9       0.774         0.769          0.765

In [None]:
# Rank models for 3 noisy columns: smallest precision drop first, ties broken by higher clean precision.
print(models_prec_score_data_3.sort_values(by=['max_diff', 'prec_orig'], ascending=[True, False]).to_string())

          prec_orig  prec_noise_5  prec_noise_10  prec_noise_15  prec_noise_20  prec_noise_25  prec_noise_30  max_diff
model_28      0.772         0.765          0.760          0.752          0.744          0.741          0.735     0.037
model_0       0.770         0.763          0.757          0.753          0.746          0.736          0.733     0.037
model_4       0.765         0.759          0.753          0.744          0.740          0.735          0.727     0.038
model_8       0.775         0.769          0.761          0.754          0.746          0.745          0.736     0.039
model_31      0.764         0.756          0.751          0.744          0.735          0.733          0.725     0.039
model_6       0.762         0.756          0.748          0.741          0.735          0.728          0.723     0.039
model_23      0.777         0.769          0.761          0.755          0.749          0.739          0.737     0.040
model_11      0.775         0.769          0.761

Модели, отобранные для дальнейшей проверки: model_28, model_0, model_8, model_23, model_11, model_16, model_14

In [None]:
# Names of the shortlisted models; each must be a key of best_models, which is indexed with them below.
temp_models = ['model_0', 'model_8', 'model_11', 'model_14', 'model_16', 'model_23', 'model_28']

In [None]:
# One hand-entered sample, first in raw units, then rescaled with hard-coded
# constants: Age is min-max scaled with bounds 18 and 84, Stress_Score is
# divided by 10, the remaining numeric features are shifted and divided by
# fixed constants.
# NOTE(review): the constants presumably match the preprocessing that produced
# the training data - confirm against the preprocessing code.
my_data = pd.DataFrame({
    'Age': [22],
    'Salt_Intake': [8],
    'Stress_Score': [3],
    'Sleep_Duration': [7.05],
    'BMI': [25.6],
    'Family_History': [0],
    'BP_History_Normal': [1],
    'Smoking_Status': [0]
})

# assign() keeps every column in its original position.
my_data = my_data.assign(
    Age=(my_data['Age'] - 18) / (84 - 18),
    Salt_Intake=(my_data['Salt_Intake'] - 8.5274) / 1.9233,
    Stress_Score=my_data['Stress_Score'] / 10,
    Sleep_Duration=(my_data['Sleep_Duration'] - 6.4605) / 1.5088,
    BMI=(my_data['BMI'] - 26.0528) / 4.3682,
)

In [None]:
# For each shortlisted model: test-set accuracy and precision, followed by the
# class probabilities it predicts for the hand-entered sample.
for model in temp_models:
    fitted = best_models[model]
    y_pred = fitted.predict(x_test)

    precision = round(precision_score(y_test, y_pred), 3)
    acc = round(accuracy_score(y_test, y_pred), 3)

    print(model)
    print(f"accuracy: {acc:>6}\nprecision: {precision:>5}")
    print(np.round(fitted.predict_proba(my_data), 4))
    print()

model_0
accuracy:  0.768
precision:  0.77
[[9.993e-01 7.000e-04]]

model_8
accuracy:  0.775
precision: 0.775
[[0.9943 0.0057]]

model_11
accuracy:  0.772
precision: 0.775
[[0.9983 0.0017]]

model_14
accuracy:  0.762
precision: 0.792
[[0.9851 0.0149]]

model_16
accuracy:  0.762
precision: 0.794
[[1. 0.]]

model_23
accuracy:  0.763
precision: 0.777
[[0.9813 0.0187]]

model_28
accuracy:  0.786
precision: 0.772
[[0.9934 0.0066]]

