In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from sklearn.metrics import balanced_accuracy_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
path = Path().absolute().parent/'data'/'processed_train_data.csv'
data = pd.read_csv(path, index_col=0)
data.head()

Unnamed: 0_level_0,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,outcome,abdomen_distend_large,abdomen_distend_small,...,obturation,intrinsic,extrinsic,adynamic,volvulus/torsion,intussuption,thromboembolic,hernia,lipoma/slenic_incarceration,displacement
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,38.1,132.0,24.0,6.5,57.0,8.5,3.4,died,0,1,...,0,0,0,0,0,0,0,0,1,0
1,37.5,88.0,12.0,2.0,33.0,64.0,2.0,euthanized,0,1,...,0,0,0,0,0,0,0,1,0,0
2,38.3,120.0,28.0,3.5,37.0,6.4,3.4,lived,1,0,...,0,0,0,1,0,0,0,0,0,0
3,37.1,72.0,30.0,2.0,53.0,7.0,3.9,lived,0,1,...,0,0,0,0,0,0,0,1,0,0
4,38.0,52.0,48.0,7.0,47.0,7.3,2.6,lived,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
le = LabelEncoder()
y = le.fit_transform(data['outcome'])
X = data.drop(columns='outcome')

In [4]:
le_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
le_mapping 

{'died': 0, 'euthanized': 1, 'lived': 2}

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
xgb_clf = xgb.XGBClassifier(objective='multi:softmax', num_class=3)
parameters = {
    'max_depth': [5, 9, 13],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 500]
}
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, scoring='accuracy', cv=5, verbose=10)

In [7]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5; 1/27] START learning_rate=0.1, max_depth=5, n_estimators=100...........
[CV 1/5; 1/27] END learning_rate=0.1, max_depth=5, n_estimators=100;, score=0.712 total time=   0.4s
[CV 2/5; 1/27] START learning_rate=0.1, max_depth=5, n_estimators=100...........
[CV 2/5; 1/27] END learning_rate=0.1, max_depth=5, n_estimators=100;, score=0.692 total time=   0.6s
[CV 3/5; 1/27] START learning_rate=0.1, max_depth=5, n_estimators=100...........
[CV 3/5; 1/27] END learning_rate=0.1, max_depth=5, n_estimators=100;, score=0.702 total time=   0.2s
[CV 4/5; 1/27] START learning_rate=0.1, max_depth=5, n_estimators=100...........
[CV 4/5; 1/27] END learning_rate=0.1, max_depth=5, n_estimators=100;, score=0.756 total time=   0.2s
[CV 5/5; 1/27] START learning_rate=0.1, max_depth=5, n_estimators=100...........
[CV 5/5; 1/27] END learning_rate=0.1, max_depth=5, n_estimators=100;, score=0.660 total time=   0.2s
[CV 1/5; 2/27] START learnin

In [8]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Лучшие параметры:", best_params)
print("Точность на обучающем наборе:", best_score)

y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Точность на тестовом наборе:", accuracy)

Лучшие параметры: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 500}
Точность на обучающем наборе: 0.7095421217248628
Точность на тестовом наборе: 0.7004048582995951


In [9]:
model = xgb.XGBClassifier(objective='multi:softmax', num_class=3, **best_params)

In [10]:
model.fit(X_train, y_train)

In [11]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Точность на тестовом наборе:", accuracy)

Точность на тестовом наборе: 0.7004048582995951


In [12]:
# import pickle
# model_path = Path().absolute().parent/'saved_models'/'xgb.pickle'
# pickle.dump(model, open(model_path, "wb"))

In [13]:
data.head()

Unnamed: 0_level_0,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,outcome,abdomen_distend_large,abdomen_distend_small,...,obturation,intrinsic,extrinsic,adynamic,volvulus/torsion,intussuption,thromboembolic,hernia,lipoma/slenic_incarceration,displacement
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,38.1,132.0,24.0,6.5,57.0,8.5,3.4,died,0,1,...,0,0,0,0,0,0,0,0,1,0
1,37.5,88.0,12.0,2.0,33.0,64.0,2.0,euthanized,0,1,...,0,0,0,0,0,0,0,1,0,0
2,38.3,120.0,28.0,3.5,37.0,6.4,3.4,lived,1,0,...,0,0,0,1,0,0,0,0,0,0
3,37.1,72.0,30.0,2.0,53.0,7.0,3.9,lived,0,1,...,0,0,0,0,0,0,0,1,0,0
4,38.0,52.0,48.0,7.0,47.0,7.3,2.6,lived,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
features = 'rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,outcome,abdomen_distend_large,abdomen_distend_small,abdomen_firm,abdomen_normal,abdomen_other,abdominal_distention_moderate,abdominal_distention_none,abdominal_distention_severe,abdominal_distention_slight,abdomo_appearance_clear,abdomo_appearance_cloudy,abdomo_appearance_serosanguious,age_adult,age_young,capillary_refill_time_3,capillary_refill_time_less_3_sec,capillary_refill_time_more_3_sec,cp_data_no,cp_data_yes,mucous_membrane_bright_pink,mucous_membrane_bright_red,mucous_membrane_dark_cyanotic,mucous_membrane_normal_pink,mucous_membrane_pale_cyanotic,mucous_membrane_pale_pink,nasogastric_reflux_less_1_liter,nasogastric_reflux_more_1_liter,nasogastric_reflux_none,nasogastric_tube_none,nasogastric_tube_significant,nasogastric_tube_slight,pain_alert,pain_depressed,pain_extreme_pain,pain_mild_pain,pain_severe_pain,peripheral_pulse_absent,peripheral_pulse_increased,peripheral_pulse_normal,peripheral_pulse_reduced,peristalsis_absent,peristalsis_hypermotile,peristalsis_hypomotile,peristalsis_normal,rectal_exam_feces_absent,rectal_exam_feces_decreased,rectal_exam_feces_increased,rectal_exam_feces_normal,surgery_no,surgery_yes,surgical_lesion_no,surgical_lesion_yes,temp_of_extremities_cold,temp_of_extremities_cool,temp_of_extremities_normal,temp_of_extremities_warm,gastric,sm_intestine,lg_colon,lg_colon_and_cecum,cecum,transverse_colon,retum/descending_colon,uterus,bladder,all_intestinal_sites,simple,strangulation,inflammation,other,mechanical,paralytic,obturation,intrinsic,extrinsic,adynamic,volvulus/torsion,intussuption,thromboembolic,hernia,lipoma/slenic_incarceration,displacement'
features = features.split(',')
for idx, feature in enumerate(features):
    if feature == 'temp_of_extremities_normal':
        print(idx)
        break
features.remove('outcome')
X_ = np.array([[np.NAN] * len(features)])

62


In [35]:
X_[0][62] = 1
X_

array([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,  1., nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]])

In [36]:
y_pred = model.predict(X_)
y_pred

array([1], dtype=int32)