In [306]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [273]:
df = pd.read_csv("cirrhosis.csv")
df.head()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


Задача состоит в обучении модели прогнозирования выживания пациентов с циррозом печени на основе имеющихся данных из исследований, проведенных Майской клиникой (Mayo Clinic) в период с 1974 по 1984 год.

Датасет состоит из численных и категориальных данных. Расшифруем колонки с категориальными данными:
- Status: 3 класса - C (censored), CL (cencored due to liver tx), D (death)
- Drug: 2 класса - D-penicillamine, Placebo
- Sex: 2 класса - M (Male), F (Female)
- Ascites: N (No), Y (Yes)
- Hepatomegaly: N (No), Y (Yes)
- Spiders: N (No), Y (Yes)
- Edema: N (No), S (present without diuretics or resolved by diuretics), Y (Yes)

При открытии csv файла заметил пустые поля. Посмотрим в каких колонках они встречаются.

In [274]:
empty_cols = df.columns[df.isna().any()].tolist()
for col in empty_cols:
    print(df[col].dtype, col)

object Drug
object Ascites
object Hepatomegaly
object Spiders
float64 Cholesterol
float64 Copper
float64 Alk_Phos
float64 SGOT
float64 Tryglicerides
float64 Platelets
float64 Prothrombin
float64 Stage


Выяснили, что числовые данные отсутствуют в колонках Cholesterol, Copper, Alk_Phos, SGOT, Tryglicerides, Platelets, Prothrombin, Stage

Для импутации числовых данных воспользуемся *KNNImputer* из библиотеки sklearn.
Но перед этим посмотрим на матрицу корреляции, чтобы понять, какие колонки больше всего коррелируют с каждой из колонок с отсутствующими данными. По ним и будем искать KNN.

In [275]:
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,ID,N_Days,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
ID,1.0,-0.354305,0.037136,-0.062154,0.032897,-0.128924,-0.098663,-0.352856,-0.012097,-0.0341,-0.076699,-0.19193,-0.033757
N_Days,-0.354305,1.0,-0.125934,-0.403953,-0.138236,0.430829,-0.364809,0.149269,-0.225492,-0.153,0.151361,-0.11147,-0.366193
Age,0.037136,-0.125934,1.0,0.002362,-0.15762,-0.18235,0.061549,-0.047247,-0.149869,0.022065,-0.148201,0.11376,0.189083
Bilirubin,-0.062154,-0.403953,0.002362,1.0,0.397129,-0.314177,0.456918,0.116984,0.44173,0.436748,-0.013435,0.314894,0.200731
Cholesterol,0.032897,-0.138236,-0.15762,0.397129,1.0,-0.069733,0.126115,0.149473,0.353246,0.27683,0.19171,-0.030811,0.011164
Albumin,-0.128924,0.430829,-0.18235,-0.314177,-0.069733,1.0,-0.264771,-0.101456,-0.220047,-0.103417,0.158659,-0.200592,-0.305296
Copper,-0.098663,-0.364809,0.061549,0.456918,0.126115,-0.264771,1.0,0.187357,0.293829,0.279852,-0.064403,0.218224,0.2694
Alk_Phos,-0.352856,0.149269,-0.047247,0.116984,0.149473,-0.101456,0.187357,1.0,0.112217,0.180082,0.143733,0.089384,0.041273
SGOT,-0.012097,-0.225492,-0.149869,0.44173,0.353246,-0.220047,0.293829,0.112217,1.0,0.126119,-0.120147,0.112174,0.164945
Tryglicerides,-0.0341,-0.153,0.022065,0.436748,0.27683,-0.103417,0.279852,0.180082,0.126119,1.0,0.103212,0.020122,0.123899


Ниже приведены колонки, у которых наибольшая корреляция к колонкам с числовыми значениями.

- Cholesterol: Bilirubin, SGOT, Tryglicerides;
- Copper: Bilirubin, SGOT, Tryglicerides;
- Alk_Phos: Copper, Tryglicerides, Cholesterol;
- SGOT: Bilirubin, Cholesterol, Copper;
- Tryglicerides: Bilirubin, Copper, Cholesterol;
- Platelets: Cholesterol, N_Days, Alk_Phos;
- Prothrombin: Bilirubin, Copper, Stage;
- Stage: Copper, Bilirubin, Age.

In [276]:
# Создаем список колонок, где первая колонка - искомая, а 3 последующие - опора (самая большая корреляция).

impute_cols = [
    ["Cholesterol", "Bilirubin", "SGOT", "Tryglicerides"],
    ["Copper", "Bilirubin", "SGOT", "Tryglicerides"],
    ["Alk_Phos", "Copper", "Tryglicerides", "Cholesterol"],
    ["SGOT", "Bilirubin", "Cholesterol", "Copper"],
    ["Tryglicerides", "Bilirubin", "Copper", "Cholesterol"],
    ["Platelets", "Cholesterol", "N_Days", "Alk_Phos"],
    ["Prothrombin", "Bilirubin", "Copper", "Stage"],
    ["Stage", "Copper", "Bilirubin", "Age"]
]

В качестве обучающих данных будем использовать только ту часть датасета, в которой совсем нет пропущенных данных в каждой из строк рассматриваемых колонок.
Далее, исходя из полученного KNNImputer, вставляем данные для рассматриваемой колонки для всего датасета.

In [277]:
for cols in impute_cols:
    print(f"Вставляем значения для \033[1m{cols[0]}\033[0m, опираясь на \033[1m{', '.join(cols[1:])}\033[0m")
    arr = df.loc[:, cols].dropna(axis="rows").to_numpy()
    print(f"Заполненных значений: {arr.shape[0]}")
    imputer = KNNImputer(n_neighbors=3, weights="uniform")
    imputer.fit(arr)
    df.loc[:, cols[0]] = imputer.transform(df.loc[:, cols].to_numpy())[:, 0]

Вставляем значения для [1mCholesterol[0m, опираясь на [1mBilirubin, SGOT, Tryglicerides[0m
Заполненных значений: 282
Вставляем значения для [1mCopper[0m, опираясь на [1mBilirubin, SGOT, Tryglicerides[0m
Заполненных значений: 280
Вставляем значения для [1mAlk_Phos[0m, опираясь на [1mCopper, Tryglicerides, Cholesterol[0m
Заполненных значений: 282
Вставляем значения для [1mSGOT[0m, опираясь на [1mBilirubin, Cholesterol, Copper[0m
Заполненных значений: 312
Вставляем значения для [1mTryglicerides[0m, опираясь на [1mBilirubin, Copper, Cholesterol[0m
Заполненных значений: 282
Вставляем значения для [1mPlatelets[0m, опираясь на [1mCholesterol, N_Days, Alk_Phos[0m
Заполненных значений: 407
Вставляем значения для [1mProthrombin[0m, опираясь на [1mBilirubin, Copper, Stage[0m
Заполненных значений: 410
Вставляем значения для [1mStage[0m, опираясь на [1mCopper, Bilirubin, Age[0m
Заполненных значений: 412


In [278]:
# Убедимся, что теперь отсутствуют колонки с пустыми числовыми значениями
empty_cols = df.columns[df.isna().any()].tolist()
for col in empty_cols:
    print(df[col].dtype, col)

object Drug
object Ascites
object Hepatomegaly
object Spiders


Для таких колонок просто заполним самым частовстречающимся значением, так как время=деньги

In [279]:
for col in empty_cols:
    df[col].fillna(df[col].mode().values[0], inplace=True)

In [280]:
# Убедимся, что теперь отсутствуют колонки с пустыми значениями
empty_cols = df.columns[df.isna().any()].tolist()
for col in empty_cols:
    print(df[col].dtype, col)

In [281]:
df.head()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


Для колонок с текстовыми значениями в виде классов применим категоризацию данных через LabelEncoder

In [282]:
for col in df.columns.tolist():
    if df[col].dtype == "object" and col != "Status":
        le = LabelEncoder()
        le.fit(df[col])
        df[col]=le.transform(df[col])

In [283]:
df["Status"].replace("CL", 0, inplace=True)
df["Status"].replace("C", 0, inplace=True)
df["Status"].replace("D", 1, inplace=True)

In [284]:
df.head()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,1,0,21464,0,1,1,1,2,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,0,0,20617,0,0,1,1,0,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,1,0,25594,1,0,0,0,1,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925,1,0,19994,0,0,1,1,1,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,0,1,13918,0,0,1,1,0,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


Имея все числовые данные, построим матрицу корреляции.
К счастью, для нашей колонки Status есть много колонок, обладающих корошей корреляцией по отношению к ней.

In [285]:
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
ID,1.0,-0.354305,-0.297392,-0.357086,0.037136,-0.083855,-0.182124,0.299714,-0.2896,-0.15983,-0.062154,-0.006309,-0.128924,-0.044761,-0.243185,-0.004563,-0.062497,-0.077066,-0.191074,-0.027129
N_Days,-0.354305,1.0,-0.387981,0.054742,-0.125934,-0.007386,-0.24709,-0.287832,-0.180715,-0.304634,-0.403953,-0.152908,0.430829,-0.33576,0.121517,-0.217462,-0.153452,0.138443,-0.112614,-0.36553
Status,-0.297392,-0.387981,1.0,0.006973,0.241145,0.112976,0.290693,0.220186,0.207329,0.307321,0.416818,0.182585,-0.255241,0.353523,0.226798,0.243573,0.227097,-0.105717,0.355835,0.307293
Drug,-0.357086,0.054742,0.006973,1.0,-0.158028,-0.019563,0.024685,-0.113408,0.142884,0.02011,0.074236,0.038297,0.047487,-0.009003,-0.000819,0.034087,0.046073,0.067625,0.051187,0.053381
Age,0.037136,-0.125934,0.241145,-0.158028,1.0,0.163341,0.183588,0.111522,-0.07804,0.196916,0.002362,-0.151772,-0.18235,0.037986,-0.04878,-0.138098,-0.017071,-0.144583,0.113905,0.191433
Sex,-0.083855,-0.007386,0.112976,-0.019563,0.163341,1.0,0.015873,0.016205,-0.103809,0.033229,-0.027652,0.005702,0.030351,0.212292,0.043279,0.035672,0.058491,-0.092776,0.070439,0.0151
Ascites,-0.182124,-0.24709,0.290693,0.024685,0.183588,0.015873,1.0,0.07968,0.19596,0.55208,0.334283,-0.039205,-0.315648,0.198307,0.014968,0.084979,0.188158,-0.167251,0.267943,0.21411
Hepatomegaly,0.299714,-0.287832,0.220186,-0.113408,0.111522,0.016205,0.07968,1.0,0.117692,0.113423,0.234372,0.105699,-0.269351,0.20255,0.080545,0.115169,0.124196,-0.176184,0.147576,0.359368
Spiders,-0.2896,-0.180715,0.207329,0.142884,-0.07804,-0.103809,0.19596,0.117692,1.0,0.263447,0.247932,0.061312,-0.162071,0.222992,0.045032,0.115596,0.099628,-0.103871,0.19625,0.240775
Edema,-0.15983,-0.304634,0.307321,0.02011,0.196916,0.033229,0.55208,0.113423,0.263447,1.0,0.330803,-0.078931,-0.331281,0.217889,0.029987,0.123243,0.086487,-0.205454,0.33144,0.239485


Все численные значения (кроме категориальных) нормируем индивидуально для каждой колонки, используя StandardScaler

In [286]:
for col in ["N_Days", "Age", "Bilirubin", "Cholesterol", "Albumin", "Copper", "Alk_Phos", "SGOT", "Tryglicerides", "Platelets", "Prothrombin"]:
    scaler = StandardScaler()
    df[col] = scaler.fit_transform(df[col].to_numpy().reshape(-1, 1)).reshape(-1)

In [287]:
df.head()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,-1.375612,1,0,0.768941,0,1,1,1,2,2.562152,-0.488044,-2.114296,0.723999,-0.11854,0.292738,0.880896,-0.689611,1.441539,4.0
1,2,2.340341,0,0,0.546706,0,0,1,1,0,-0.481759,-0.288171,1.513818,-0.556211,2.845083,-0.174052,-0.585051,-0.372008,-0.129341,3.0
2,3,-0.820938,1,0,1.852567,1,0,0,0,1,-0.413611,-0.902416,-0.041088,1.401758,-0.746055,-0.5069,-1.160959,-1.089176,1.245179,4.0
3,4,0.006542,1,0,0.383244,0,0,1,1,1,-0.322748,-0.570918,-2.255651,-0.4307,2.180502,-1.184634,-0.515244,-0.761328,-0.423881,4.0
4,5,-0.375023,0,1,-1.210972,0,0,1,1,0,0.040704,-0.400295,0.076708,0.560835,-0.665136,-0.181121,-0.864279,-1.242855,0.165199,3.0


Так как задачей является определение выживаемости пациента, в качестве Y мы возьмём колонку "Status".

In [288]:
X = df.loc[:, df.columns != "Status"].to_numpy()
Y = df.loc[:, "Status"].to_numpy()
print(X.shape)
print(Y.shape)

(418, 19)
(418,)


In [289]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(334, 19)
(84, 19)
(334,)
(84,)


## LogisticRegression()

score: 0.798

In [290]:
model = LogisticRegression()

In [291]:
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [292]:
y_pred = model.predict(x_test)

In [293]:
accuracy_score(y_test, y_pred)

0.7976190476190477

## RandomForestClassifier()
score: 0.845

In [315]:
model = RandomForestClassifier()

In [316]:
model.fit(x_train, y_train)

RandomForestClassifier()

In [317]:
y_pred = model.predict(x_test)

In [318]:
accuracy_score(y_test, y_pred)

0.8452380952380952

## XGBoost
score: 0.916

In [320]:
from xgboost import XGBClassifier

In [321]:
model = XGBClassifier()
model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [323]:
y_pred = model.predict(x_test)

In [324]:
accuracy_score(y_test, y_pred)

0.9166666666666666