# **Библиотеки**

In [None]:
!pip install catboost

In [3]:
# Импортируем необходимые библиотеки
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import joblib

# **Данные**

In [4]:
# Read the dataset from the CSV file in the working directory into a DataFrame.
data_raw = pd.read_csv(filepath_or_buffer='data.csv')

In [None]:
# Display the raw DataFrame as the cell output for a quick visual check.
data_raw

In [None]:
# Print the distinct values found in every column, skipping the columns
# listed in `columns_to_exclude`.
columns_to_exclude = ['Case_ID', 'Age_at_diagnosis']

columns_to_inspect = [
    name for name in data_raw.columns if name not in columns_to_exclude
]
for column in columns_to_inspect:
    unique_values = data_raw[column].unique()
    print(f'Уникальные значения в колонке {column}: {unique_values}')

# **EDA**

In [8]:
# Encode the mutation-status columns numerically:
# 'NOT_MUTATED' -> 0, 'MUTATED' -> 1. Note that `data_raw` is modified in place.
replacement_dict = {'NOT_MUTATED': 0, 'MUTATED': 1}
columns_to_replace = [
    'IDH1', 'TP53', 'ATRX', 'PTEN', 'EGFR',
    'CIC', 'MUC16', 'PIK3CA', 'NF1', 'PIK3R1',
    'FUBP1', 'RB1', 'NOTCH1', 'BCOR', 'CSMD3',
    'SMARCA4', 'GRIN2A', 'IDH2', 'FAT4', 'PDGFRA',
]

for column in columns_to_replace:
    encoded_column = data_raw[column].replace(to_replace=replacement_dict)
    data_raw[column] = encoded_column

In [9]:
# Work on an independent copy: a plain `data = data_raw` would only create a
# second name for the same object, so any later change to `data` would also
# change `data_raw`.
data = data_raw.copy()

In [10]:
# Display the working DataFrame as the cell output (mutation columns are 0/1 at this point).
data

Unnamed: 0,Grade,Project,Case_ID,Gender,Age_at_diagnosis,Primary_Diagnosis,Race,IDH1,TP53,ATRX,...,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA
0,LGG,TCGA-LGG,TCGA-DU-8164,Male,51 years 108 days,"Oligodendroglioma, NOS",white,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,LGG,TCGA-LGG,TCGA-QH-A6CY,Male,38 years 261 days,Mixed glioma,white,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,LGG,TCGA-LGG,TCGA-HW-A5KM,Male,35 years 62 days,"Astrocytoma, NOS",white,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,LGG,TCGA-LGG,TCGA-E1-A7YE,Female,32 years 283 days,"Astrocytoma, anaplastic",white,1,1,1,...,0,0,0,0,0,0,0,0,1,0
4,LGG,TCGA-LGG,TCGA-S9-A6WG,Male,31 years 187 days,"Astrocytoma, anaplastic",white,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,GBM,TCGA-GBM,TCGA-19-5959,Female,77 years 325 days,Glioblastoma,white,0,0,0,...,0,0,0,0,0,0,0,0,0,0
858,GBM,TCGA-GBM,TCGA-16-0846,Male,85 years 65 days,Glioblastoma,white,0,1,0,...,0,0,0,0,0,0,0,0,0,0
859,GBM,TCGA-GBM,TCGA-28-1746,Female,77 years 178 days,Glioblastoma,white,0,1,0,...,0,0,0,0,0,0,0,0,0,0
860,GBM,TCGA-GBM,TCGA-32-2491,Male,63 years 121 days,Glioblastoma,white,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [11]:
# Split the data into training and test sets: 30% of the rows go to the
# test set, and the seed is fixed so the split is reproducible.
feature_columns = [
    'IDH1', 'TP53', 'ATRX', 'PTEN', 'EGFR', 'CIC', 'MUC16', 'PIK3CA', 'NF1', 'PIK3R1',
    'FUBP1', 'RB1', 'NOTCH1', 'BCOR', 'CSMD3', 'SMARCA4', 'GRIN2A', 'IDH2', 'FAT4', 'PDGFRA',
]
features = data[feature_columns]  # model inputs: the mutation-status columns
target = data['Grade']            # label to predict

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=10,
)

# **LogisticRegression**

In [12]:
# Build the baseline model (logistic regression with default settings)
# and fit it on the training split.
baseline_model = LogisticRegression()
baseline_model.fit(X=X_train, y=y_train)

In [13]:
# Evaluate the baseline model on the test split.
baseline_predictions = baseline_model.predict(X_test)
# NOTE: despite its name, this variable holds the full text classification
# report (precision / recall / f1 per class), not a single accuracy value.
baseline_accuracy = classification_report(
    y_true=y_test,
    y_pred=baseline_predictions,
)
print(baseline_accuracy)

              precision    recall  f1-score   support

         GBM       0.80      0.96      0.87       112
         LGG       0.96      0.82      0.88       147

    accuracy                           0.88       259
   macro avg       0.88      0.89      0.88       259
weighted avg       0.89      0.88      0.88       259



# **CatBoost**

In [None]:
# Build a CatBoost classifier with default settings and fit it on the
# training split.
catboost_model = CatBoostClassifier()
catboost_model.fit(X=X_train, y=y_train)

In [15]:
# Evaluate the CatBoost model on the test split.
catboost_predictions = catboost_model.predict(X_test)
# NOTE: despite its name, this variable holds the full text classification
# report, not a single accuracy value.
catboost_accuracy = classification_report(
    y_true=y_test,
    y_pred=catboost_predictions,
)
print(catboost_accuracy)

              precision    recall  f1-score   support

         GBM       0.79      0.88      0.83       112
         LGG       0.90      0.82      0.86       147

    accuracy                           0.85       259
   macro avg       0.84      0.85      0.84       259
weighted avg       0.85      0.85      0.85       259



# **RandomForest**

In [16]:
# Build a random forest classifier and fit it on the training split.
# The seed is fixed so that the fitted forest, its metrics and the saved
# model are reproducible between runs, in line with the seeded data split
# and the seeded AdaBoost model in this notebook.
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)

In [17]:
# Evaluate the random forest model on the test split.
random_forest_predictions = random_forest_model.predict(X_test)
# NOTE: despite its name, this variable holds the full text classification
# report, not a single accuracy value.
random_forest_accuracy = classification_report(
    y_true=y_test,
    y_pred=random_forest_predictions,
)
print(random_forest_accuracy)

              precision    recall  f1-score   support

         GBM       0.77      0.82      0.80       112
         LGG       0.86      0.82      0.84       147

    accuracy                           0.82       259
   macro avg       0.82      0.82      0.82       259
weighted avg       0.82      0.82      0.82       259



# **AdaBoost**

In [18]:
# Build an AdaBoost classifier (50 estimators, fixed seed) and fit it on
# the training split.
ada_model = AdaBoostClassifier(
    n_estimators=50,
    random_state=42,
)
ada_model.fit(X=X_train, y=y_train)

# Evaluate the AdaBoost model on the test split.
ada_predictions = ada_model.predict(X_test)
# NOTE: despite its name, this variable holds the full text classification
# report, not a single accuracy value.
ada_accuracy = classification_report(
    y_true=y_test,
    y_pred=ada_predictions,
)
print(ada_accuracy)

              precision    recall  f1-score   support

         GBM       0.79      0.92      0.85       112
         LGG       0.93      0.82      0.87       147

    accuracy                           0.86       259
   macro avg       0.86      0.87      0.86       259
weighted avg       0.87      0.86      0.86       259



# **Сохранение моделей**

In [19]:
# Persist the fitted models to disk, one pickle file per model.
# NOTE(review): catboost_model is not saved here — confirm whether that is intentional.
models_to_save = {
    'baseline_model.pkl': baseline_model,
    'random_forest_model.pkl': random_forest_model,
    'ada_model.pkl': ada_model,
}
for model_path, fitted_model in models_to_save.items():
    joblib.dump(fitted_model, model_path)

['ada_model.pkl']