In [195]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [196]:
# Load the training data; the CSV's "S.No" column is used as the row index.
df = pd.read_csv("game_of_thrones_train.csv", index_col="S.No")


In [197]:
# Show the column labels of the freshly loaded training frame.
df.columns

Index(['name', 'title', 'male', 'culture', 'dateOfBirth', 'mother', 'father',
       'heir', 'house', 'spouse', 'book1', 'book2', 'book3', 'book4', 'book5',
       'isAliveMother', 'isAliveFather', 'isAliveHeir', 'isAliveSpouse',
       'isMarried', 'isNoble', 'age', 'numDeadRelations', 'popularity',
       'isAlive'],
      dtype='object')

In [198]:
# All column labels of the raw frame. At this point the list still contains the
# target column 'isAlive'; FEATURES is reassigned from X_train.columns further
# down, before the model is fitted.
FEATURES = list(df.columns)

In [199]:

# Canonical culture name -> list of lower-case spellings that should collapse
# into it. prepare() inverts this mapping and looks up the lower-cased value of
# the 'culture' column; spellings not listed here are mapped to 'no'.
# NOTE(review): 'ghiscaricari' looks like a typo — confirm against the raw data
# before changing it, since the lookup is an exact string match.
CULT = {
    'Summer Islands': ['summer islands', 'summer islander', 'summer isles'],
    'Ghiscari': ['ghiscari', 'ghiscaricari', 'ghis'],
    'Asshai': ["asshai'i", 'asshai'],
    'Lysene': ['lysene', 'lyseni'],
    'Andal': ['andal', 'andals'],
    'Braavosi': ['braavosi', 'braavos'],
    'Dornish': ['dornishmen', 'dorne', 'dornish'],
    'Myrish': ['myr', 'myrish', 'myrmen'],
    'Westermen': ['westermen', 'westerman', 'westerlands'],
    'Westerosi': ['westeros', 'westerosi'],
    'Stormlander': ['stormlands', 'stormlander'],
    'Norvoshi': ['norvos', 'norvoshi'],
    'Northmen': ['the north', 'northmen'],
    'Free Folk': ['wildling', 'first men', 'free folk'],
    'Qartheen': ['qartheen', 'qarth'],
    'Reach': ['the reach', 'reach', 'reachmen'],
}

In [200]:
# Columns that prepare() removes from the frame before it is returned.
DROP_COL = [
    # identity / relation / affiliation columns
    'name',
    'mother',
    'father',
    'culture',
    'heir',
    'house',
    'spouse',
    # raw numerics; prepare() derives boolDeadRelations / isPopular from them
    'numDeadRelations',
    'popularity',
    # relation-status and social-status flags
    'isAliveMother',
    'isAliveFather',
    'isAliveHeir',
    'isAliveSpouse',
    'isMarried',
    'isNoble',
    # derived inside prepare() from 'culture'
    'culture_short',
]

In [201]:
def prepare(df1: pd.DataFrame) -> pd.DataFrame:
    """Return a feature-engineered copy of ``df1``; the input frame is not modified.

    Steps, in order:
      * missing ``title`` values are replaced by the literal ``'People'``;
      * ``boolDeadRelations`` is True where ``numDeadRelations > 0``;
      * ``isPopular`` is True where ``popularity >= 0.1``;
      * ``culture_short`` is the canonical name from ``CULT`` for the
        lower-cased ``culture`` value (``'no'`` for spellings not listed,
        NaN where ``culture`` is missing);
      * ``isAliveMother`` is cast to the nullable ``Int64`` dtype;
      * every column listed in ``DROP_COL`` is dropped.

    With the ``DROP_COL`` list defined in this notebook, ``culture_short`` and
    ``isAliveMother`` are dropped again before returning; they are still
    computed so that editing ``DROP_COL`` is enough to keep them.

    Raises:
        KeyError: if a column read here or named in ``DROP_COL`` is absent,
            e.g. when called on a frame that has already been prepared.
    """
    # Invert CULT (canonical -> spellings) into spelling -> canonical.
    canonical_by_alias = {
        alias: canonical
        for canonical, aliases in CULT.items()
        for alias in aliases
    }

    def shorten(culture):
        # Keep missing values missing; unknown spellings collapse to 'no'.
        if pd.isna(culture):
            return np.nan
        return canonical_by_alias.get(culture.lower(), 'no')

    out = df1.copy()
    out['title'] = out['title'].fillna('People')
    out['boolDeadRelations'] = out['numDeadRelations'] > 0
    out['isPopular'] = out['popularity'] >= 0.1
    out['culture_short'] = out['culture'].apply(shorten)
    out['isAliveMother'] = out['isAliveMother'].astype(pd.Int64Dtype())

    # Non-inplace drop: returns a new frame and leaves `out` untouched.
    return out.drop(columns=DROP_COL)

In [202]:
# Columns handed to CatBoost as categorical features.
CAT_FEATURES = [
    'title',
    'male',
    'book1',
    'book2',
    'book3',
    'book4',
    'book5',
]

In [203]:
# Replace the raw training frame with its prepared version.
# NOTE: this rebinds `df`, so the cell cannot be re-run on its own — prepare()
# reads columns (e.g. 'numDeadRelations') that it also drops. Reload the CSV first.
df = prepare(df)

In [204]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible. 'isAlive' is the target, everything else is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='isAlive'),
    df['isAlive'],
    test_size=0.2,
    random_state=42,
)

In [205]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score

In [206]:
# Create the CatBoost model
model = CatBoostClassifier(iterations=100,  # Number of iterations (trees)
                           learning_rate=0.1,  # Learning rate
                           depth=6,  # Tree depth
                           loss_function='Logloss',  # Loss function
                           verbose=10)  # Print training progress information

# Model training

In [207]:
# Column labels of the training matrix, in order.
FEATURES = list(X_train.columns)
# Position of every categorical column within FEATURES.
CAT_FEATURES_INDICES = list(map(FEATURES.index, CAT_FEATURES))

In [208]:
# Fit on the training split; categorical columns are passed by position and
# plot=True requests CatBoost's live training chart.
model.fit(
    X_train,
    y_train,
    cat_features=CAT_FEATURES_INDICES,
    plot=True,
)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6587779	total: 54.2ms	remaining: 5.37s
10:	learn: 0.4840062	total: 319ms	remaining: 2.58s
20:	learn: 0.4336779	total: 573ms	remaining: 2.16s
30:	learn: 0.4095095	total: 850ms	remaining: 1.89s
40:	learn: 0.3988885	total: 1.1s	remaining: 1.59s
50:	learn: 0.3918145	total: 1.37s	remaining: 1.31s
60:	learn: 0.3857648	total: 1.6s	remaining: 1.02s
70:	learn: 0.3795815	total: 1.88s	remaining: 767ms
80:	learn: 0.3731303	total: 2.12s	remaining: 498ms
90:	learn: 0.3683385	total: 2.37s	remaining: 235ms
99:	learn: 0.3619093	total: 2.6s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x19c171c8a10>

In [209]:
# Step 3. Predict on the held-out test split
y_pred = model.predict(X_test)

# Step 4. Evaluate the predictions with the accuracy metric
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy : %.4f" % accuracy)

Accuracy : 0.8109


In [210]:
# Load the submission (test) data; the CSV's "S.No" column is used as the row index.
df_test = pd.read_csv("game_of_thrones_test.csv", index_col="S.No")

In [211]:
# Apply the same preparation as for the training frame.
# NOTE: this rebinds `df_test`, so the cell cannot be re-run on its own —
# prepare() reads columns that it also drops. Reload the CSV first.
df_test = prepare(df_test)

In [212]:
# Predict from exactly the columns the model was trained on, in training order.
# Selecting them explicitly keeps this cell re-runnable: after the first run
# df_test already holds 'isAlive', which must not be fed back in as a feature.
# A training column missing from the test frame raises KeyError here.
df_test['isAlive'] = model.predict(df_test[X_train.columns])

In [193]:
# Write the predictions (index plus the 'isAlive' column) as the submission file.
# NOTE(review): this cell's execution count (193) is lower than that of the cells
# above it, so the saved file may predate the current model — re-run before submitting.
df_test[['isAlive']].to_csv('submission1.csv')