In [1]:
# Import packages

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Import dataset

character_deaths = pd.read_csv("./Dataset/character-deaths.csv")
print(f"character_deaths length: {len(character_deaths)}")

character_deaths length: 917


In [3]:
# Data preprocessing
## Make label
character_deaths["Label"] = 0
character_deaths["Label"][character_deaths["Book of Death"].notnull()] = 1
dataset = character_deaths.drop(["Death Year", "Book of Death", "Death Chapter"], axis = 1)
## Fillna 0
dataset = dataset.fillna(0)
## Transfer to dummies
dataset = pd.get_dummies(dataset, columns=["Allegiances"], dtype = int)
## Drop name
dataset = dataset.drop(["Name"], axis = 1)
print(dataset)

     Book Intro Chapter  Gender  Nobility  GoT  CoK  SoS  FfC  DwD  Label  \
0              1.355440       1         1    1    1    1    1    0      0   
1              1.010617       1         1    0    0    1    0    0      1   
2             -1.156840       1         1    0    0    0    0    1      0   
3             -0.417934       1         1    0    0    0    0    1      1   
4             -1.403142       1         1    0    0    1    0    0      0   
..                  ...     ...       ...  ...  ...  ...  ...  ...    ...   
912           -0.368674       1         0    0    0    1    0    0      0   
913            0.912096       1         0    0    0    0    0    1      1   
914           -0.171632       1         1    0    0    0    0    1      1   
915            2.192866       1         0    0    0    1    0    0      1   
916            0.025409       1         1    0    0    0    1    0      1   

     Allegiances_0  ...  Allegiances_House Tully  Allegiances_House Tyrell 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  character_deaths["Label"][character_deaths["Book of Death"].notnull()] = 1


In [4]:
# Split dataset
X = dataset.drop(["Label"], axis = 1)
y = dataset["Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [11]:
# Build model
classifier = tree.DecisionTreeClassifier(max_depth=25)
classifier = classifier.fit(X_train, y_train)
# Prediction
prediction = classifier.predict(X_test)

In [12]:
# Confusion metrix
print(f"confusion_matrix: \n{confusion_matrix(y_test, prediction)}")
# Classify report
print(classification_report(y_test, prediction))

confusion_matrix: 
[[122  28]
 [ 33  47]]
              precision    recall  f1-score   support

           0       0.79      0.81      0.80       150
           1       0.63      0.59      0.61        80

    accuracy                           0.73       230
   macro avg       0.71      0.70      0.70       230
weighted avg       0.73      0.73      0.73       230



In [None]:
# Plot tree
tree.plot_tree(classifier)

In [13]:
# Make submission file for kaggle
from datetime import datetime
testData = pd.read_csv("./Dataset/test.csv")
testData = testData.fillna(0)
testData = pd.get_dummies(testData, columns=["Allegiances"], dtype=int)
character = testData["Character"]
testData = testData.drop(["Name", "Character"], axis = 1)
prediction = classifier.predict(testData)
submission = {"Character": character, "Death": prediction}
submission = pd.DataFrame(submission)
submission.to_csv(f"./Dataset/submission_{datetime.now()}.csv", index=False)
print(submission)
print("Done")

     Character  Death
0          668      1
1           30      0
2          377      1
3          535      0
4          806      1
..         ...    ...
225        259      0
226        490      0
227        302      0
228          7      0
229        891      0

[230 rows x 2 columns]
Done
