In [1]:
import pandas as pd
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
deaths = pd.read_csv("character-deaths.csv")
characters = pd.read_csv("character-predictions_pose.csv")

In [3]:
characters.columns

Index(['S.No', 'plod', 'name', 'title', 'male', 'culture', 'dateOfBirth',
       'DateoFdeath', 'mother', 'father', 'heir', 'house', 'spouse', 'book1',
       'book2', 'book3', 'book4', 'book5', 'isAliveMother', 'isAliveFather',
       'isAliveHeir', 'isAliveSpouse', 'isMarried', 'isNoble', 'age',
       'numDeadRelations', 'boolDeadRelations', 'isPopular', 'popularity',
       'isAlive'],
      dtype='object')

# dataframe cleaning

In [4]:
# selected columns I think might be relevant
characters = pd.read_csv("character-predictions_pose.csv")
characters = characters.loc[:,['name', 'male', 'house', 'isNoble', 'age', 'numDeadRelations', 'popularity', 'isAlive', 'culture']]
# deleted all observations that have NaNs
characters = characters[~characters.isnull().any(axis=1)]
characters.reset_index(inplace = True, drop = True)
characters

Unnamed: 0,name,male,house,isNoble,age,numDeadRelations,popularity,isAlive,culture
0,Walder Frey,1,House Frey,1,97.0,1,0.896321,1,Rivermen
1,Sylva Santagar,0,House Santagar,1,29.0,0,0.043478,1,Dornish
2,Valarr Targaryen,1,House Targaryen,1,26.0,0,0.431438,0,Valyrian
3,Wex Pyke,1,House Botley,0,19.0,0,0.113712,1,Ironborn
4,Timett,1,Burned Men,1,27.0,0,0.073579,1,Vale mountain clans
...,...,...,...,...,...,...,...,...,...
148,Sarella Sand,0,House Martell,0,25.0,1,0.103679,1,Dornishmen
149,Rhaegar Targaryen,1,House Targaryen,1,24.0,11,0.799331,0,Valyrian
150,Loras Tyrell,1,House Tyrell,1,23.0,2,0.665552,1,The Reach
151,Gormond Goodbrother,1,House Goodbrother,0,23.0,0,0.040134,1,Ironborn


# modeling

### basic model

In [5]:
model = sm.ols(formula = 'isAlive~age+male+house+isNoble+numDeadRelations+popularity+culture', data = characters).fit()
model.rsquared

0.7182996554985103

In [6]:
model.rsquared_adj

0.47782375165577506

### cleaned cultures

In [7]:
# clean cultures 

culture_counts = pd.DataFrame(characters.culture.value_counts())

#combining culture names that mean the same thing 
characters.culture = characters.culture.replace(to_replace = "northmen", value = "Northmen")
characters.culture = characters.culture.replace(to_replace = "ironborn", value = "Ironborn")
characters.culture = characters.culture.replace(to_replace = "Ironmen", value = "Ironborn")
characters.culture = characters.culture.replace(to_replace = "Asshai'i", value = "Asshai")
characters.culture = characters.culture.replace(to_replace = "Free folk", value = "Free Folk")
characters.culture = characters.culture.replace(to_replace = "free folk", value = "Free Folk")
characters.culture = characters.culture.replace(to_replace = "Summer Islands", value = "Summer Isles")
characters.culture = characters.culture.replace(to_replace = "Summer Islander", value = "Summer Isles")
characters.culture = characters.culture.replace(to_replace = "westermen", value = "Westermen")
characters.culture = characters.culture.replace(to_replace = "Westerman", value = "Westermen")
characters.culture = characters.culture.replace(to_replace = "Westerlands", value = "Westermen")
characters.culture = characters.culture.replace(to_replace = "Vale", value = "Valemen")
characters.culture = characters.culture.replace(to_replace = "Lhazareen", value = "Lhazarene")
characters.culture = characters.culture.replace(to_replace = "The Reach", value = "Reach")
characters.culture = characters.culture.replace(to_replace = "Reachmen", value = "Reach")
characters.culture = characters.culture.replace(to_replace = "Qarth", value = "Qartheen")
characters.culture = characters.culture.replace(to_replace = "Lyseni", value = "Lysene")
characters.culture = characters.culture.replace(to_replace = "Stormlander", value = "Stormlands")
characters.culture = characters.culture.replace(to_replace = "Meereenese", value = "Meereen")
characters.culture = characters.culture.replace(to_replace = "Astapor", value = "Astapori")
characters.culture = characters.culture.replace(to_replace = "Norvos", value = "Norvoshi")
characters.culture = characters.culture.replace(to_replace = "Wildlings", value = "Wildling")
characters.culture = characters.culture.replace(to_replace = "Andals", value = "Andal")
characters.culture = characters.culture.replace(to_replace = "Braavos", value = "Braavosi")
characters.culture = characters.culture.replace(to_replace = "Dorne", value = "Dornish")
characters.culture = characters.culture.replace(to_replace = "Dornishmen", value = "Dornish")
characters.culture = characters.culture.replace(to_replace = "Ghiscaricari", value = "Ghiscari")

In [8]:
print(characters)

                    name  male              house  isNoble   age  \
0            Walder Frey     1         House Frey        1  97.0   
1         Sylva Santagar     0     House Santagar        1  29.0   
2       Valarr Targaryen     1    House Targaryen        1  26.0   
3               Wex Pyke     1       House Botley        0  19.0   
4                 Timett     1         Burned Men        1  27.0   
..                   ...   ...                ...      ...   ...   
148         Sarella Sand     0      House Martell        0  25.0   
149    Rhaegar Targaryen     1    House Targaryen        1  24.0   
150         Loras Tyrell     1       House Tyrell        1  23.0   
151  Gormond Goodbrother     1  House Goodbrother        0  23.0   
152       Laena Velaryon     0     House Velaryon        0  27.0   

     numDeadRelations  popularity  isAlive              culture  
0                   1    0.896321        1             Rivermen  
1                   0    0.043478        1         

In [9]:
#getting survival rates per culture 
culture_counts = pd.DataFrame(characters.culture.value_counts()).reset_index()
survival_counts = pd.DataFrame(characters.groupby(['culture']).isAlive.value_counts())
survival_counts.rename({'isAlive':'count_survived'}, axis='columns', inplace=True)
survival_counts = pd.DataFrame(survival_counts.to_records())
#for lysene and rivermen, no one survived — this will be taken into account later and for now they will be dropped
survival_counts = survival_counts[survival_counts['isAlive'] == 1] 
survival_counts.drop(columns=["isAlive"], inplace=True)
culture_counts = culture_counts.rename(columns={'index':'culture', "culture" : "total"})
survival_df = culture_counts.merge(survival_counts)
survival_df['percent_survived'] = (survival_df['count_survived']/survival_df['total'])*100
survival_df = survival_df.sort_values(by=['percent_survived'], ascending=False)
display(survival_df)

Unnamed: 0,culture,total,count_survived,percent_survived
9,Dothraki,3,3,100.0
10,Reach,3,3,100.0
17,Vale mountain clans,1,1,100.0
16,Braavosi,1,1,100.0
15,Norvoshi,1,1,100.0
14,Naathi,1,1,100.0
11,Crannogmen,3,3,100.0
18,Myrish,1,1,100.0
2,Dornish,17,15,88.235294
3,Ironborn,12,10,83.333333


In [10]:
#grouping cultures based on their survival rates
characters.culture = characters.culture.replace(to_replace = ['Crannogmen', 'Reach', 'Norvoshi', 'Naathi', 'Vale mountain clans', 'Braavosi', 'Dorne', 'Dothraki', 'Myrish', 'Dornish'], value = "all_survive")
characters.culture = characters.culture.replace(to_replace = ['Ironborn', 'Westeros', 'Dornishmen', 'Rivermen', 'Northmen'], value = "most_survive")
characters.culture = characters.culture.replace(to_replace = ['Westermen', 'Stormlands', 'Ghiscari', 'Wildling', 'Valemen'], value = "half_survive")
characters.culture = characters.culture.replace(to_replace = ['Riverlands', 'Lysene', 'Valyrian'], value = "few_survive")
culture_counts = pd.DataFrame(characters.culture.value_counts()).reset_index()
culture_counts = culture_counts.rename(columns={'index':'culture', "culture" : "count"})
display(culture_counts)

Unnamed: 0,culture,count
0,most_survive,63
1,few_survive,38
2,all_survive,31
3,half_survive,21


In [31]:
def clean(data):
    return data
clean_data = clean(characters)
def test_person(name, formula, data = characters, cutoff=.5):
    train = data.loc[characters['name'] != name]
    person = data.loc[characters['name'] == name]
    model = sm.logit(formula = formula, data=characters).fit()
    prediction = model.predict(person)
    if prediction.values + cutoff >= 1:
        return person['isAlive'].values
    print(type(person['isAlive']))
    return 0 if person['isAlive'].values==1 else 1

def test(formula, num_examples=50, loss_func=None, cutoff=.5):
    accurate_counter = 0
    for index, row in characters.iterrows():
        accuracy = test_person(row['name'], formula, data = characters, cutoff=cutoff)
        if accuracy:
            accurate_counter += 1
        if index >= num_examples:
            break
    if loss_func is None:
        return accurate_counter
    else:
        return loss_func(accurate_counter)

In [32]:
test_person(name = "Valarr Targaryen", formula = "isAlive~culture+age", data = characters)

Optimization terminated successfully.
         Current function value: 0.427990
         Iterations 7
<class 'pandas.core.series.Series'>


1

In [40]:
test("isAlive~culture+male+isNoble",num_examples=153)

Optimization terminated successfully.
         Current function value: 0.419047
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.419047
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.419047
         Iterations 7
<class 'pandas.core.series.Series'>
Optimization terminated successfully.
         Current function value: 0.419047
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.419047
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.419047
         Iterations 7
<class 'pandas.core.series.Series'>
Optimization terminated successfully.
         Current function value: 0.419047
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.419047
         Iterations 7
<class 'pandas.core.series.Series'>
Optimization terminated successfully.
         Current function value: 0.419

Optimization terminated successfully.
         Current function value: 0.419047
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.419047
         Iterations 7
<class 'pandas.core.series.Series'>
Optimization terminated successfully.
         Current function value: 0.419047
         Iterations 7
<class 'pandas.core.series.Series'>
Optimization terminated successfully.
         Current function value: 0.419047
         Iterations 7
<class 'pandas.core.series.Series'>
Optimization terminated successfully.
         Current function value: 0.419047
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.419047
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.419047
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.419047
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.419

125