In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
pd.options.display.max_rows = 100

In [3]:
characters = pd.read_csv("character-predictions_pose.csv")

In [4]:
characters.columns

Index(['S.No', 'plod', 'name', 'title', 'male', 'culture', 'dateOfBirth',
       'DateoFdeath', 'mother', 'father', 'heir', 'house', 'spouse', 'book1',
       'book2', 'book3', 'book4', 'book5', 'isAliveMother', 'isAliveFather',
       'isAliveHeir', 'isAliveSpouse', 'isMarried', 'isNoble', 'age',
       'numDeadRelations', 'boolDeadRelations', 'isPopular', 'popularity',
       'isAlive'],
      dtype='object')

In [5]:
characters = characters[['name', 'male', 'age', 'culture', 'house', 'isAlive', 'isNoble']]
characters = characters[~characters.isnull().any(axis=1)]
characters.reset_index(drop = True, inplace = True)

In [6]:
characters.loc[characters.isAlive == 0].shape

(64, 7)

# make df: survival rates for different ages

In [7]:
def count_alive(df):
    count_Alive = []
    for num in df.groupby('age').sum().index:
        count = df.loc[df.age == num].isAlive.sum()
        count_Alive.append(count)
    return count_Alive

In [8]:
count_alive = count_alive(characters)

In [9]:
count_total = characters.groupby('age').count().name.to_list()

In [10]:
age = pd.DataFrame(count_alive)
age.rename(columns={0: "alive"}, inplace = True)
age['total'] = count_total
age['perc_alive'] = np.divide(count_alive, count_total)
age = age.sort_values(['perc_alive'], ascending=[False])

In [11]:
age

Unnamed: 0,alive,total,perc_alive
25,2,2,1.0
59,1,1,1.0
22,4,4,1.0
23,2,2,1.0
24,1,1,1.0
50,1,1,1.0
26,2,2,1.0
29,3,3,1.0
58,1,1,1.0
18,1,1,1.0


In [12]:
characters = characters[~characters.isnull().any(axis=1)]
characters.reset_index(drop = True, inplace = True)
characters.head()

Unnamed: 0,name,male,age,culture,house,isAlive,isNoble
0,Walder Frey,1,97.0,Rivermen,House Frey,1,1
1,Sylva Santagar,0,29.0,Dornish,House Santagar,1,1
2,Valarr Targaryen,1,26.0,Valyrian,House Targaryen,0,1
3,Wex Pyke,1,19.0,Ironborn,House Botley,1,0
4,Timett,1,27.0,Vale mountain clans,Burned Men,1,1


# functions for testing

In [13]:
def clean(data):
    return data

In [14]:
clean_data = clean(characters)

In [15]:
def test_person(name, formula, data):
    train = data.loc[clean_data['name'] != name]
    person = data.loc[clean_data['name'] == name]
    model = smf.logit(formula = formula, data = clean_data).fit()
    prediction = model.predict(person)
    return int(prediction) == person['isAlive']

In [16]:
def test(formula, data, num_examples = 433, loss_func = None):
    accurate_counter = 0
    for index, row in clean_data.iterrows():
        accuracy = test_person(row['name'], formula, data)
        if accuracy.iloc[0]:
            accurate_counter += 1
        if index >= num_examples:
            break
    if loss_func is None:
        return accurate_counter
    else:
        return loss_func(accurate_counter)

In [17]:
def test(formula, data, num_examples = 433, loss_func = None):
    accurate_counter = 0
    for index, row in clean_data.iterrows():
        accuracy = test_person(row['name'], formula, data)
        if accuracy.iloc[0]:
            accurate_counter += 1
        if index >= num_examples:
            break
    if loss_func is None:
        return accurate_counter
    else:
        return loss_func(accurate_counter)

# test 1: 0, middle, 1

In [18]:
perc_zero = list(age.loc[age.perc_alive == 0].index.values)
perc_zero

[53, 56, 55, 0, 47, 44, 43, 42, 40, 39, 37, 34, 1, 19, 2, 62]

In [19]:
perc_one = list(age.loc[age.perc_alive ==1].index.values)

In [20]:
characters['age_binned'] = characters['age'].apply(lambda x: 
                                                   '0' if x in perc_zero
                                                   else '1' if x in perc_one
                                                   else '(0,1)')

In [21]:
test('isAlive~age_binned+male+isNoble', characters)

Optimization terminated successfully.
         Current function value: 0.665481
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.665481
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.665481
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.665481
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.665481
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.665481
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.665481
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.665481
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.665481
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.665481
  

64

In [22]:
model = smf.logit(formula = 'isAlive~age_binned', data = characters).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.677983
         Iterations 4


0,1,2,3
Dep. Variable:,isAlive,No. Observations:,153.0
Model:,Logit,Df Residuals:,150.0
Method:,MLE,Df Model:,2.0
Date:,"Sat, 25 Feb 2023",Pseudo R-squ.:,0.002582
Time:,16:40:25,Log-Likelihood:,-103.73
converged:,True,LL-Null:,-104.0
Covariance Type:,nonrobust,LLR p-value:,0.7645

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.3947,0.232,1.698,0.089,-0.061,0.850
age_binned[T.0],-0.3205,0.450,-0.713,0.476,-1.202,0.561
age_binned[T.1],-0.0231,0.372,-0.062,0.951,-0.752,0.706


# test 2: 0, halves, 1

In [23]:
perc_zero = list(age.loc[age.perc_alive == 0].index.values)
perc_zero

[53, 56, 55, 0, 47, 44, 43, 42, 40, 39, 37, 34, 1, 19, 2, 62]

In [24]:
perc_below50 = list(age.loc[((age.perc_alive > 0) & (age.perc_alive < .5))].index.values)

In [25]:
perc_above50 = list(age.loc[((age.perc_alive >= .5) & (age.perc_alive < 1))].index.values)

In [26]:
characters['age_binned'] = characters['age'].apply(lambda x: 
                                                   '0' if x in perc_zero
                                                   else '(0, .5)' if x in perc_below50
                                                   else '(.5, 1)' if x in perc_above50
                                                   else '1')

In [27]:
characters

Unnamed: 0,name,male,age,culture,house,isAlive,isNoble,age_binned
0,Walder Frey,1,97.0,Rivermen,House Frey,1,1,1
1,Sylva Santagar,0,29.0,Dornish,House Santagar,1,1,1
2,Valarr Targaryen,1,26.0,Valyrian,House Targaryen,0,1,1
3,Wex Pyke,1,19.0,Ironborn,House Botley,1,0,0
4,Timett,1,27.0,Vale mountain clans,Burned Men,1,1,"(0, .5)"
...,...,...,...,...,...,...,...,...
148,Sarella Sand,0,25.0,Dornishmen,House Martell,1,0,1
149,Rhaegar Targaryen,1,24.0,Valyrian,House Targaryen,0,1,1
150,Loras Tyrell,1,23.0,The Reach,House Tyrell,1,1,1
151,Gormond Goodbrother,1,23.0,Ironborn,House Goodbrother,1,0,1


In [28]:
model = smf.logit(formula = 'isAlive~age_binned', data = characters).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.664366
         Iterations 5


0,1,2,3
Dep. Variable:,isAlive,No. Observations:,153.0
Model:,Logit,Df Residuals:,149.0
Method:,MLE,Df Model:,3.0
Date:,"Sat, 25 Feb 2023",Pseudo R-squ.:,0.02261
Time:,16:40:26,Log-Likelihood:,-101.65
converged:,True,LL-Null:,-104.0
Covariance Type:,nonrobust,LLR p-value:,0.1948

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.7673,0.336,2.286,0.022,0.109,1.425
"age_binned[T.(0, .5)]",0.4855,0.869,0.559,0.576,-1.218,2.189
age_binned[T.0],-0.6931,0.511,-1.357,0.175,-1.694,0.308
age_binned[T.1],-0.6619,0.407,-1.627,0.104,-1.459,0.135


In [29]:
test('isAlive~age_binned+male+isNoble', characters)

Optimization terminated successfully.
         Current function value: 0.654848
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.654848
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.654848
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.654848
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.654848
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.654848
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.654848
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.654848
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.654848
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.654848
  

64

# test 3: 0, 0-.5, .5, .5-.7, .7-.8, .8-1, 1

In [30]:
perc_50 = list(age.loc[age.perc_alive == .5].index.values)

In [31]:
perc_5070 = list(age.loc[((age.perc_alive > .5) & (age.perc_alive < .7))].index.values)

In [32]:
perc_7080 = list(age.loc[((age.perc_alive > .7) & (age.perc_alive < .8))].index.values)

In [33]:
perc_801 = list(age.loc[((age.perc_alive > .8) & (age.perc_alive < 1))].index.values)

In [34]:
characters['age_binned'] = characters['age'].apply(lambda x: 
                                                   '0' if x in perc_zero
                                                   else '(0, .5)' if x in perc_below50
                                                   else '.5' if x in perc_50
                                                   else '(.5, .7)' if x in perc_5070
                                                   else '(.7, .8)' if x in perc_7080
                                                   else '(.8, 1)' if x in perc_801
                                                   else '1')

In [35]:
characters

Unnamed: 0,name,male,age,culture,house,isAlive,isNoble,age_binned
0,Walder Frey,1,97.0,Rivermen,House Frey,1,1,1
1,Sylva Santagar,0,29.0,Dornish,House Santagar,1,1,1
2,Valarr Targaryen,1,26.0,Valyrian,House Targaryen,0,1,1
3,Wex Pyke,1,19.0,Ironborn,House Botley,1,0,0
4,Timett,1,27.0,Vale mountain clans,Burned Men,1,1,"(0, .5)"
...,...,...,...,...,...,...,...,...
148,Sarella Sand,0,25.0,Dornishmen,House Martell,1,0,1
149,Rhaegar Targaryen,1,24.0,Valyrian,House Targaryen,0,1,1
150,Loras Tyrell,1,23.0,The Reach,House Tyrell,1,1,1
151,Gormond Goodbrother,1,23.0,Ironborn,House Goodbrother,1,0,1


In [36]:
model = smf.logit(formula = 'isAlive~age_binned+male+isNoble', data = characters).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.641895
         Iterations 5


0,1,2,3
Dep. Variable:,isAlive,No. Observations:,153.0
Model:,Logit,Df Residuals:,145.0
Method:,MLE,Df Model:,7.0
Date:,"Sat, 25 Feb 2023",Pseudo R-squ.:,0.05567
Time:,16:40:27,Log-Likelihood:,-98.21
converged:,True,LL-Null:,-104.0
Covariance Type:,nonrobust,LLR p-value:,0.1153

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.0286,0.902,1.140,0.254,-0.740,2.797
"age_binned[T.(.7, .8)]",-1.3842,1.520,-0.911,0.362,-4.363,1.595
"age_binned[T.(0, .5)]",0.8216,1.202,0.683,0.494,-1.534,3.177
age_binned[T..5],0.9019,1.049,0.860,0.390,-1.155,2.958
age_binned[T.0],-0.4163,0.965,-0.431,0.666,-2.308,1.475
age_binned[T.1],-0.2987,0.912,-0.327,0.743,-2.087,1.490
male,-0.2355,0.370,-0.636,0.525,-0.961,0.491
isNoble,-0.6082,0.381,-1.595,0.111,-1.355,0.139


In [37]:
test('isAlive~age_binned+male+isNoble', characters)

Optimization terminated successfully.
         Current function value: 0.641895
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.641895
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.641895
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.641895
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.641895
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.641895
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.641895
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.641895
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.641895
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.641895
  

64