In [56]:
import pandas as pd
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [57]:
deaths = pd.read_csv("character-deaths.csv")
characters = pd.read_csv("character-predictions_pose.csv")

In [58]:
characters.columns

Index(['S.No', 'plod', 'name', 'title', 'male', 'culture', 'dateOfBirth',
       'DateoFdeath', 'mother', 'father', 'heir', 'house', 'spouse', 'book1',
       'book2', 'book3', 'book4', 'book5', 'isAliveMother', 'isAliveFather',
       'isAliveHeir', 'isAliveSpouse', 'isMarried', 'isNoble', 'age',
       'numDeadRelations', 'boolDeadRelations', 'isPopular', 'popularity',
       'isAlive'],
      dtype='object')

# dataframe cleaning

In [59]:
# selected columns I think might be relevant
characters = pd.read_csv("character-predictions_pose.csv")
characters = characters.loc[:,['name', 'male', 'house', 'isNoble', 'age', 'numDeadRelations', 'popularity', 'isAlive', 'culture']]

# deleted all observations that have NaNs
characters = characters[~characters.isnull().any(axis=1)]
characters.reset_index(inplace = True, drop = True)
characters

Unnamed: 0,name,male,house,isNoble,age,numDeadRelations,popularity,isAlive,culture
0,Walder Frey,1,House Frey,1,97.0,1,0.896321,1,Rivermen
1,Sylva Santagar,0,House Santagar,1,29.0,0,0.043478,1,Dornish
2,Valarr Targaryen,1,House Targaryen,1,26.0,0,0.431438,0,Valyrian
3,Wex Pyke,1,House Botley,0,19.0,0,0.113712,1,Ironborn
4,Timett,1,Burned Men,1,27.0,0,0.073579,1,Vale mountain clans
...,...,...,...,...,...,...,...,...,...
148,Sarella Sand,0,House Martell,0,25.0,1,0.103679,1,Dornishmen
149,Rhaegar Targaryen,1,House Targaryen,1,24.0,11,0.799331,0,Valyrian
150,Loras Tyrell,1,House Tyrell,1,23.0,2,0.665552,1,The Reach
151,Gormond Goodbrother,1,House Goodbrother,0,23.0,0,0.040134,1,Ironborn


# modeling

### basic model

In [60]:
model = sm.ols(formula = 'isAlive~age+male+house+isNoble+numDeadRelations+popularity+culture', data = characters).fit()
model.rsquared

0.7182996554985103

In [61]:
model.rsquared_adj

0.47782375165577506

### cleaned cultures

In [62]:
# clean cultures 

culture_counts = pd.DataFrame(characters.culture.value_counts())

#combining culture names that mean the same thing 
characters.culture = characters.culture.replace(to_replace = "northmen", value = "Northmen")
characters.culture = characters.culture.replace(to_replace = "ironborn", value = "Ironborn")
characters.culture = characters.culture.replace(to_replace = "Ironmen", value = "Ironborn")
characters.culture = characters.culture.replace(to_replace = "Asshai'i", value = "Asshai")
characters.culture = characters.culture.replace(to_replace = "Free folk", value = "Free Folk")
characters.culture = characters.culture.replace(to_replace = "free folk", value = "Free Folk")
characters.culture = characters.culture.replace(to_replace = "Summer Islands", value = "Summer Isles")
characters.culture = characters.culture.replace(to_replace = "Summer Islander", value = "Summer Isles")
characters.culture = characters.culture.replace(to_replace = "westermen", value = "Westermen")
characters.culture = characters.culture.replace(to_replace = "Westerman", value = "Westermen")
characters.culture = characters.culture.replace(to_replace = "Westerlands", value = "Westermen")
characters.culture = characters.culture.replace(to_replace = "Vale", value = "Valemen")
characters.culture = characters.culture.replace(to_replace = "Lhazareen", value = "Lhazarene")
characters.culture = characters.culture.replace(to_replace = "The Reach", value = "Reach")
characters.culture = characters.culture.replace(to_replace = "Reachmen", value = "Reach")
characters.culture = characters.culture.replace(to_replace = "Qarth", value = "Qartheen")
characters.culture = characters.culture.replace(to_replace = "Lyseni", value = "Lysene")
characters.culture = characters.culture.replace(to_replace = "Stormlander", value = "Stormlands")
characters.culture = characters.culture.replace(to_replace = "Meereenese", value = "Meereen")
characters.culture = characters.culture.replace(to_replace = "Astapor", value = "Astapori")
characters.culture = characters.culture.replace(to_replace = "Norvos", value = "Norvoshi")
characters.culture = characters.culture.replace(to_replace = "Wildlings", value = "Wildling")
characters.culture = characters.culture.replace(to_replace = "Andals", value = "Andal")

In [63]:
model = sm.ols(formula = 'isAlive~age+male+house+isNoble+numDeadRelations+popularity+culture', data = df).fit()
model.rsquared

0.718189796242892

In [64]:
model.rsquared_adj

0.490057726534757

### clean houses

In [65]:
# attempt: code house as other if only 1 person
# result: r-squared stayed the same (.718), adj. increased (.478 to .490)

house_counts_one = house_counts.loc[house_counts.house == 1].index.values.tolist()

characters['house'] = characters['house'].apply(lambda x: 
                                              'other' if x in house_counts_one
                                              else x)

model = sm.ols(formula = 'isAlive~age+male+house+isNoble+numDeadRelations+popularity+I(popularity**2)+culture', data = characters).fit()
model.rsquared

0.7093669754072639

In [66]:
model.rsquared_adj

0.49222735933223116