In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
import time

In [2]:
df = pd.read_csv(r'../character-predictions.csv')

In [3]:
df.head(5)

Unnamed: 0,S.No,actual,pred,alive,plod,name,title,male,culture,dateOfBirth,...,isAliveHeir,isAliveSpouse,isMarried,isNoble,age,numDeadRelations,boolDeadRelations,isPopular,popularity,isAlive
0,1,0,0,0.054,0.946,Viserys II Targaryen,,1,,,...,0,Unknown,0,0,,11,1,1,0.605351,0
1,2,1,0,0.387,0.613,Walder Frey,Lord of the Crossing,1,Rivermen,208.0,...,Unknown,1,1,1,97.0,1,1,1,0.896321,1
2,3,1,0,0.493,0.507,Addison Hill,Ser,1,,,...,Unknown,Unknown,0,1,,0,0,0,0.267559,1
3,4,0,0,0.076,0.924,Aemma Arryn,Queen,0,,82.0,...,Unknown,0,1,1,23.0,0,0,0,0.183946,0
4,5,1,1,0.617,0.383,Sylva Santagar,Greenstone,0,Dornish,276.0,...,Unknown,1,1,1,29.0,0,0,0,0.043478,1


In [4]:
df.loc[df.culture.isna(), 'culture'] = 'Unkown'
df['culture'].values

array(['Unkown', 'Rivermen', 'Unkown', ..., 'Unkown', 'Unkown', 'Unkown'],
      dtype=object)

In [5]:
#Going to clean the titles into an easy binary field
df['hasTitle'] = np.where(df['title'] != 'None', 1, 0) 
#Convert these guys to categorical
df['culture'] = df['culture'].astype('category')
df['culture_codes'] = df['culture'].cat.codes

df['isAliveHeir'] = df['isAliveHeir'].astype('category')
df['isAliveHeir_codes'] = df['isAliveHeir'].cat.codes

df['isAliveSpouse'] = df['isAliveSpouse'].astype('category')
df['isAliveSpouse_codes'] = df['isAliveSpouse'].cat.codes

#Make birthday into hasBirthday?
df['hasBirthday'] = np.where(df['dateOfBirth'] == np.nan, 0, 1)

In [6]:
df.head(10)

Unnamed: 0,S.No,actual,pred,alive,plod,name,title,male,culture,dateOfBirth,...,numDeadRelations,boolDeadRelations,isPopular,popularity,isAlive,hasTitle,culture_codes,isAliveHeir_codes,isAliveSpouse_codes,hasBirthday
0,1,0,0,0.054,0.946,Viserys II Targaryen,,1,Unkown,,...,11,1,1,0.605351,0,0,50,0,2,1
1,2,1,0,0.387,0.613,Walder Frey,Lord of the Crossing,1,Rivermen,208.0,...,1,1,1,0.896321,1,1,41,2,1,1
2,3,1,0,0.493,0.507,Addison Hill,Ser,1,Unkown,,...,0,0,0,0.267559,1,1,50,2,2,1
3,4,0,0,0.076,0.924,Aemma Arryn,Queen,0,Unkown,82.0,...,0,0,0,0.183946,0,1,50,2,0,1
4,5,1,1,0.617,0.383,Sylva Santagar,Greenstone,0,Dornish,276.0,...,0,0,0,0.043478,1,1,10,2,1,1
5,6,1,0,0.021,0.979,Tommen Baratheon,,1,Unkown,,...,5,1,1,1.0,1,0,50,1,2,1
6,7,0,0,0.014,0.986,Valarr Targaryen,Hand of the King,1,Valyrian,183.0,...,0,0,1,0.431438,0,1,54,2,1,1
7,8,0,0,0.036,0.964,Viserys I Targaryen,,1,Unkown,,...,5,1,1,0.67893,0,0,50,1,2,1
8,9,0,1,0.724,0.276,Wilbert,Ser,1,Unkown,,...,0,0,0,0.006689,0,1,50,2,2,1
9,10,1,0,0.391,0.609,Wilbert Osgrey,Ser,1,Unkown,,...,0,0,0,0.020067,1,1,50,2,2,1


In [7]:
data = df[['hasTitle', 
          'male',
          'hasBirthday',
          'numDeadRelations',
          'isPopular',
          'hasTitle',
          'culture_codes',
          'isAliveHeir_codes',
          'isAliveSpouse_codes',
          'isAlive']]

x_train, x_test = train_test_split(data, test_size=0.5, random_state=int(time.time()))

In [13]:
gnb = GaussianNB()
features = ['hasTitle', 
          'male',
          'hasBirthday',
          'numDeadRelations',
          'isPopular',
          'hasTitle',
          'culture_codes',
          'isAliveHeir_codes',
          'isAliveSpouse_codes']

gnb.fit(
    x_train[features].values,
    x_train['isAlive']
)

y_prob = gnb.predict_proba(data[features])
y_pred = gnb.predict(data[features])

In [15]:
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          data.shape[0],
          (data["isAlive"] != y_pred).sum(),
          100*(1-(data["isAlive"] != y_pred).sum()/data.shape[0])
))

Number of mislabeled points out of a total 1946 points : 474, performance 75.64%


In [16]:
df = pd.read_csv(r'../character-predictions.csv')
p_dead, p_alive = zip(*y_prob)

In [17]:
df['prob_dead'] = pd.Series(y_prob)

NameError: name 'Series' is not defined