In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import sys
from sklearn.cross_validation import cross_val_predict

from sklearn.pipeline import Pipeline

# used for train/test splits and cross validation
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

# used to impute mean for data and standardize for computational stability
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

# logistic regression is our favourite model ever
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LassoCV

# used to calculate AUROC/accuracy
from sklearn import metrics

# used to create confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import cross_val_score

# gradient boosting - must download package https://github.com/dmlc/xgboost
#import xgboost as xgb

# Default styles for prettier plots.
# NOTE(review): the three lists look like they are indexed in parallel (one
# entry per plotted series) - confirm against the plotting cells. They are
# kept at the same length so that any index valid for `col` is valid for all.

# colour triples, one per series
col = [[0.9047, 0.1918, 0.1988],
       [0.2941, 0.5447, 0.7494],
       [0.3718, 0.7176, 0.3612],
       [1.0000, 0.5482, 0.1000],
       [0.4550, 0.4946, 0.4722],
       [0.6859, 0.4035, 0.2412],
       [0.9718, 0.5553, 0.7741],
       [0.5313, 0.3359, 0.6523]]

# marker codes; the list previously had only 7 entries, so an 8th ('x') is
# appended to match the 8 colours - existing indices are unchanged
marker = ['v', 'o', 'd', '^', 's', 'o', '+', 'x']

# line styles; entry 5 used to be 's', which is a marker code rather than a
# line style, so it is replaced by a solid line
ls = ['-', '-', '-', '-', '-', '-', '--', '--']
%matplotlib inline

from __future__ import print_function

In [2]:
# load the character table written by the got-data.ipynb notebook
# (most of it is scraped from the game of thrones tv show wiki);
# the first CSV column becomes the index
df = pd.read_csv('data/got_data_all_characters.csv', index_col=0)

# characters with no 'Season(s)' value have no data at all: report how many
# there are, then keep only the rows that do list a season
never_appeared = df['Season(s)'].isnull()
print('Never appeared... ({} characters)'.format(never_appeared.sum()))
df = df[~never_appeared]


# remove video game characters: any row whose 'Season(s)' text mentions the
# Telltale series. A boolean mask replaces the old list of positions, which
# needed a linear `i not in list` scan for every row. `regex=False` keeps the
# match a plain substring test and `na=False` treats missing values as
# "not a video game character" instead of raising.
is_video_game = df['Season(s)'].str.contains('A Telltale Games Series',
                                             regex=False, na=False)
print('\nVideo games... ({} characters)'.format(is_video_game.sum()))
df = df.loc[~is_video_game]


# drop rows that are marked 'Deceased' but list no season at all
is_deceased = df['Status'] == 'Deceased'
has_no_season = df['Season(s)'].isnull()
died_unseen = is_deceased & has_no_season
print('\nDeleting {} who died and were never in the show.'.format(died_unseen.sum()))
df = df[~died_unseen]

# drop rows that are marked 'Deceased' and whose 'Season(s)' is exactly '1';
# this filters a lot of background characters
is_deceased = df['Status'] == 'Deceased'
only_season_one = df['Season(s)'] == '1'
died_in_season_one = is_deceased & only_season_one
print('\nRemoved {} who died on/before season 1. Ned :('.format(died_in_season_one.sum()))
df = df[~died_in_season_one]

# remove all who appeared/died in the same season, i.e. rows whose
# 'Season(s)' value is exactly one season number
for s in ['1', '2', '3', '4', '5', '6']:
    # Compare element-wise on the string form of each value. The previous
    # second clause, str(df['Season(s)'].values) == s, stringified the whole
    # array at once and was therefore always False; casting each element also
    # matches a season that was parsed as a number rather than text.
    only_this_season = df['Season(s)'].astype(str) == s
    df = df.loc[~only_this_season]
    print('Removed {} characters who '.format(only_this_season.sum()) +
          'only appeared in season {}.'.format(s))

# remove the bloody random targaryens: every index label containing
# '_Targaryen', except labels that also contain 'Daenerys'
p = re.compile('_Targaryen')
idxRemove = [name for name in df.index
             if p.search(name) is not None and 'Daenerys' not in name]
print('\nRemoved {} bloody Targaryens.'.format(len(idxRemove)))
# Rebind instead of dropping in place: `df` was produced by boolean slicing
# above, and mutating such a frame in place may trigger pandas'
# SettingWithCopyWarning; rebinding also matches the filters above.
df = df.drop(idxRemove, axis=0)

# remove historical characters: rows whose 'Season(s)' value is exactly one
# of the two "Histories & Lore" labels below
historical_labels = ['Histories & Lore',
                     'Complete Guide to Westeros, Histories & Lore']
# One vectorized membership test replaces a df.loc[label, col] lookup per
# row; that lookup returns a Series (and makes the `or` raise) as soon as an
# index label is duplicated.
is_historical = df['Season(s)'].isin(historical_labels)
# kept as a plain list of index labels, as before
idxRemove = df.index[is_historical].tolist()
print('\nRemoved {} historical characters.'.format(len(idxRemove)))
# filter by mask and rebind rather than dropping in place
df = df.loc[~is_historical]

# report how many rows are left after all the filters above
n_characters = len(df)
print('\nTotal of {} characters.'.format(n_characters))

Never appeared... (18 characters)

Video games... (72 characters)

Deleting 0 who died and were never in the show.

Removed 88 who died on/before season 1. Ned :(
Removed 87 characters who only appeared in season 1.
Removed 93 characters who only appeared in season 2.
Removed 76 characters who only appeared in season 3.
Removed 101 characters who only appeared in season 4.
Removed 88 characters who only appeared in season 5.
Removed 26 characters who only appeared in season 6.

Removed 12 bloody Targaryens.

Removed 58 historical characters.

Total of 185 characters.


In [3]:
# Print a one-line status summary for every row of df: the index label
# (padded to 50 characters), whether the character is dead or alive, and the
# seasons listed for them.
for name, status, seasons in zip(df.index, df['Status'], df['Season(s)']):
    if pd.isnull(status):
        # A missing Status read from the CSV is a float NaN that is not
        # guaranteed to be the np.nan object itself, so `is np.nan` can miss
        # it and the substring tests below would then raise TypeError.
        print('{:50s} - nan'.format(name))
    elif 'Dead' in status or 'Deceased' in status:
        # The reported season of death is the last entry of the comma
        # separated season list; splitting (rather than taking the last
        # character) tolerates stray whitespace around the entries.
        listed = [part.strip() for part in seasons.split(',') if part.strip()]
        last_season = listed[-1] if listed else seasons[-1:]
        print('{:50s} - died in season {} (was in {})'.format(name, last_season, seasons))
    else:
        print('{:50s} - alive (was in {})'.format(name, seasons))

Aemon                                              - died in season 5 (was in 1, 3, 4, 5)
Alliser_Thorne                                     - alive (was in 1, 4, 5, 6)
Areo_Hotah                                         - died in season 6 (was in 5, 6)
Armeca                                             - alive (was in 1, 2)
Arwaya_Frey                                        - alive (was in 1, 3)
Arya_Stark                                         - alive (was in 1, 2, 3, 4, 5, 6)
Arys_Oakheart                                      - alive (was in 1, 2,  5)
Balon_Greyjoy                                      - alive (was in 2, 3, 6)
Balon_Swann                                        - alive (was in 2, 3,  5)
Barra                                              - died in season 2 (was in 1, 2)
Barristan_Selmy                                    - died in season 5 (was in 1, 3, 4, 5)
Benjen_Stark                                       - alive (was in 1, 6)
Beric_Dondarrion                       

In [4]:
# save the filtered character table (index included) to disk
valid_characters_path = 'data/got_data_valid_characters.csv'
df.to_csv(valid_characters_path)