In [None]:
##################################################
### Author: Anthony Igel                       ###
### Team: Category Management Transformation   ###
### Project: Developing practical Python Tools ###
### Purpose: GoT Character Deaths              ###
### Date: 05/30/2018                           ###
##################################################

import sys
assert sys.version_info.major == 3

######################################################################
########                     Import Modules                   ########
######################################################################
import py_effo as py_effo
### pandas
# Pandas is for structured data operations and manipulations, and is used extensively for data preparation.
import pandas as pd

### numpy
# NumPy stands for Numerical Python, a library that contains basic linear algebra functions, Fourier transforms and advanced random
# number capabilities
import numpy as np

### Scipy
# SciPy performs a host of statistical calculations and is built on top of NumPy. NumPy is still imported separately above,
# because current SciPy releases no longer expose the NumPy functions through the top-level scipy namespace
# https://oneau.wordpress.com/2011/02/28/simple-statistics-with-scipy/
import scipy as sp

### sklearn
# Sklearn contains basic statistical models
# As well as a module to calculate model performance statistics
from sklearn import datasets, svm, model_selection, tree, preprocessing, metrics
from sklearn.linear_model import LogisticRegression
import sklearn.ensemble as ske

### Xgboost
#from xgboost import plot_importance
#from xgboost import XGBClassifier as XGBC

### Statsmodels
# Statsmodels provides statistical models, statistical tests and example data sets
import statsmodels.api as sm

### Seaborn
# Seaborn is a Python visualization library based on Matplotlib, providing a high-level interface for statistical graphing
# Seaborn supports numpy and pandas data structures as well as statistical routines from scipy and statsmodels
# Note: https://seaborn.pydata.org/introduction.html
import seaborn as sns

### Matplotlib
# Matplotlib is a Python based plotting library with complete 2D support and limited 3D support
%matplotlib inline
import matplotlib as mlb
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
sns.set_style("white")

### String
# Allows for more flexible solutions for dealing with string characters
import string as st

from collections import Counter

In [None]:
######################################################################
########                     Import Data                      ########
######################################################################

### Import data sets
# Both CSV files are read from the same shared folder. The folder is kept in a single
# constant so the location only has to be edited once when the notebook is run elsewhere.
DATA_DIR = "//nfs/analysis/analysis/kroger/category_management_transformation/mini_hack_days/python"

character_deaths = pd.read_csv(DATA_DIR + "/character_deaths.csv")
character_predictions = pd.read_csv(DATA_DIR + "/character-predictions.csv")

### View top 10 records of the data frame
character_predictions.head(10)

In [None]:
######################################################################
########                    Data Preparation                  ########
######################################################################

######## Column Manipulation ########

def _normalise_label(label):
    """Return a column label with surrounding whitespace removed and lower-cased."""
    return label.strip().lower()

### Tidy the headers of both frames: trim stray whitespace, then lower-case the name
character_deaths.rename(columns = _normalise_label, inplace = True)
character_predictions.rename(columns = _normalise_label, inplace = True)

######## New Attributes ########
# 1. no_of_books – row-wise sum over every column whose name starts with "book"
book_flags = [label for label in character_predictions.columns if label.startswith("book")]
character_predictions.loc[:, "no_of_books"] = character_predictions[book_flags].sum(axis = 1)

In [None]:
######################################################################
########                    Data Exploration                  ########
######################################################################

######## Is there a relationship between survival and having dead relations?
### Count records per (booldeadrelations, isalive) pair and pivot isalive into columns
data = character_predictions.groupby(["booldeadrelations", "isalive"])["s.no"].count().unstack()
### Divide each row by its own total so every stacked bar spans 0 to 1
row_share = data.div(data.sum(axis = 1), axis = 0)
p = row_share.plot.barh(stacked = True, rot = 0, width = .5)
p.set_xlim([0, 1])
p.set(yticklabels = ["No", "Yes"], xticklabels = "", xlabel = "Proportion of Dead vs. Alive", ylabel = "Has Dead Relations")
p.legend(["Dead", "Alive"])
plt.show()

######## How does appearing in more books relate to survival
### Same pivot as above, keyed on the number of books instead of the dead-relations flag
data = character_predictions.groupby(["no_of_books", "isalive"])["s.no"].count().unstack()
row_share = data.div(data.sum(axis = 1), axis = 0)
p = row_share.plot.barh(stacked = True, rot = 0, figsize = (15, 8), width = .5)
p.set(xticklabels = "", xlim = [0, 1], ylabel = "No. of Books", xlabel = "Proportion of Dead vs. Alive")
# Assigning to `_` keeps the Legend repr out of the cell output
_ = p.legend(["Dead", "Alive"], loc = "upper right", ncol = 2, borderpad = -.15)

In [None]:
######## How does culture relate to survival?
## Parent-culture lookup: each key is the over-arching culture, and its list holds the
## lower-case spellings that should be folded into that culture
cult = {
    'Summer Islands': ['summer islands', 'summer islander', 'summer isles'],
    'Ghiscari': ['ghiscari', 'ghiscaricari',  'ghis'],
    'Asshai': ["asshai'i", 'asshai'],
    'Lysene': ['lysene', 'lyseni'],
    'Andal': ['andal', 'andals'],
    'Braavosi': ['braavosi', 'braavos'],
    'Dornish': ['dornishmen', 'dorne', 'dornish'],
    'Myrish': ['myr', 'myrish', 'myrmen'],
    'Westermen': ['westermen', 'westerman', 'westerlands'],
    'Westerosi': ['westeros', 'westerosi'],
    'Stormlander': ['stormlands', 'stormlander'],
    'Norvoshi': ['norvos', 'norvoshi'],
    'Northmen': ['the north', 'northmen'],
    'Free Folk': ['wildling', 'first men', 'free folk'],
    'Qartheen': ['qartheen', 'qarth'],
    'Reach': ['the reach', 'reach', 'reachmen'],
}

### Map a raw culture label to its parent culture
def get_cult(value):
    """Return the parent culture for a raw culture label.

    Parameters
    ----------
    value : str, None or float NaN
        Raw culture label. Matching against ``cult`` is case-insensitive.

    Returns
    -------
    str
        The key of ``cult`` whose alias list contains the lower-cased label
        (first match in dictionary order). If nothing matches, the label is
        returned in title case. Missing values (``None`` or a float NaN)
        yield ``""``, the same result as an empty label.
    """
    # Treat missing values like an empty label instead of failing on .lower()
    if value is None or (isinstance(value, float) and value != value):
        return ""
    value = value.lower()
    for parent, aliases in cult.items():
        if value in aliases:
            return parent
    return value.title()

### Replace every raw culture label with its parent culture (missing labels become "")
character_predictions.loc[:, "culture"] = list(map(get_cult, character_predictions["culture"].fillna("")))

### Count records per (culture, isalive) pair and pivot isalive into columns
data = character_predictions.groupby(["culture", "isalive"])["s.no"].count().unstack()
data.loc[:, "total"] = data.sum(axis = 1)

### Leave out the empty-culture row and order the bars by the total count
with_culture = data[data.index != ""]
ranked = with_culture.sort_values("total")
p = ranked[[0, 1]].plot.barh(stacked = True, rot = 0, figsize = (14, 12),)
p.set(xlabel = "No. of Characters", ylabel = "Culture")
# Assigning to `_` keeps the Legend repr out of the cell output
_ = p.legend(["Dead", "Alive"], loc = "lower right")

In [None]:
######################################################################
########                   Data Manipulation                  ########
######################################################################

# Work on a deep copy so the encodings below never touch character_predictions
death_preds = character_predictions.copy(deep = True)

# Map culture labels to their parent culture (the original line misspelled the frame
# name as `eath_preds`, which raised a NameError)
death_preds["culture"] = [get_cult(x) for x in death_preds["culture"].fillna("")]

# Replace each text column with the integer codes from pd.factorize; missing values get -1.
# Plain column assignment is used so the column is replaced outright with the integer
# codes rather than written into the existing text column.
for column in ["title", "culture", "mother", "father", "heir", "house", "spouse"]:
    death_preds[column] = pd.factorize(death_preds[column])[0]

# Column names were lower-cased during data preparation, so the labels here must be
# lower-case too ("dateOfbirth" no longer exists and would raise a KeyError).
# `columns=` replaces the positional axis argument, which newer pandas versions reject.
death_preds = death_preds.drop(columns = ["name", "alive", "pred", "plod", "isalive", "dateofbirth"])

# Strip "." and "_" from the remaining column names (e.g. "s.no" -> "sno")
death_preds.columns = [label.replace(".", "").replace("_", "") for label in death_preds.columns]

# Any value still missing is encoded as -1
death_preds = death_preds.fillna(value = -1)


In [None]:

######################################################################
########                Predictive Environment                ########
######################################################################