<h1>Classification Analysis</h1>

<h1>A brief description of the code</h1>

The following script uses the data set Game of Thrones, GoT, to build a classification model that is able to determine if a character is alive or not.

Errors/Bugs identified as of December 5, 2021: None.

<h1> Preparation and exploration of the data</h1>

All necessary packages are loaded as well as some user defined functions and the data set, displaying the first 5 rows to understand the formatting of the data and its content.

In [None]:
# importing libraries
import pandas                  as pd                           # data science essentials
import numpy                   as np                           # mathematical essentials
import seaborn                 as sns                          # enhanced data viz
import matplotlib.pyplot       as plt                          # data visualization
import statsmodels.formula.api as smf                          # logistic regression
import gender_guesser.detector as gender                       # guess gender based on first name
from sklearn.model_selection import train_test_split           # train-test split
from sklearn.linear_model    import LogisticRegression         # logistic regression
from sklearn.metrics         import confusion_matrix           # confusion matrix
from sklearn.metrics         import roc_auc_score              # auc score
from sklearn.neighbors       import KNeighborsClassifier       # KNN for classification
from sklearn.neighbors       import KNeighborsRegressor        # KNN for regression
from sklearn.preprocessing   import StandardScaler             # standard scaler
from sklearn.tree            import DecisionTreeClassifier     # classification trees
from sklearn.tree            import plot_tree                  # tree plots
from sklearn.model_selection import RandomizedSearchCV         # hyperparameter tuning
from sklearn.metrics         import make_scorer                # customizable scorer
from sklearn.ensemble        import RandomForestClassifier     # random forest
from sklearn.ensemble        import GradientBoostingClassifier # gbm

In [None]:
# user-defined functions

########################################
#              mv_flagger              #
########################################
def mv_flagger(df):
    """
    Flags all columns that have missing values with 'COLUMN_NAME_unknown'.

    For every column containing at least one missing value, a new integer
    column named '<COLUMN_NAME>_unknown' is added, holding 1 where the
    original value is missing and 0 otherwise. Columns without missing
    values are left without a flag.

    PARAMETERS
    ----------
    df : DataFrame to flag missing values (modified in place)


    RETURNS
    -------
    The same DataFrame object, with missing value flags added.
    """

    # snapshotting the labels so the flag columns added below are never
    # part of the iteration
    for col in list(df.columns):

        # computing the missing value mask once per column
        missing = df[col].isnull()

        if missing.any():
            # the f-string keeps non-string labels (e.g. integers) working,
            # where concatenating with '+' would raise a TypeError
            df[f"{col}_unknown"] = missing.astype(int)

    return df


########################################
#           optimal_neighbors          #
########################################
def optimal_neighbors(x_data,
                      y_data,
                      standardize = True,
                      pct_test=0.25,
                      seed=219,
                      response_type='reg',
                      max_neighbors=20,
                      show_viz=True):
    """
    Exhaustively compute training and testing results for KNN across
    [1, max_neighbors]. Prints and returns the number of neighbors with the
    maximum test score and (by default) displays a visualization of the
    results.

    PARAMETERS
    ----------
    x_data        : explanatory variable data
    y_data        : response variable
    standardize   : whether or not to standardize the x data, default True
    pct_test      : test size for training and validation from (0,1), default 0.25
    seed          : random seed to be used in algorithm, default 219
    response_type : type of neighbors algorithm to use, default 'reg'
        Use 'reg' for regression (KNeighborsRegressor)
        Use 'class' for classification (KNeighborsClassifier)
    max_neighbors : maximum number of neighbors in exhaustive search, default 20
    show_viz      : display or suppress k-neighbors visualization, default True

    RETURNS
    -------
    Number of neighbors (int) with the highest test score; ties resolve to
    the smallest number of neighbors.

    RAISES
    ------
    ValueError if response_type is neither 'reg' nor 'class'.
    """

    # failing fast on an unsupported response type, before any model is
    # built, instead of printing a message and scoring an undefined model
    if response_type not in ('reg', 'class'):
        raise ValueError("response_type must be 'reg' or 'class'")


    if standardize:
        # optionally standardizing x_data
        # NOTE: the scaler is fit on all rows before the split below, so the
        # test rows contribute to the scaling statistics
        scaler = StandardScaler()
        scaler.fit(x_data)
        x_data = pd.DataFrame(scaler.transform(x_data))


    # train-test split
    x_train, x_test, y_train, y_test = train_test_split(x_data,
                                                        y_data,
                                                        test_size = pct_test,
                                                        random_state = seed)


    # creating lists for training set scores and test set scores
    training_accuracy = []
    test_accuracy = []


    # setting neighbor range
    neighbors_settings = range(1, max_neighbors + 1)


    # choosing the estimator once, based on response variable type
    if response_type == 'reg':
        knn_model = KNeighborsRegressor
    else:
        knn_model = KNeighborsClassifier


    for n_neighbors in neighbors_settings:
        # building the model for the current number of neighbors
        clf = knn_model(n_neighbors = n_neighbors)
        clf.fit(x_train, y_train)

        # recording the training set score
        training_accuracy.append(clf.score(x_train, y_train))

        # recording the generalization score
        test_accuracy.append(clf.score(x_test, y_test))


    # optionally displaying visualization
    if show_viz:
        # plotting the visualization
        plt.subplots(figsize=(12,8))
        plt.plot(neighbors_settings, training_accuracy, label = "training accuracy")
        plt.plot(neighbors_settings, test_accuracy, label = "test accuracy")
        plt.ylabel("Accuracy")
        plt.xlabel("n_neighbors")
        plt.legend()
        plt.show()


    # returning optimal number of neighbors (list positions start at 0,
    # neighbor counts start at 1)
    best_neighbors = test_accuracy.index(max(test_accuracy)) + 1
    print(f"The optimal number of neighbors is: {best_neighbors}")
    return best_neighbors


########################################
#               visual_cm              #
########################################
def visual_cm(true_y, pred_y, labels = None):
    """
    Creates a visualization of a confusion matrix.

    PARAMETERS
    ----------
    true_y : true values for the response variable
    pred_y : predicted values for the response variable
    labels : tick labels used on both axes of the heatmap, default None
        When None, seaborn's automatic tick labels are used.
    """
    # visualizing the confusion matrix

    # setting labels; None is mapped to 'auto' (seaborn's own default)
    # because the heatmap does not accept None as tick labels
    lbls = 'auto' if labels is None else labels


    # declaring a confusion matrix object
    cm = confusion_matrix(y_true = true_y,
                          y_pred = pred_y)


    # heatmap (fmt = 'g' prints the counts without scientific notation)
    sns.heatmap(cm,
                annot       = True,
                xticklabels = lbls,
                yticklabels = lbls,
                cmap        = 'Blues',
                fmt         = 'g')


    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix of the Classifier')
    plt.show()

In [None]:
# forcing the culture column to be read as text
data_types = {"culture": str}

# path of the workbook holding the character data
file = './GOT_character_predictions.xlsx'

# reading the first sheet, using its first row as the header
got = pd.read_excel(file,
                    sheet_name = 0,
                    header     = 0,
                    dtype      = data_types)

# widening the pandas display limits so wide or long outputs are not truncated
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 1000)



# displaying the head of the dataset
#got.head(n = 5)

<h2>Understanding the variables</h2>

The GOT data dictionary is loaded to provide a better understanding of the variables and to identify any features that are irrelevant for determining whether the characters are alive or not.

In [None]:
# pulling up data dictionary
#got_description = pd.read_excel('GOT_data_dictionary.xlsx')


# displaying the data dictionary
#got_description

<h2>Dropping columns</h2>

The feature <em>S.No</em> will be dropped, as it is an ordinal variable that works like an index.

In [None]:
# removing the character order column, which only acts as a row counter
got = got.drop(columns = "S.No")

<h2>Missing values</h2>

The following step checks the missing values of the dataset. First, the data is checked in terms of the number of missing values per feature, and then in terms of the percentage of missing values per feature, in order to establish a strategy for the missing values.

In [None]:
# counting the missing values in each feature
got.isna().sum()

In [None]:
# share of missing values in each feature, rounded to two decimals
got.isna().mean().round(2)

<h2>Flagging values</h2>

As the existence of missing values might be insightful, all of them will be flagged before moving forward with any imputation strategy or with dropping features due to low domain knowledge.

In [None]:
# adding a '<COLUMN_NAME>_unknown' flag for every feature with missing values
got = mv_flagger(got)

# checking results
#got.columns

<h2>Missing values strategies and anomalies</h2>
<br>

1. Relationship between missing values:

    * <strong>dateOfBirth</strong> has the same number of missing values as <strong>age</strong>
    * <strong>mother</strong> has the same number of missing values as <strong>isAliveMother</strong>
    * <strong>father</strong> has the same number of missing values as <strong>isAliveFather</strong>
    * <strong>heir</strong> has the same number of missing values as <strong>isAliveHeir</strong>
    * <strong>spouse</strong> has the same number of missing values as <strong>isAliveSpouse</strong>
    <br>
    
Therefore, it can be inferred that the second feature in each pair is calculated from the first one, meaning that they are strongly related and that including both variables in the model would add multicollinearity to it.
<br><br>

2. Strategies for related missing values:
    * <strong>dateOfBirth</strong>: drop feature as well as the related flagged feature, <strong>dateOfBirth_unknown</strong>
    * <strong>age</strong>:
        * fill missing values with mean by gender and title or noble status.
        * drop related flagged feature, <strong>age_unknown</strong>
        * It will be necessary to develop a new feature for gender. 
    * <strong>mother</strong>: drop feature, insufficient domain knowledge.
    * <strong>isAliveMother</strong>: 
        * drop feature, insufficient domain knowledge.
        * drop related flagged feature, <strong>isAliveMother_unknown</strong>
    * <strong>father</strong>: drop feature, insufficient domain knowledge.
    * <strong>isAliveFather</strong>: 
        * drop feature, insufficient domain knowledge.
        * drop related flagged feature, <strong>isAliveFather_unknown</strong>
    * <strong>heir</strong>: drop feature, insufficient domain knowledge.
    * <strong>isAliveHeir</strong>: 
        * drop feature, insufficient domain knowledge.
        * drop related flagged feature, <strong>isAliveHeir_unknown</strong>
    * <strong>spouse</strong>: drop feature, insufficient domain knowledge.
    * <strong>isAliveSpouse</strong>: 
        * drop feature, insufficient domain knowledge.
        * drop related flagged feature, <strong>isAliveSpouse_unknown</strong>
    <br><br><br>

3. Other missing values:
    * <strong>title</strong>: drop feature, insufficient domain knowledge.
    * <strong>house</strong>: impute missing values according to character's last name, otherwise create a "unknown" category.
    * <strong>culture</strong>: analyze if subcategories can be created.

<h2>Creating first and last name features</h2>

In [None]:
# STEP 1: splitting to obtain family name

# splitting each character name once at its LAST space, so the final word
# becomes the family name; the column is iterated directly because the rest
# of each row is not needed (avoids iterrows plus a per-row .loc lookup)
placeholder_lst = [name.rsplit(sep = ' ', maxsplit = 1) for name in got['name']]


# converting placeholder_lst into a new DataFrame that carries got's index,
# so both objects stay aligned row by row; names without a space produce a
# single piece and therefore a missing last_name
family_name_df = pd.DataFrame(placeholder_lst,
                              columns = ['name_no', 'last_name'],
                              index   = got.index)


# displaying the results
family_name_df.head(n = 5)

In [None]:
# STEP 2: splitting to obtain first name

# splitting each character name once at its FIRST space, so the leading word
# becomes the first name; the column is iterated directly because the rest
# of each row is not needed (avoids iterrows plus a per-row .loc lookup)
placeholder_lst = [name.split(sep = ' ', maxsplit = 1) for name in got['name']]


# converting placeholder_lst into a new DataFrame that carries got's index,
# so both objects stay aligned row by row; names without a space produce a
# single piece and therefore a missing no_familyname
first_name_df = pd.DataFrame(placeholder_lst,
                             columns = ['first_name', 'no_familyname'],
                             index   = got.index)


# displaying the results
first_name_df.head(n = 5)

In [None]:
# STEP 3: concatenating with original DataFrame

# appending only the first and last name columns side by side, then
# discarding the original name feature they were derived from
got = pd.concat(objs = [got,
                        first_name_df['first_name'],
                        family_name_df['last_name']],
                axis = 1).drop(columns = 'name')

# displaying the results
got.head(n = 5)

<h2>Creating and one-hot encoding a gender feature</h2>

<h3>Creating gender feature</h3>

The package needed to create the feature was imported, and its guesses were collected into a list. That list is then hard-coded as the gender feature, and the gender-guessing process is commented out to reduce processing time.

In [None]:
# installing gender_guesser
#pip install gender_guesser

In [None]:
# guessing gender based on first name

# placeholder list
#placeholder_lst = []

# looping to guess gender
#for name in got['first_name']:
#    guess = gender.Detector().get_gender(name)
#    print(guess)
#    placeholder_lst.append(guess)

The list created above was printed so that its output could be used directly to hard-code the gender variable and add it to the original DataFrame.

In [None]:
#print(placeholder_lst)

In [None]:
# creating list for gender
gender_lts = ['unknown', 'unknown', 'andy', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'male', 'male', 'mostly_male', 'mostly_male', 'mostly_male', 'mostly_male', 'mostly_male', 'mostly_male', 'unknown', 'male', 'unknown', 'unknown', 'male', 'male', 'female', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'mostly_female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'male', 'male', 'andy', 'andy', 'unknown', 'andy', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'male', 'male', 'unknown', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'mostly_male', 'male', 'mostly_male', 'mostly_male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 'unknown', 'male', 'unknown', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'mostly_male', 'unknown', 'unknown', 'male', 'female', 'andy', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'mostly_male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'mostly_female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'andy', 'female', 'unknown', 'unknown', 'unknown', 
'unknown', 'male', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'male', 'female', 'female', 'female', 'female', 'female', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'mostly_female', 'female', 'unknown', 'mostly_female', 'unknown', 'female', 'unknown', 'female', 'unknown', 'male', 'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 'unknown', 'andy', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'male', 'male', 'male', 'male', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'female', 'female', 'female', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 
'male', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'mostly_female', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 'male', 'unknown', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'mostly_male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'mostly_male', 'female', 'male', 'male', 'male', 'female', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'male', 'male', 'male', 'female', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'andy', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'female', 'male', 'unknown', 'unknown', 'female', 'male', 'unknown', 'male', 'unknown', 'unknown', 'male', 'female', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'female', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 'unknown', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
'unknown', 'unknown', 'female', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'male', 'unknown', 'male', 'male', 'unknown', 'unknown', 'male', 'male', 'unknown', 'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 'female', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'female', 'female', 'female', 'unknown', 'unknown', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'mostly_male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'mostly_female', 'mostly_female', 'mostly_female', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'female', 'male', 'male', 'male', 'male', 'unknown', 'female', 'female', 'female', 'unknown', 'mostly_male', 'unknown', 'unknown', 'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 'female', 'male', 'female', 'unknown', 'unknown', 'female', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 'unknown', 'unknown', 'female', 'unknown', 'female', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 
'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'female', 'unknown', 'male', 'unknown', 'unknown', 'mostly_female', 'male', 'female', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'female', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 'unknown', 'male', 'male', 'female', 'mostly_female', 'female', 'mostly_female', 'mostly_female', 'mostly_female', 'mostly_female', 'mostly_female', 'mostly_female', 'unknown', 'unknown', 'female', 'female', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'female', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'female', 'unknown', 'female', 'unknown', 'unknown', 'female', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 'unknown', 
'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'female', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'female', 'unknown', 'unknown', 'male', 'male', 'male', 'male', 'male', 'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'female', 'mostly_male', 'unknown', 'female', 'male', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'male', 'male', 'male', 'male', 'male', 'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'male', 'male', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'male', 'male', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'female', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'female', 'unknown', 'female', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 'unknown', 'unknown', 'male', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'mostly_male', 'male', 'unknown', 'unknown', 'male', 'male', 'male', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'female', 'female', 'female', 'male', 'male', 'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 'male', 'unknown', 'female', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 'unknown', 'male', 'male', 'andy', 'male', 'male', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'female', 'unknown', 'male', 'male', 'male', 'male', 'male', 'male', 'mostly_male', 'mostly_male', 'mostly_male', 'mostly_male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 'male', 'unknown', 'male', 'male', 'unknown', 'male', 'male', 'unknown', 'unknown', 
'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'female', 'female', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 'male', 'unknown', 'unknown', 'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'female', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'mostly_male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'mostly_female', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'male', 'male', 
'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'mostly_female', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'male', 'male', 'mostly_female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'female', 'unknown', 'female', 'unknown', 'female', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'andy', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'female', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'mostly_female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'male', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 
'unknown', 'unknown', 'unknown', 'unknown', 'female', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'male', 'unknown', 'male', 'male', 'unknown', 'male', 'unknown', 'male', 'male', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'male', 'male', 'male', 'male', 'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'mostly_male', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'male', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'mostly_female', 'unknown', 'unknown', 'unknown', 'female', 'male', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'female', 'male', 'mostly_male', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'female', 'unknown', 'unknown', 'unknown', 'male', 'unknown', 'unknown']

# wrapping the hard-coded guesses in a Series and attaching it as a new
# column; values are matched to rows through the Series' default 0..n-1 index
got = got.assign(gender_guess = pd.Series(gender_lts))


# checking results
#got.head(n = 5)

<h3>One-hot encoding gender feature</h3>

It is necessary to convert the gender feature as it is a categorical value.

In [None]:
# one hot encoding variables
# dtype = int keeps the dummies numeric on every pandas version: pandas 2.0+
# returns boolean dummies by default, which would turn the column additions
# done later into logical ORs and hand boolean features to the models
one_hot_gender_guess = pd.get_dummies(got['gender_guess'], dtype = int)

# joining codings together
got = got.join(other = [one_hot_gender_guess])

# checking results
#got.columns

<h3>Creating sub-categories for gender feature</h3>

Gender has 6 categories, and 3 of them have fewer than 100 observations; therefore the categories will be consolidated through feature engineering.

In [None]:
# frequency of each guessed gender category
got.loc[ : , 'gender_guess'].value_counts()

In [None]:
# collapsing the six guessed-gender dummies into three broader indicators;
# each new feature is the sum of the two dummies listed next to it
gender_groups = {
    'isFemale'    : ('female',  'mostly_female'),   # total female
    'isMale'      : ('male',    'mostly_male'),     # total male
    'genderUndet' : ('unknown', 'andy'),            # total undetermined gender
}

for new_feature, (first_dummy, second_dummy) in gender_groups.items():
    got[new_feature] = got[first_dummy] + got[second_dummy]


# the original guess and its dummies are no longer needed
got = got.drop(columns = ['gender_guess',
                          'female', 'mostly_female',
                          'male', 'mostly_male',
                          'unknown', 'andy'])

<h2>Dropping features</h2>

Features were dropped per the strategy stated before.

In [None]:
# groups of features to drop, removed from got one group at a time
features_to_drop = [

    # date of birth related variables
    ['dateOfBirth', 'dateOfBirth_unknown'],

    # mother related variables
    ['mother', 'isAliveMother', 'isAliveMother_unknown'],

    # father related variables
    ['father', 'isAliveFather', 'isAliveFather_unknown'],

    # heir related variables
    ['heir', 'isAliveHeir', 'isAliveHeir_unknown'],

    # spouse related variables
    ['spouse', 'isAliveSpouse', 'isAliveSpouse_unknown'],

    # additional variables
    ['age_unknown', 'title'],
]

for feature_group in features_to_drop:
    got = got.drop(columns = feature_group)

<h2>Imputing missing values for age</h2>

There are 2 observations with an age below 0, meaning that the characters have not been born yet.
<br>

In [None]:
# summary statistics of the age feature
got.loc[ : , 'age'].describe()

In [None]:
# number of observations with a negative age
got['age'].lt(0).sum()

In [None]:
# checking age by categories

# observations with a positive age
is_born = got['age'] > 0


def _mean_born_age(column, value):
    """
    Returns the mean age, rounded to 1 decimal, of the observations with a
    positive age whose `column` equals `value`.
    """
    return round(got.loc[is_born & (got[column] == value), 'age'].mean(), 1)


print(f"""
Mean Age:                     {round(got['age'].mean(), 1)}
Mean Born Characters Age:     {round(got.loc[is_born, 'age'].mean(), 1)}

-------------------------------------------

Mean Female Age:              {_mean_born_age('isFemale', 1)}
Mean Male Age:                {_mean_born_age('isMale', 1)}
Mean Undetermined Age:        {_mean_born_age('genderUndet', 1)}

-------------------------------------------

Mean Title Age:               {_mean_born_age('title_unknown', 0)}
Mean NOT Title Age:           {_mean_born_age('title_unknown', 1)}

Mean Noble Age:               {_mean_born_age('isNoble', 1)}
Mean NOT Noble Age:           {_mean_born_age('isNoble', 0)}
""")

Checking the ages by categories shows that female average age is lower than male and undetermined gender, and these last two have a similar average age.
<br><br>
In addition, the average ages by title status and by noble status are virtually the same; therefore <em>isNoble</em> will be used to fill the missing values in age, since this feature has no missing values in the original dataset.

In [None]:
# imputing missing values for age with the average age of the character's
# (gender, nobility) group

def _group_avg_age(gender_col, noble_flag):
    """
    Returns the mean age, rounded to 1 decimal, of the characters flagged
    (== 1) in gender_col whose isNoble value equals noble_flag.

    PARAMETERS
    ----------
    gender_col : name of the gender indicator column in got
    noble_flag : isNoble value of the group (1 = noble, 0 = not noble)
    """
    in_group = (got[gender_col] == 1) & (got['isNoble'] == noble_flag)

    return round(got.loc[in_group, 'age'].mean(), ndigits = 1)


# storing average age for noble characters by gender
noble_female_avg_age            = _group_avg_age('isFemale',    noble_flag = 1)
noble_male_avg_age              = _group_avg_age('isMale',      noble_flag = 1)
noble_undetermined_avg_age      = _group_avg_age('genderUndet', noble_flag = 1)


# storing average age for NOT noble characters by gender
# (the gender flag must be 1 here as well; filtering on 0 would average the
#  characters that do NOT belong to the gender being imputed)
notNoble_female_avg_age         = _group_avg_age('isFemale',    noble_flag = 0)
notNoble_male_avg_age           = _group_avg_age('isMale',      noble_flag = 0)
notNoble_undetermined_avg_age   = _group_avg_age('genderUndet', noble_flag = 0)


# (gender indicator, isNoble value, average age to impute), in the same
# order the groups were checked before
age_fill_values = [('isFemale',    1, noble_female_avg_age),
                   ('isMale',      1, noble_male_avg_age),
                   ('genderUndet', 1, noble_undetermined_avg_age),
                   ('isFemale',    0, notNoble_female_avg_age),
                   ('isMale',      0, notNoble_male_avg_age),
                   ('genderUndet', 0, notNoble_undetermined_avg_age)]


# imputing missing values for age, one group at a time
for gender_col, noble_flag, avg_age in age_fill_values:

    # missing ages are re-evaluated on every pass so a value filled by an
    # earlier group is never overwritten by a later one
    needs_fill = got['age'].isnull()            & \
                 (got[gender_col] == 1)         & \
                 (got['isNoble']  == noble_flag)

    got.loc[needs_fill, 'age'] = avg_age


# ensuring all missing values for age are filled
print(f"Remaining missing values for age: {got.loc[ :, 'age'].isnull().sum()}")

<h2>Imputing missing values for house</h2>

In [None]:
# lowercasing the house names so the same house is not counted under
# different capitalizations
house_lower  = got['house'].str.lower()
got['house'] = house_lower

# checking house frequency
got['house'].value_counts()

In order to use the character's last name for imputing missing values, the missing values in <em>last_name</em> will be filled as "unknown".

In [None]:
# filling missing last names with 'unknown' and lowercasing every last name
got['last_name'] = got['last_name'].fillna('unknown').str.lower()

# ensuring all missing values for last_name are filled
remaining_last_name = got.loc[ :, 'last_name'].isnull().sum()
print(f"Remaining missing values for last name: {remaining_last_name}")

In [None]:
# creating categories for houses and one-hot encoding them

# creating night_watch category (exact match on the house name)
got['night_watch'] = np.where(got['house'] == "night's watch", 1, 0)

# checking frequency
print(got['night_watch'].value_counts())


# houses flagged when their name appears in either the character's
# last name or the character's house
great_houses = ['frey', 'stark', 'targaryen', 'lannister',
                'greyjoy', 'tyrell', 'martell']

for house_name in great_houses:

    # na = False makes a missing last name / house an explicit "no match"
    # instead of relying on how | treats missing values
    in_last_name = got['last_name'].str.contains(house_name, na = False)
    in_house     = got['house'].str.contains(house_name, na = False)

    # creating house_<name> category
    got[f'house_{house_name}'] = np.where(in_last_name | in_house, 1, 0)

    # checking frequency
    print(got[f'house_{house_name}'].value_counts())

<h2>Dropping remaining columns</h2>

In [None]:
# categorical features that will not be used in the models
unused_categoricals = ['culture', 'house', 'house_unknown',
                       'first_name', 'last_name']
got = got.drop(columns = unused_categoricals)

# dropping stratum for categorical variables
got = got.drop(columns = ['genderUndet'])

<h1>Train and testing sets</h1>

<h2>Correlation analysis</h2>

In [None]:
# correlation matrix, rounded to two decimals
got_corr = got.corr().round(decimals = 2)

# correlations with the response variable, strongest positive first
got_corr.loc[ : , 'isAlive'].sort_values(ascending = False)

<h2>Stratifying the response variable</h2> 

The following will be done to preserve the balance of the response variable in the training and testing sets.

In [None]:
# share of each class of the response variable
got['isAlive'].value_counts(normalize = True).round(decimals = 2)

<h2>Train and testing sets for statsmodels</h2> 

In [None]:
# response variable
got_target = got['isAlive']

# explanatory variables: every column except the response
got_data = got.drop(columns = 'isAlive')

In [None]:
# stratified train-test split: 10% of the observations are held out and
# stratify keeps the balance of the response variable in both sets
split_sets = train_test_split(got_data,
                              got_target,
                              test_size    = 0.10,
                              random_state = 219,
                              stratify     = got_target)

x_train, x_test, y_train, y_test = split_sets


# statsmodels needs explanatory and response variables in one DataFrame
got_train = pd.concat(objs = [x_train, y_train], axis = 1)

In [None]:
# checking balance preservation: class proportions of each set
train_proportions = y_train.value_counts(normalize = True).round(decimals = 2)
test_proportions  = y_test.value_counts(normalize = True).round(decimals = 2)

print(f"""

Response Variable Proportions (Training Set)
--------------------------------------------
{train_proportions}



Response Variable Proportions (Testing Set)
--------------------------------------------
{test_proportions}
""")

<h2>Logistic Regression</h2>

In [None]:
# getting all variables in got_data
#for val in got_data:
#    print(f" {val} + ")

In [None]:
# logistic regression (statsmodels) with a single explanatory variable
book4_formula = """isAlive ~ book4_A_Feast_For_Crows"""

logistic = smf.logit(formula = book4_formula,
                     data    = got_train)


# estimating the model
results_logistic = logistic.fit()


# displaying the summary (summary2() has AIC and BIC)
results_logistic.summary2()

In [None]:
# logistic regression (statsmodels) with every explanatory variable
full_formula = """isAlive ~ book1_A_Game_Of_Thrones + 
                                                 book2_A_Clash_Of_Kings + 
                                                 book3_A_Storm_Of_Swords + 
                                                 book4_A_Feast_For_Crows + 
                                                 book5_A_Dance_with_Dragons + 
                                                 isMarried + 
                                                 isNoble + 
                                                 age + 
                                                 numDeadRelations + 
                                                 popularity + 
                                                 title_unknown + 
                                                 culture_unknown + 
                                                 mother_unknown + 
                                                 father_unknown + 
                                                 heir_unknown + 
                                                 spouse_unknown + 
                                                 isFemale + 
                                                 isMale + 
                                                 night_watch + 
                                                 house_frey + 
                                                 house_stark + 
                                                 house_targaryen + 
                                                 house_lannister + 
                                                 house_greyjoy + 
                                                 house_tyrell + 
                                                 house_martell"""

logistic = smf.logit(formula = full_formula,
                     data    = got_train)


# estimating the model
results_logistic = logistic.fit()


# displaying the summary (summary2() has AIC and BIC)
results_logistic.summary2()

In [None]:
# logistic regression (statsmodels) with a reduced set of explanatory variables
sig_formula = """isAlive ~ book1_A_Game_Of_Thrones + 
                                                 book2_A_Clash_Of_Kings + 
                                                 book4_A_Feast_For_Crows +
                                                 popularity + 
                                                 house_targaryen +
                                                 house_tyrell
                                                 """

logistic = smf.logit(formula = sig_formula,
                     data    = got_train)


# estimating the model
results_logistic = logistic.fit()


# displaying the summary (summary2() has AIC and BIC)
results_logistic.summary2()

<h1>Logistic classification model</h1>

The upcoming models will use the explanatory variables of 2 candidate models: <em>logit_full</em> includes all the explanatory variables after feature engineering, and <em>logit_sig</em> includes only the variables that were statistically significant in the previous logit regression model.

In [None]:
# candidate sets of explanatory variables, keyed by model name
candidate_dict = {

    # full model: every explanatory variable after feature engineering
    'logit_full' : [
        # book appearances
        'book1_A_Game_Of_Thrones',
        'book2_A_Clash_Of_Kings',
        'book3_A_Storm_Of_Swords',
        'book4_A_Feast_For_Crows',
        'book5_A_Dance_with_Dragons',
        # character attributes
        'isMarried',
        'isNoble',
        'age',
        'numDeadRelations',
        'popularity',
        # missing value flags
        'title_unknown',
        'culture_unknown',
        'mother_unknown',
        'father_unknown',
        'heir_unknown',
        'spouse_unknown',
        # gender
        'isFemale',
        'isMale',
        # houses
        'night_watch',
        'house_frey',
        'house_stark',
        'house_targaryen',
        'house_lannister',
        'house_greyjoy',
        'house_tyrell',
        'house_martell',
    ],

    # significant variables only
    'logit_sig' : [
        'book1_A_Game_Of_Thrones',
        'book2_A_Clash_Of_Kings',
        'book4_A_Feast_For_Crows',
        'popularity',
        'house_targaryen',
        'house_tyrell',
    ],
}

<h2>Building a logistic regression model in scikit-learn</h2>

The regression model will be built using <em>logit_sig</em> as explanatory variables.

In [None]:
# explanatory and response data for the significant-variables model
got_target =  got.loc[ : , 'isAlive']
got_data   =  got.loc[ : , candidate_dict['logit_sig']]


# stratified train/test split, 10% held out for testing
x_train, x_test, y_train, y_test = train_test_split(got_data,
                                                    got_target,
                                                    test_size    = 0.10,
                                                    random_state = 219,
                                                    stratify     = got_target)


# INSTANTIATING, FITTING and PREDICTING with a logistic regression model
logreg      = LogisticRegression(solver       = 'lbfgs',
                                 C            = 1,
                                 random_state = 219)
logreg_fit  = logreg.fit(x_train, y_train)
logreg_pred = logreg_fit.predict(x_test)


# saving scoring data for future use (accuracy and train-test gap)
logreg_train_score = logreg_fit.score(x_train, y_train).round(4)
logreg_test_score  = logreg_fit.score(x_test, y_test).round(4)
logreg_test_gap    = abs(logreg_train_score - logreg_test_score).round(4)


# SCORING the results
print('LogReg Training ACCURACY:', logreg_train_score)
print('LogReg Testing  ACCURACY:', logreg_test_score)
print('LogReg Train-Test Gap   :', logreg_test_gap)

In [None]:
# confusion matrix of the logistic regression on the testing set
logreg_cm = confusion_matrix(y_true = y_test,
                             y_pred = logreg_pred)

# flattening the matrix and naming each of its four cells
logreg_tn, logreg_fp, logreg_fn, logreg_tp = logreg_cm.ravel()


# printing each result one-by-one
print(f"""
True Negatives : {logreg_tn}
False Positives: {logreg_fp}
False Negatives: {logreg_fn}
True Positives : {logreg_tp}
""")

In [None]:
# calling the visual_cm function
#visual_cm(true_y = y_test,
#          pred_y = logreg_pred,
#          labels = ['Dead', 'Alive'])

In [None]:
# area under the roc curve (auc), saved for future use
logreg_auc_score = roc_auc_score(y_true  = y_test,
                                 y_score = logreg_pred).round(decimals = 4)

print(logreg_auc_score)

In [None]:
# pairing each feature name with its rounded coefficient
logreg_model_values = zip(got[candidate_dict['logit_sig']].columns,
                          logreg_fit.coef_.ravel().round(decimals = 2))


# the intercept goes first, followed by the feature-coefficient pairs
logreg_model_lst = [('intercept', logreg_fit.intercept_[0].round(decimals = 2))]
logreg_model_lst.extend(logreg_model_values)


# checking the results
for pair in logreg_model_lst:
    print(pair)

<strong>Logistic Regression Comments:</strong>
<br>

The accuracy of the model increased for the testing set; however, the train-test gap is above 0.05, which suggests that the model is slightly overfitted.
<br>

In addition, since the AUC score is below 0.075, the analysis will continue to check the performance of other models.

<h1>Classification Trees (CART Models)</h1>

In [None]:
########################################
# plot_feature_importances
########################################
def plot_feature_importances(model, train, export = False):
    """
    Plots the importance of features from a CART model as a horizontal
    bar chart, one bar per column of train.
    
    PARAMETERS
    ----------
    model  : fitted CART model (read through its feature_importances_)
    train  : explanatory variable training data the model was fitted on
             (its columns label the bars)
    export : whether or not to export as a .png image, default False
    """
    
    # number of bars comes from the data handed in, not from a global
    # training set, so the function works for any model/data pair
    n_features = train.shape[1]
    
    # setting plot window
    fig, ax = plt.subplots(figsize=(12,9))
    
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), train.columns)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    
    if export:
        plt.savefig('Tree_Leaf_50_Feature_Importance.png')
        
########################################
# plot_feature_importances_full
########################################
def plot_feature_importances_full(model, train, export = False):
    """
    Plots the importance of features from a CART model as a horizontal
    bar chart, one bar per column of train.
    
    PARAMETERS
    ----------
    model  : fitted CART model (read through its feature_importances_)
    train  : explanatory variable training data the model was fitted on
             (its columns label the bars)
    export : whether or not to export as a .png image, default False
    """
    
    # number of bars comes from the data handed in, not from a global
    # x_train_full, so the function does not fail when that name is missing
    n_features = train.shape[1]
    
    # setting plot window
    fig, ax = plt.subplots(figsize=(12,9))
    
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), train.columns)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    
    if export:
        plt.savefig('Tree_Leaf_50_Feature_Importance.png')

In [None]:
# INSTANTIATING a classification tree object
# random_state is fixed, as for every other model in this analysis, so the
# scores reported below can be reproduced from one run to the next
full_tree = DecisionTreeClassifier(random_state = 219)


# FITTING the training data
full_tree_fit = full_tree.fit(x_train, y_train)


# PREDICTING on new data
full_tree_pred = full_tree_fit.predict(x_test)


# saving scoring data for future use
full_tree_train_score = full_tree_fit.score(x_train, y_train).round(4) # accuracy
full_tree_test_score  = full_tree_fit.score(x_test, y_test).round(4)   # accuracy


# saving AUC
full_tree_auc_score   = roc_auc_score(y_true  = y_test,
                                      y_score = full_tree_pred).round(4) # auc


# SCORING the model
print('Full Tree Training ACCURACY:', full_tree_train_score)
print('Full Tree Testing ACCURACY :', full_tree_test_score)
print('Full Tree AUC Score:', full_tree_auc_score)

In [None]:
# confusion matrix of the full tree on the testing set
full_tree_cm = confusion_matrix(y_true = y_test,
                                y_pred = full_tree_pred)

# flattening the matrix and naming each of its four cells
full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp = full_tree_cm.ravel()


# printing each result one-by-one
print(f"""
True Negatives : {full_tree_tn}
False Positives: {full_tree_fp}
False Negatives: {full_tree_fn}
True Positives : {full_tree_tp}
""")

In [None]:
# INSTANTIATING a classification tree limited in depth and leaf size
pruned_tree = DecisionTreeClassifier(random_state     = 219,
                                     max_depth        = 4,
                                     min_samples_leaf = 25)


# FITTING the training data and PREDICTING on the testing set
pruned_tree_fit  = pruned_tree.fit(x_train, y_train)
pruned_tree_pred = pruned_tree_fit.predict(x_test)


# saving scoring data for future use (accuracy and auc)
pruned_tree_train_score = pruned_tree_fit.score(x_train, y_train).round(4)
pruned_tree_test_score  = pruned_tree_fit.score(x_test, y_test).round(4)
pruned_tree_auc_score   = roc_auc_score(y_true  = y_test,
                                        y_score = pruned_tree_pred).round(4)


# SCORING the model
print('Training ACCURACY:', pruned_tree_train_score)
print('Testing  ACCURACY:', pruned_tree_test_score)
print('AUC Score        :', pruned_tree_auc_score)

In [None]:
# confusion matrix of the pruned tree on the testing set
pruned_tree_cm = confusion_matrix(y_true = y_test,
                                  y_pred = pruned_tree_pred)

# flattening the matrix and naming each of its four cells
pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp = pruned_tree_cm.ravel()


# printing each result one-by-one
print(f"""
True Negatives : {pruned_tree_tn}
False Positives: {pruned_tree_fp}
False Negatives: {pruned_tree_fn}
True Positives : {pruned_tree_tp}
""")

In [None]:
# plotting feature importance
#plot_feature_importances(pruned_tree_fit,
#                         train = x_train,
#                         export = False)

<strong>Comments on Feature Importance</strong>

- <em>Popularity</em> and <em>book4_A_Feast_For_Crows</em> are the most important features in terms of splitting the data into nodes.

- <em>house_tyrell</em> and <em>book1_A_Game_Of_Thrones</em> are the second most important pair of features in terms of splitting the data into nodes.
    
- <em>house_targaryen</em> and <em>book2_A_Clash_Of_Kings</em> do not seem to be important features in terms of splitting the data into nodes.

In [None]:
# table of model results: one row per model, built column by column
model_performance = pd.DataFrame(data = {

    'Model Name'        : ['Logistic',
                           'Full Tree',
                           'Pruned Tree'],

    'AUC Score'         : [logreg_auc_score,
                           full_tree_auc_score,
                           pruned_tree_auc_score],

    'Training Accuracy' : [logreg_train_score,
                           full_tree_train_score,
                           pruned_tree_train_score],

    'Testing Accuracy'  : [logreg_test_score,
                           full_tree_test_score,
                           pruned_tree_test_score],

    # each confusion matrix is stored as (tn, fp, fn, tp)
    'Confusion Matrix'  : [(logreg_tn, logreg_fp, logreg_fn, logreg_tp),
                           (full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp),
                           (pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp)],
})

<h1>Classification Modeling with KNN</h1>

In [None]:
# determining the optimal number of neighbors
#opt_neighbors = optimal_neighbors(x_data        = got_data,
#                                  y_data        = got_target,
#                                  response_type = 'class')

<strong>Results from the previous code:</strong> 
<br><br>
        The optimal number of neighbors is: 18

In [None]:
# INSTANTIATING StandardScaler()
scaler = StandardScaler()


# FITTING the data
# NOTE(review): the scaler is fitted on every observation, so the testing
# rows take part in the scaling; fit on the training rows only if a fully
# out-of-sample evaluation is required
scaler.fit(got_data)


# TRANSFORMING the data
x_scaled     = scaler.transform(got_data)


# converting to a DataFrame
x_scaled_df  = pd.DataFrame(x_scaled) 


# train-test split with the scaled data
x_train_scaled, x_test_scaled, y_train_scaled, y_test_scaled = train_test_split(
            x_scaled_df,
            got_target,
            random_state = 219,
            test_size    = 0.10,
            stratify     = got_target)


# INSTANTIATING a KNN classification model with optimal neighbors
knn_opt = KNeighborsClassifier(n_neighbors = 18)


# FITTING the training data
knn_fit = knn_opt.fit(x_train_scaled, y_train_scaled)


# PREDICTING based on the testing set
knn_pred = knn_fit.predict(x_test_scaled)


# saving scoring data
knn_train_score = knn_fit.score(x_train_scaled, y_train_scaled).round(4)
knn_test_score  = knn_fit.score(x_test_scaled, y_test_scaled).round(4)


# saving AUC score
# scored against the response of THIS split (y_test_scaled), the same one
# used for the accuracy, instead of the y_test left over from another split
knn_auc_score   = roc_auc_score(y_true  = y_test_scaled,
                                y_score = knn_pred).round(4)


# SCORING the results
print('Training ACCURACY:', knn_train_score)
print('Testing  ACCURACY:', knn_test_score)
print('AUC Score        :', knn_auc_score)

In [None]:
# calling the visual_cm function
#visual_cm(true_y = y_test,
#          pred_y = knn_pred,
#          labels = ['Dead', 'Alive'])

In [None]:
# confusion matrix of the KNN model on the testing set
knn_cm = confusion_matrix(y_true = y_test,
                          y_pred = knn_pred)

# flattening the matrix and naming each of its four cells
knn_tree_tn, knn_tree_fp, knn_tree_fn, knn_tree_tp = knn_cm.ravel()


# printing each result one-by-one
print(f"""
True Negatives : {knn_tree_tn}
False Positives: {knn_tree_fp}
False Negatives: {knn_tree_fn}
True Positives : {knn_tree_tp}
""")

In [None]:
# declaring model performance objects
# (AUC uses y_test_scaled, the response of the same split as the accuracy)
knn_train_score = knn_fit.score(x_train_scaled, y_train_scaled).round(4)
knn_test_score  = knn_fit.score(x_test_scaled, y_test_scaled).round(4)
knn_auc_score   = roc_auc_score(y_true  = y_test_scaled,
                                y_score = knn_pred).round(4)

# building the KNN results as a one-row DataFrame
knn_performance = pd.DataFrame([
                          {'Model Name'        : 'FINAL MODEL - KNN Tree',
                          'Training Accuracy'  : knn_train_score,
                          'Testing Accuracy'   : knn_test_score, 
                          'AUC Score'          : knn_auc_score,
                          'Confusion Matrix'   : (knn_tree_tn, 
                                                  knn_tree_fp, 
                                                  knn_tree_fn, 
                                                  knn_tree_tp)}])

# appending to model_performance
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
model_performance = pd.concat(objs         = [model_performance, knn_performance],
                              ignore_index = True)

<h2>Logistic Regression with Default Hyperparameters</h2>

In [None]:
# logistic regression model with the hyperparameters spelled out
lr_default_params = {'solver'       : 'lbfgs',
                     'C'            : 1.0,
                     'warm_start'   : False,
                     'random_state' : 219}

# INSTANTIATING the model
lr_default = LogisticRegression(**lr_default_params)

In [None]:
# FITTING the training data and PREDICTING based on the testing set
lr_default_fit  = lr_default.fit(x_train, y_train)
lr_default_pred = lr_default_fit.predict(x_test)


# saving scoring data for future use (accuracy and auc);
# these overwrite the scores saved for the earlier logistic regression
logreg_train_score = lr_default_fit.score(x_train, y_train).round(4)
logreg_test_score  = lr_default_fit.score(x_test, y_test).round(4)
logreg_auc_score   = roc_auc_score(y_true  = y_test,
                                   y_score = lr_default_pred).round(4)


# SCORING the results
print('Training ACCURACY:', logreg_train_score)
print('Testing  ACCURACY:', logreg_test_score)
print('AUC Score        :', logreg_auc_score)

<h2>Hyperparameter Tuning with RandomizedSearchCV</h2>

In [None]:
########################################
# RandomizedSearchCV
########################################

# declaring a hyperparameter space
#C_range          = np.arange(0.1, 5.0, 0.1)
#warm_start_range = [True, False]
#solver_range     = ['newton-cg', 'sag', 'lbfgs']


# creating a hyperparameter grid
#param_grid = {'C'          : C_range,
#              'warm_start' : warm_start_range,
#              'solver'     : solver_range}


# INSTANTIATING the model object without hyperparameters
#lr_tuned = LogisticRegression(random_state = 219,
#                              max_iter     = 1000) # increased for convergence


# RandomizedSearchCV object
#lr_tuned_cv = RandomizedSearchCV(estimator           = lr_tuned,   # the model object
#                                 param_distributions = param_grid, # parameters to tune
#                                 cv                  = 3,          # how many folds in cross-validation
#                                 n_iter              = 250,        # number of combinations of hyperparameters to try
#                                 random_state        = 219,        # starting point for random sequence
#                                 scoring = make_scorer(
#                                           roc_auc_score,
#                                           needs_threshold = False)) # scoring criteria (AUC)


# FITTING to the FULL DATASET (due to cross-validation)
#lr_tuned_cv.fit(got_data, got_target)


# printing the optimal parameters and best score
#print("Tuned Parameters  :", lr_tuned_cv.best_params_)
#print("Tuned CV AUC      :", lr_tuned_cv.best_score_.round(4))

<strong>Results from the previous code:</strong> 
<br>

- Tuned Parameters  : {'warm_start': True, 'solver': 'sag', 'C': 4.9}
- Tuned CV AUC      : 0.5971

<h2>Logistic Regression with Tuned Hyperparameters</h2>

In [None]:
# checking the best estimator for the model
#lr_tuned_cv.best_estimator_

<strong>Results from the previous code:</strong> 
<br>

- LogisticRegression(C=4.9, max_iter=1000, random_state=219, solver='sag', warm_start=True)

In [None]:
# building a model based on hyperparameter tuning results

# INSTANTIATING a logistic regression model with tuned values
lr_tuned = LogisticRegression(C            = 4.9,
                              warm_start   = True,
                              solver       = 'sag',
                              max_iter     = 1000,
                              random_state = 219)


# FITTING on the training set only. The model is evaluated on x_test below,
# so fitting on the full dataset (got_data) would let the model see the
# testing rows and inflate the testing accuracy / AUC.
# NOTE(review): assumes x_test was held out of got_data by the earlier
# train/test split - confirm against that cell.
lr_tuned.fit(x_train, y_train)


# PREDICTING based on the testing set
lr_tuned_pred = lr_tuned.predict(x_test)


# saving scoring data for future use (computed once, reused when printing)
lr_tuned_train_score = lr_tuned.score(x_train, y_train).round(4) # accuracy
lr_tuned_test_score  = lr_tuned.score(x_test, y_test).round(4)   # accuracy


# saving the AUC score (computed from the class predictions, like the
# other models in this notebook)
lr_tuned_auc         = roc_auc_score(y_true  = y_test,
                                     y_score = lr_tuned_pred).round(4) # auc


# SCORING the results
print('LR Tuned Training ACCURACY:', lr_tuned_train_score)
print('LR Tuned Testing  ACCURACY:', lr_tuned_test_score)
print('LR Tuned AUC Score        :', lr_tuned_auc)

In [None]:
# flattening the 2x2 confusion matrix into its four counts
(lr_tuned_tn,
 lr_tuned_fp,
 lr_tuned_fn,
 lr_tuned_tp) = confusion_matrix(y_pred = lr_tuned_pred, y_true = y_test).ravel()


# reporting each count on its own line
print(f"\nTrue Negatives : {lr_tuned_tn}\n"
      f"False Positives: {lr_tuned_fp}\n"
      f"False Negatives: {lr_tuned_fn}\n"
      f"True Positives : {lr_tuned_tp}\n")

In [None]:
# declaring model performance objects
lr_train_acc = lr_tuned.score(x_train, y_train).round(4)
lr_test_acc  = lr_tuned.score(x_test, y_test).round(4)
lr_auc       = roc_auc_score(y_true  = y_test,
                             y_score = lr_tuned_pred).round(4)


# adding the Tuned LR row to model_performance
# (DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
#  row is wrapped in a one-row DataFrame and concatenated instead)
model_performance = pd.concat(
                          [model_performance,
                           pd.DataFrame([{'Model Name'        : 'Tuned LR',
                                          'Training Accuracy' : lr_train_acc,
                                          'Testing Accuracy'  : lr_test_acc,
                                          'AUC Score'         : lr_auc,
                                          'Confusion Matrix'  : (lr_tuned_tn,
                                                                 lr_tuned_fp,
                                                                 lr_tuned_fn,
                                                                 lr_tuned_tp)}])],
                          ignore_index = True)

<h2>Hyperparameter Tuning on Classification Trees</h2>

In [None]:
# declaring a hyperparameter space
#criterion_range = ['gini', 'entropy']
#splitter_range  = ['best', 'random']
#depth_range     = np.arange(1, 25, 1)
#leaf_range      = np.arange(1, 100, 1)


# creating a hyperparameter grid
#param_grid = {'criterion'        : criterion_range,
#              'splitter'         : splitter_range,
#              'max_depth'        : depth_range,
#              'min_samples_leaf' : leaf_range}


# INSTANTIATING the model object without hyperparameters
#tuned_tree = DecisionTreeClassifier(random_state = 219)


# RandomizedSearchCV object
#tuned_tree_cv = RandomizedSearchCV(estimator             = tuned_tree,
#                                   param_distributions   = param_grid,
#                                   cv                    = 3,
#                                   n_iter                = 1000,
#                                   random_state          = 219,
#                                   scoring = make_scorer(roc_auc_score,
#                                             needs_threshold = False))


# FITTING to the FULL DATASET (due to cross-validation)
#tuned_tree_cv.fit(got_data, got_target)


# PREDICT step is not needed


# printing the optimal parameters and best score
#print("Tuned Parameters  :", tuned_tree_cv.best_params_)
#print("Tuned Training AUC:", tuned_tree_cv.best_score_.round(4))

<strong>Results from the previous code:</strong> 
<br>

- Tuned Parameters  : {'splitter': 'best', 'min_samples_leaf': 4, 'max_depth': 7, 'criterion': 'gini'}
- Tuned Training AUC: 0.6744

In [None]:
# building a model based on hyperparameter tuning results

# INSTANTIATING a classification tree with tuned values
tree_tuned = DecisionTreeClassifier(splitter         = 'best',
                                    min_samples_leaf = 4,
                                    max_depth        = 7,
                                    criterion        = 'gini',
                                    random_state     = 219)


# FITTING on the training set only. The tree is evaluated on x_test below,
# so fitting on the full dataset (got_data) would let it see the testing
# rows and inflate the testing accuracy / AUC.
# NOTE(review): assumes x_test was held out of got_data by the earlier
# train/test split - confirm against that cell.
tree_tuned_fit = tree_tuned.fit(x_train, y_train)


# PREDICTING based on the testing set
tree_tuned_pred = tree_tuned.predict(x_test)


# saving scoring data for future use (computed once, reused when printing)
tree_tuned_train_score = tree_tuned.score(x_train, y_train).round(4) # accuracy
tree_tuned_test_score  = tree_tuned.score(x_test, y_test).round(4)   # accuracy


# saving the AUC score (computed from the class predictions)
tree_tuned_auc         = roc_auc_score(y_true  = y_test,
                                       y_score = tree_tuned_pred).round(4) # auc


# SCORING the results
print('Training ACCURACY:', tree_tuned_train_score)
print('Testing  ACCURACY:', tree_tuned_test_score)
print('AUC Score        :', tree_tuned_auc)

In [None]:
# flattening the 2x2 confusion matrix into its four counts
(tuned_tree_tn,
 tuned_tree_fp,
 tuned_tree_fn,
 tuned_tree_tp) = confusion_matrix(y_pred = tree_tuned_pred, y_true = y_test).ravel()


# reporting each count on its own line
print(f"\nTrue Negatives : {tuned_tree_tn}\n"
      f"False Positives: {tuned_tree_fp}\n"
      f"False Negatives: {tuned_tree_fn}\n"
      f"True Positives : {tuned_tree_tp}\n")

In [None]:
# declaring model performance objects
tree_train_acc = tree_tuned.score(x_train, y_train).round(4)
tree_test_acc  = tree_tuned.score(x_test, y_test).round(4)
tree_auc       = roc_auc_score(y_true  = y_test,
                               y_score = tree_tuned_pred).round(4)


# adding the Tuned Tree row to model_performance
# (DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
#  row is wrapped in a one-row DataFrame and concatenated instead)
model_performance = pd.concat(
                          [model_performance,
                           pd.DataFrame([{'Model Name'        : 'Tuned Tree',
                                          'Training Accuracy' : tree_train_acc,
                                          'Testing Accuracy'  : tree_test_acc,
                                          'AUC Score'         : tree_auc,
                                          'Confusion Matrix'  : (tuned_tree_tn,
                                                                 tuned_tree_fp,
                                                                 tuned_tree_fn,
                                                                 tuned_tree_tp)}])],
                          ignore_index = True)


<h2>Ensemble Modeling</h2>

In [None]:
# explanatory data: the columns listed under candidate_dict['logit_full']
got_data_full   = got.loc[:, candidate_dict['logit_full']]

# response variable
got_target_full = got.loc[:, 'isAlive']


# stratified train/test split holding out 10% of the rows for testing
(x_train_full,
 x_test_full,
 y_train_full,
 y_test_full) = train_test_split(got_data_full,
                                 got_target_full,
                                 test_size    = 0.10,
                                 stratify     = got_target_full,
                                 random_state = 219)

In [None]:
# INSTANTIATING a random forest model with default values
rf_default = RandomForestClassifier(
    random_state     = 219,
    n_estimators     = 100,     # number of trees in the forest
    criterion        = 'gini',  # measure of the quality of a split
    max_depth        = None,    # depth of each tree
    min_samples_leaf = 1,       # minimum samples required at a leaf node
    bootstrap        = True,    # whether bootstrap is used when building trees
    warm_start       = False,   # fit a whole new model
)

In [None]:
# FITTING the default random forest on the training split
rf_default_fit = rf_default.fit(x_train_full, y_train_full)


# PREDICTING classes for the testing split
rf_default_fit_pred = rf_default_fit.predict(x_test_full)


# SCORING the results: accuracy on both splits
print('Training ACCURACY:',
      rf_default_fit.score(x_train_full, y_train_full).round(4))
print('Testing  ACCURACY:',
      rf_default_fit.score(x_test_full, y_test_full).round(4))


# printing the AUC score computed from the class predictions
print('AUC Score        :',
      roc_auc_score(y_score = rf_default_fit_pred,
                    y_true  = y_test_full).round(4))

In [None]:
# plotting feature importances
#plot_feature_importances_full(rf_default_fit,
#                         train = x_train_full,
#                         export = False)

<strong>Comments on Feature Importance</strong>

- <em>Popularity</em> and <em>age</em> are the most important features in terms of splitting the data into nodes.

- <em>book4_A_Feast_For_Crows</em> is the next most important feature in terms of splitting the data into nodes.

In [None]:
# flattening the 2x2 confusion matrix into its four counts
(rf_tn,
 rf_fp,
 rf_fn,
 rf_tp) = confusion_matrix(y_pred = rf_default_fit_pred, y_true = y_test_full).ravel()


# reporting each count on its own line
print(f"\nTrue Negatives : {rf_tn}\n"
      f"False Positives: {rf_fp}\n"
      f"False Negatives: {rf_fn}\n"
      f"True Positives : {rf_tp}\n")

In [None]:
# declaring model performance objects
rf_train_acc = rf_default_fit.score(x_train_full, y_train_full).round(4)
rf_test_acc  = rf_default_fit.score(x_test_full, y_test_full).round(4)
rf_auc       = roc_auc_score(y_true  = y_test_full,
                             y_score = rf_default_fit_pred).round(4)


# adding the Random Forest (Full) row to model_performance
# (DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
#  row is wrapped in a one-row DataFrame and concatenated instead)
model_performance = pd.concat(
                          [model_performance,
                           pd.DataFrame([{'Model Name'        : 'Random Forest (Full)',
                                          'Training Accuracy' : rf_train_acc,
                                          'Testing Accuracy'  : rf_test_acc,
                                          'AUC Score'         : rf_auc,
                                          'Confusion Matrix'  : (rf_tn,
                                                                 rf_fp,
                                                                 rf_fn,
                                                                 rf_tp)}])],
                          ignore_index = True)

In [None]:
# FITTING the training data
#rf_default_fit = rf_default.fit(x_train_full, y_train_full)


# PREDICTING based on the testing set
#rf_default_fit_pred = rf_default_fit.predict(x_test_full)


# declaring a hyperparameter space
#estimator_range  = np.arange(100, 1100, 250)
#leaf_range       = np.arange(1, 31, 10)
#criterion_range  = ['gini', 'entropy']
#bootstrap_range  = [True, False]
#warm_start_range = [True, False]


# creating a hyperparameter grid
#param_grid = {'n_estimators'     : estimator_range,
#              'min_samples_leaf' : leaf_range,
#              'criterion'        : criterion_range,
#              'bootstrap'        : bootstrap_range,
#              'warm_start'       : warm_start_range}


# INSTANTIATING the model object without hyperparameters
#forest_grid = RandomForestClassifier(random_state = 219)


# RandomizedSearchCV object
#forest_cv = RandomizedSearchCV(estimator           = forest_grid,
#                               param_distributions = param_grid,
#                               cv         = 3,
#                               n_iter     = 1000,
#                               scoring    = make_scorer(roc_auc_score,
#                                            needs_threshold = False))


# FITTING to the FULL DATASET (due to cross-validation)
#forest_cv.fit(got_data_full, got_target_full)


# PREDICT step is not needed


# printing the optimal parameters and best score
#print("Tuned Parameters  :", forest_cv.best_params_)
#print("Tuned Training AUC:", forest_cv.best_score_.round(4))

<strong>Results from the previous code:</strong> 
<br>

- Tuned Parameters  : {'warm_start': True, 'n_estimators': 100, 'min_samples_leaf': 1, 'criterion': 'entropy', 'bootstrap': False}
- Tuned Training AUC: 0.675

In [None]:
# best estimators based on RandomizedSearchCV
#forest_cv.best_estimator_

<strong>Results from the previous code:</strong> 
<br>

- RandomForestClassifier(bootstrap=False, criterion='entropy', random_state=219, warm_start=True)

In [None]:
# building a model based on hyperparameter tuning results

# INSTANTIATING with the tuned hyperparameters
forest_tuned = RandomForestClassifier(criterion        = 'entropy',
                                      min_samples_leaf = 1,
                                      n_estimators     = 100,
                                      warm_start       = True,
                                      bootstrap        = False,
                                      random_state     = 219)


# FITTING on the training split only. x_test_full is split out of
# got_data_full, so fitting on the full dataset would let the forest see
# the testing rows and inflate the testing accuracy / AUC reported below.
forest_tuned_fit = forest_tuned.fit(x_train_full, y_train_full)


# PREDICTING based on the testing set
forest_tuned_pred = forest_tuned_fit.predict(x_test_full)


# saving scoring data for future use (computed once, reused when printing)
forest_tuned_train_score = forest_tuned.score(x_train_full, y_train_full).round(4) # accuracy
forest_tuned_test_score  = forest_tuned.score(x_test_full, y_test_full).round(4)   # accuracy


# saving the AUC score (computed from the class predictions)
forest_tuned_auc = roc_auc_score(y_true  = y_test_full,
                                 y_score = forest_tuned_pred).round(4) # auc


# SCORING the results
print('Forest Tuned Training ACCURACY:', forest_tuned_train_score)
print('Forest Tuned Testing  ACCURACY:', forest_tuned_test_score)
print('Forest Tuned AUC Score        :', forest_tuned_auc)

In [None]:
# feature importance plot for the tuned forest (export switched off)
plot_feature_importances_full(forest_tuned_fit, export = False, train = x_train_full)

In [None]:
# flattening the 2x2 confusion matrix into its four counts
(tuned_rf_tn,
 tuned_rf_fp,
 tuned_rf_fn,
 tuned_rf_tp) = confusion_matrix(y_pred = forest_tuned_pred, y_true = y_test_full).ravel()


# reporting each count on its own line
print(f"\nTrue Negatives : {tuned_rf_tn}\n"
      f"False Positives: {tuned_rf_fp}\n"
      f"False Negatives: {tuned_rf_fn}\n"
      f"True Positives : {tuned_rf_tp}\n")

In [None]:
# declaring model performance objects
tuned_rf_train_acc = forest_tuned_fit.score(x_train_full, y_train_full).round(4)
tuned_rf_test_acc  = forest_tuned_fit.score(x_test_full, y_test_full).round(4)
tuned_rf_auc       = roc_auc_score(y_true  = y_test_full,
                                   y_score = forest_tuned_pred).round(4)


# adding the Tuned Random Forest (Full) row to model_performance
# (DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
#  row is wrapped in a one-row DataFrame and concatenated instead)
model_performance = pd.concat(
                          [model_performance,
                           pd.DataFrame([{'Model Name'        : 'Tuned Random Forest (Full)',
                                          'Training Accuracy' : tuned_rf_train_acc,
                                          'Testing Accuracy'  : tuned_rf_test_acc,
                                          'AUC Score'         : tuned_rf_auc,
                                          'Confusion Matrix'  : (tuned_rf_tn,
                                                                 tuned_rf_fp,
                                                                 tuned_rf_fn,
                                                                 tuned_rf_tp)}])],
                          ignore_index = True)

<h2>Gradient Boosted Machines</h2>

In [None]:
# INSTANTIATING a gradient boosted model with explicitly listed values
# The loss argument is intentionally left at the library default: the
# 'deviance' spelling was deprecated in scikit-learn 1.1 and removed in 1.3
# (renamed 'log_loss'), while the default selects that same loss on both
# old and new versions.
full_gbm_default = GradientBoostingClassifier(learning_rate = 0.1,
                                              n_estimators  = 100,
                                              criterion     = 'friedman_mse',
                                              max_depth     = 3,
                                              warm_start    = False,
                                              random_state  = 219)


# FIT step is needed as we are not using .best_estimator
full_gbm_default_fit = full_gbm_default.fit(x_train_full, y_train_full)


# PREDICTING based on the testing set
full_gbm_default_pred = full_gbm_default_fit.predict(x_test_full)


# SCORING the results
print('Training ACCURACY:', full_gbm_default_fit.score(x_train_full, y_train_full).round(4))
print('Testing ACCURACY :', full_gbm_default_fit.score(x_test_full, y_test_full).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test_full,
                                          y_score = full_gbm_default_pred).round(4))

In [None]:
# flattening the 2x2 confusion matrix into its four counts
(gbm_default_tn,
 gbm_default_fp,
 gbm_default_fn,
 gbm_default_tp) = confusion_matrix(y_pred = full_gbm_default_pred, y_true = y_test_full).ravel()


# reporting each count on its own line
print(f"\nTrue Negatives : {gbm_default_tn}\n"
      f"False Positives: {gbm_default_fp}\n"
      f"False Negatives: {gbm_default_fn}\n"
      f"True Positives : {gbm_default_tp}\n")

In [None]:
# declaring model performance objects
gbm_train_acc = full_gbm_default_fit.score(x_train_full, y_train_full).round(4)
gbm_test_acc  = full_gbm_default_fit.score(x_test_full, y_test_full).round(4)
gbm_auc       = roc_auc_score(y_true  = y_test_full,
                              y_score = full_gbm_default_pred).round(4)


# adding the GBM (Full) row to model_performance
# (DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
#  row is wrapped in a one-row DataFrame and concatenated instead)
model_performance = pd.concat(
                          [model_performance,
                           pd.DataFrame([{'Model Name'        : 'GBM (Full)',
                                          'Training Accuracy' : gbm_train_acc,
                                          'Testing Accuracy'  : gbm_test_acc,
                                          'AUC Score'         : gbm_auc,
                                          'Confusion Matrix'  : (gbm_default_tn,
                                                                 gbm_default_fp,
                                                                 gbm_default_fn,
                                                                 gbm_default_tp)}])],
                          ignore_index = True)

In [None]:
# declaring a hyperparameter space
#learn_range        = np.arange(0.1, 2.2, 0.5)
#estimator_range    = np.arange(100, 501, 25)
#depth_range        = np.arange(2, 11, 2)
#warm_start_range   = [True, False]

# creating a hyperparameter grid
#param_grid = {'learning_rate' : learn_range,
#              'max_depth'     : depth_range,
#              'n_estimators'  : estimator_range,
#              'warm_start'    : warm_start_range}


# INSTANTIATING the model object without hyperparameters
#full_gbm_grid = GradientBoostingClassifier(random_state = 219)


# RandomizedSearchCV object
#full_gbm_cv = RandomizedSearchCV(estimator     = full_gbm_grid,
#                           param_distributions = param_grid,
#                           cv                  = 3,
#                           n_iter              = 500,
#                           random_state        = 219,
#                           scoring             = make_scorer(roc_auc_score,
#                                                 needs_threshold = False))


# FITTING to the FULL DATASET (due to cross-validation)
#full_gbm_cv.fit(got_data_full, got_target_full)


# PREDICT step is not needed


# printing the optimal parameters and best score
#print("Tuned Parameters  :", full_gbm_cv.best_params_)
#print("Tuned Training AUC:", full_gbm_cv.best_score_.round(4))

<strong>Results from the previous code:</strong> 
<br>

- Tuned Parameters  : {'warm_start': False, 'n_estimators': 325, 'max_depth': 6, 'learning_rate': 0.1}
- Tuned Training AUC: 0.6796

In [None]:
# checking the best estimator for the model
#full_gbm_cv.best_estimator_

<strong>Results from the previous code:</strong> 
<br>

- GradientBoostingClassifier(max_depth=6, n_estimators=325, random_state=219)

In [None]:
# INSTANTIATING with the tuned hyperparameters
gbm_tuned = GradientBoostingClassifier(learning_rate = 0.1,
                                       max_depth     = 6,
                                       n_estimators  = 325,
                                       warm_start    = False,
                                       random_state  = 219)


# FITTING on the training split only. x_test_full is split out of
# got_data_full, so fitting on the full dataset would let the model see
# the testing rows and inflate the testing accuracy / AUC reported below.
gbm_tuned_fit = gbm_tuned.fit(x_train_full, y_train_full)


# PREDICTING based on the testing set
gbm_tuned_pred = gbm_tuned_fit.predict(x_test_full)


# SCORING the results
print('Training ACCURACY:', gbm_tuned_fit.score(x_train_full, y_train_full).round(4))
print('Testing  ACCURACY:', gbm_tuned_fit.score(x_test_full, y_test_full).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test_full,
                                          y_score = gbm_tuned_pred).round(4))

In [None]:
# flattening the 2x2 confusion matrix into its four counts
(gbm_tuned_tn,
 gbm_tuned_fp,
 gbm_tuned_fn,
 gbm_tuned_tp) = confusion_matrix(y_pred = gbm_tuned_pred, y_true = y_test_full).ravel()


# reporting each count on its own line
print(f"\nTrue Negatives : {gbm_tuned_tn}\n"
      f"False Positives: {gbm_tuned_fp}\n"
      f"False Negatives: {gbm_tuned_fn}\n"
      f"True Positives : {gbm_tuned_tp}\n")

In [None]:
# declaring model performance objects
gbm_train_acc = gbm_tuned_fit.score(x_train_full, y_train_full).round(4)
gbm_test_acc  = gbm_tuned_fit.score(x_test_full, y_test_full).round(4)
gbm_auc       = roc_auc_score(y_true  = y_test_full,
                              y_score = gbm_tuned_pred).round(4)


# adding the Tuned GBM row to model_performance
# (DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
#  row is wrapped in a one-row DataFrame and concatenated instead)
model_performance = pd.concat(
                          [model_performance,
                           pd.DataFrame([{'Model Name'        : 'Tuned GBM',
                                          'Training Accuracy' : gbm_train_acc,
                                          'Testing Accuracy'  : gbm_test_acc,
                                          'AUC Score'         : gbm_auc,
                                          'Confusion Matrix'  : (gbm_tuned_tn,
                                                                 gbm_tuned_fp,
                                                                 gbm_tuned_fn,
                                                                 gbm_tuned_tp)}])],
                          ignore_index = True)

In [None]:
# displaying the candidate models, highest AUC score first
model_performance.sort_values('AUC Score', ascending = False)

In [None]:
# user-defined function displaying the final model results
def print_final_model(train_acc, test_acc, auc_score, tn, fp, fn, tp,
                      model_name = 'Tuned GBM'):
    """
    Prints the final model's scores and its confusion matrix laid out as a
    2x2 grid, followed by a 'Candidate Models:' heading.

    PARAMETERS
    ----------
    train_acc  : final model training accuracy.
    test_acc   : final model testing accuracy.
    auc_score  : final model AUC score.
    tn         : true negatives  (predicted dead,  actually dead).
    fp         : false positives (predicted alive, actually dead).
    fn         : false negatives (predicted dead,  actually alive).
    tp         : true positives  (predicted alive, actually alive).
    model_name : label shown in the 'Model' column | default 'Tuned GBM'.

    RETURNS
    -------
    None; the report is written to standard output.
    """

    # printing final model results
    # (the name is left-aligned in 16 characters so the default label
    #  keeps the original column alignment)
    print(f"""
  Final Model:

  Model        Train Score      Test Score      AUC Score
  -----        -----------      ----------      ----------
  {model_name:<16}{train_acc}          {test_acc}          {auc_score}

                                                 |
  True Negatives:          {tn}                    |  False Positives:         {fp}                   
                                                 |
  PREDICTED: Dead         (isAlive=0)            |  PREDICTED: Alive        (isAlive=1)
  ACTUAL:    Dead         (isAlive=0)            |  ACTUAL:    Dead         (isAlive=0)
                                                 |
-------------------------------------------------|-----------------------------------------------
                                                 |
  False Negatives:           {fn}                   |  True Positives:         {tp}
                                                 |  
  PREDICTED: Dead         (isAlive=0)            |  PREDICTED: Alive        (isAlive=1)
  ACTUAL:    Alive        (isAlive=1)            |  ACTUAL:    Alive        (isAlive=1)
                                                 |  

Candidate Models:
""")

In [None]:
# calling the function with the tuned GBM metrics so the numbers match the
# 'Tuned GBM' label that print_final_model prints (the KNN metrics were
# being passed before, which mislabelled the report)
print_final_model(gbm_train_acc, gbm_test_acc, gbm_auc,
                  gbm_tuned_tn, gbm_tuned_fp, gbm_tuned_fn, gbm_tuned_tp)

# displaying all candidate models, highest AUC score first
model_performance.sort_values(by = 'AUC Score',
                              ascending = False)