<h2>Part I: Preparation and Exploration</h2>
<br><h3>a) Imports and Loading the Dataset</h3><br>
    Import packages and load the 'GOT_character_predictions.xlsx' dataset into Python as <strong>GOT</strong> (from the <em>Downloads</em> folder).

In [None]:
# installing gender_guesser
# %pip install gender_guesser                          

In [None]:
# importing libraries
import numpy             as np                       # mathematical essentials
import pandas            as pd                       # data science essentials
import matplotlib.pyplot as plt                      # data visualization
import seaborn           as sns                      # enhanced data viz
import statsmodels.formula.api as smf                # logistic regression
import gender_guesser.detector as gender             # guess gender based on (given) name
from sklearn.model_selection import train_test_split # train-test split
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.metrics import confusion_matrix         # confusion matrix
from sklearn.metrics import roc_auc_score            # auc score
from sklearn.neighbors import KNeighborsClassifier   # KNN for classification
from sklearn.neighbors import KNeighborsRegressor    # KNN for regression
from sklearn.preprocessing import StandardScaler     # standard scaler
from sklearn.tree import DecisionTreeClassifier      # classification trees
from sklearn.tree import plot_tree                   # tree plots

In [None]:
# reading the character dataset from the working directory
GOT = pd.read_excel('./GOT_character_predictions.xlsx')


# widening pandas display limits so wide frames print without truncation
for option_name, option_value in (('display.max_rows',     500),
                                  ('display.max_columns',  500),
                                  ('display.width',        1000),
                                  ('display.max_colwidth', 100)):
    pd.set_option(option_name, option_value)


# previewing the first five rows
GOT.head(n = 5)

### b. User-Defined Functions

In [None]:
# user-defined functions

#########################
# mv_flagger
#########################
def mv_flagger(df):
    """
Adds a 0/1 missing-value indicator column named 'm_COLUMN_NAME' for every
column that contains at least one missing value.

PARAMETERS
----------
df : DataFrame to flag missing values (modified in place)


RETURNS
-------
The same DataFrame object, with the indicator columns appended."""

    # snapshotting the labels so the new flag columns are not visited
    for column in list(df.columns):

        missing_mask = df[column].isnull()

        # only columns with at least one gap receive a flag
        if missing_mask.any():
            df['m_' + column] = missing_mask.astype(int)

    return df



#########################
# text_split_feature
#########################
def text_split_feature(col, df, sep=' ', new_col_name='number_of_names'):
    """
Splits each value of a string column and stores the number of resulting
pieces in a new column of the same DataFrame.

PARAMETERS
----------
col          : column to split (every value must be a string; a non-string
               value such as NaN raises AttributeError)
df           : DataFrame where column is located (modified in place)
sep          : string sequence to split by, default ' '
new_col_name : name of new column after summing split, default
               'number_of_names'

RETURNS
-------
None. The count column is written to df[new_col_name].
"""

    # honoring the caller's separator (it was previously ignored in favor of
    # a hard-coded ' ') and counting in one pass rather than row-by-row .loc
    # writes, which also keeps the result correct for duplicate index labels
    piece_counts = [len(text.split(sep)) for text in df[col]]

    df[new_col_name] = pd.Series(piece_counts,
                                 index = df.index,
                                 dtype = 'int64')
        

########################################
# optimal_neighbors
########################################
def optimal_neighbors(x_data,
                      y_data,
                      standardize = True,
                      pct_test=0.10,
                      seed=219,
                      response_type='reg',
                      max_neighbors=20,
                      show_viz=True):    
    """
Exhaustively compute training and testing results for KNN across
[1, max_neighbors]. Prints and returns the number of neighbors with the
highest test score and (by default) displays a visualization of the results.

PARAMETERS
----------
x_data        : explanatory variable data
y_data        : response variable
standardize   : whether or not to standardize the x data, default True
                (the scaler is fit on all of x_data, before the split)
pct_test      : test size for training and validation from (0,1), default 0.10
seed          : random seed to be used in algorithm, default 219
response_type : type of neighbors algorithm to use, default 'reg'
    Use 'reg' for regression (KNeighborsRegressor)
    Use 'class' for classification (KNeighborsClassifier)
max_neighbors : maximum number of neighbors in exhaustive search, default 20
show_viz      : display or suppress k-neighbors visualization, default True

RETURNS
-------
int : number of neighbors with the highest test score (the smallest such
      value when several tie)

RAISES
------
ValueError : if response_type is not 'reg' or 'class', or if
             max_neighbors is less than 1
"""      
    # validating before any work is done; an unknown response_type used to
    # print a message and then fail on an undefined (or stale) model object
    if response_type not in ('reg', 'class'):
        raise ValueError(
            f"response_type must be 'reg' or 'class', got {response_type!r}")

    if max_neighbors < 1:
        raise ValueError("max_neighbors must be at least 1")


    if standardize:
        # optionally standardizing x_data
        scaler = StandardScaler()
        scaler.fit(x_data)
        x_data = pd.DataFrame(scaler.transform(x_data))


    # train-test split
    x_train, x_test, y_train, y_test = train_test_split(x_data,
                                                        y_data,
                                                        test_size = pct_test,
                                                        random_state = seed)


    # choosing the estimator class once, outside the search loop
    if response_type == 'reg':
        knn_model = KNeighborsRegressor
    else:
        knn_model = KNeighborsClassifier


    # creating lists for training set accuracy and test set accuracy
    training_accuracy = []
    test_accuracy = []


    # setting neighbor range
    neighbors_settings = range(1, max_neighbors + 1)


    for n_neighbors in neighbors_settings:
        # building the model for this neighborhood size
        clf = knn_model(n_neighbors = n_neighbors)
        clf.fit(x_train, y_train)

        # recording the training set accuracy
        training_accuracy.append(clf.score(x_train, y_train))

        # recording the generalization accuracy
        test_accuracy.append(clf.score(x_test, y_test))


    # optionally displaying visualization
    if show_viz:
        # plotting the visualization
        plt.subplots(figsize=(12,8))
        plt.plot(neighbors_settings, training_accuracy, label = "training accuracy")
        plt.plot(neighbors_settings, test_accuracy, label = "test accuracy")
        plt.ylabel("Accuracy")
        plt.xlabel("n_neighbors")
        plt.legend()
        plt.show()


    # list positions are zero-based while neighbor counts start at 1
    best_n_neighbors = test_accuracy.index(max(test_accuracy)) + 1

    # returning optimal number of neighbors
    print(f"The optimal number of neighbors is: {best_n_neighbors}")
    return best_n_neighbors


########################################
# visual_cm
########################################
def visual_cm(true_y, pred_y, labels = None):
    """
Draws the confusion matrix of a classifier as an annotated heatmap.

PARAMETERS
----------
true_y : true values for the response variable
pred_y : predicted values for the response variable
labels : tick labels passed to both heatmap axes, default None
    """
    # computing the confusion matrix from actual vs. predicted values
    matrix = confusion_matrix(y_true = true_y, y_pred = pred_y)


    # rendering the counts; the same tick labels are used on both axes
    sns.heatmap(matrix,
                annot       = True,
                xticklabels = labels,
                yticklabels = labels,
                cmap        = 'Blues',
                fmt         = 'g')


    # axis captions and title
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix of the Classifier')
    plt.show()

### c. Loading GOT Data Dictionary

In [None]:
# pulling up data dictionary
# (read from the working directory)
GOT_description = pd.read_excel('./GOT_data_dictionary.xlsx')
    

# displaying the data dictionary
GOT_description

## Part II - Dataset Anomalies

### a. Missing value detection

In [None]:
# Check null value count
GOT.isnull().sum(axis = 0)

In [None]:
# information about each variable
GOT.info()

### b. Flagging missing values

In [None]:
# running the mv_flagger function
# (adds a 0/1 'm_<column>' indicator for every column containing missing
#  values; GOT is modified in place and the same object is returned)
GOT = mv_flagger(df = GOT)

# checking results
GOT.columns

###  c. Develop missing value and categorical encoding strategies

In [None]:
# name              - (discrete) feature out, drop original    
# title             - (categorical) impute with 'Unknown', then one-hot encode, drop original
# gender_guess      - (categorical) impute with 'Unknown', then one-hot encode, drop original
# culture           - (categorical) impute with 'Unknown', then one-hot encode, drop original
# dateOfBirth       - drop as it is correlated with age
# mother            - (categorical) drop (very dirty; insufficient domain knowledge)
# father            - (categorical) drop (very dirty; insufficient domain knowledge)
# heir              - (categorical) drop (very dirty; insufficient domain knowledge)
# house             - (discrete) feature out, drop original  
# spouse            - (categorical) drop (very dirty; insufficient domain knowledge)
# isAliveMother     - (categorical) drop (very dirty; insufficient domain knowledge)
# isAliveFather     - (categorical) drop (very dirty; insufficient domain knowledge)
# isAliveHeir       - (categorical) drop (very dirty; insufficient domain knowledge)
# isAliveSpouse     - (categorical) drop (very dirty; insufficient domain knowledge)
# isMarried         - (categorical) keep original
# isNoble           - (categorical) keep original
# age               - (continuous) impute with the median (chosen because the column contains negative values), computed per gender_guess group
# numDeadRelations  - (continuous) keep original
# popularity        - (continuous) keep original


### d. Gender guesser

#### Below is the actual gender guess code. Commented as it consumes time.

In [None]:
# # guessing gender based on (given) name

# # placeholder list
# placeholder_lst = []


# # looping to guess gender
# for name in GOT['name']:
#     name.split()
#     first_name = name.split()[0]
# #    print(first_name)
#     guess = gender.Detector().get_gender(first_name)
# #    print(guess)
#     placeholder_lst.append(guess)

# # converting list into a series
# GOT['gender_guess'] = pd.Series(placeholder_lst)


# # checking results
# GOT.head(n = 5)

# # checking results
# GOT['gender_guess'].value_counts(normalize = False,
#                                  sort      = True,
#                                  ascending = False)


#### Below is the hardcoded gender guess code. This helps save time. 
##### This field is used to fill the Null values for age

In [None]:
# guessing gender based on (given) name

# hardcoded first names per guess, listed in priority order
gender_groups = (
    ('andy', ['Addison', 'Red', 'Rusty', 'Tal', 'Aubrey', 'Dorren', 'Moon']),

    ('mostly_male', ['Will', 'Robin', 'Rowan', 'Sam', 'Courtenay', 'Dale',
                     'Kirby', 'Long', 'Ellery', 'Bryce', 'Devan']),

    ('mostly_female', ['Young', 'Sky', 'Alyn', 'Cass', 'Kyle', 'Nan',
                       'Gilly', 'Lyn', 'Val']),

    ('female', ['Sylva', 'Willow', 'Wenda', 'Randa', 'Rhonda', 'Rosamund',
                'Rus', 'Serra', 'Shella', 'Talea', 'Violet', 'Walda',
                'Alia', 'Alyce', 'Alys', 'Alyssa', 'Amabel', 'Barbara',
                'Becca', 'Bella', 'Bess', 'Bethany', 'Dalla', 'Darla',
                'Del', 'Denyse', 'Donella', 'Dorcas', 'Eden', 'Eglantine',
                'Elza', 'Emma', 'Euron', 'Fern', 'Ferny', 'Hali', 'Helly',
                'Jayde', 'Jayne', 'Jocelyn', 'Joanna', 'Johanna', 'Kym',
                'Leana', 'Leona', 'Leyla', 'Lia', 'Liane', 'Lysa', 'Maddy',
                'Maggy', 'Marei', 'Maris', 'Myrtle', 'Nella', 'Penny',
                'Alla', 'Anya', 'Beren', 'Beth', 'Corliss', 'Deana',
                'Della', 'Eleanor', 'Elinor', 'Janna', 'Joy', 'Kyra',
                'Leonella', 'Lyra', 'Margot', 'Marianne', 'Mariya',
                'Marissa', 'Meredyth', 'Mina', 'Munda', 'Myrielle', 'Rhea',
                'Aurane', 'Danelle', 'Marya', 'Masha', 'Meera', 'Melissa',
                'Mya', 'Pia', 'Aeron', 'Asha', 'Barba', 'Elia', 'Genna',
                'Harma', 'Holly', 'Myranda', 'Shireen', 'Robyn', 'Arianne',
                'Meg']),

    ('male', ['Wilbert', 'Willem', 'Willis', 'Joffrey', 'Quentin',
              'Raymond', 'Raymund', 'Richard', 'Rob', 'Rickard', 'Robb',
              'Robert', 'Roger', 'Roland', 'Rolland', 'Ronald', 'Rupert',
              'Simon', 'Stafford', 'Stone', 'Symon', 'Terrance', 'Theo',
              'Theobald', 'Tim', 'Timon', 'Tom', 'Tristan', 'Ulf', 'Umar',
              'Victor', 'Walton', 'Wendell', 'Alaric', 'Andrey', 'Armen',
              'Arron', 'Ben', 'Benedict', 'Bennet', 'Bertram', 'Blane',
              'Boy', 'Brandon', 'Bryan', 'Buford', 'Byron', 'Cedric',
              'Clarence', 'Clement', 'Clifford', 'Conn', 'Damon', 'Dan',
              'Dennis', 'Denys', 'Dick', 'Dudley', 'Duncan', 'Edmund',
              'Egon', 'Eldred', 'Elyas', 'Erik', 'Gage', 'Galt', 'Gareth',
              'Garrett', 'Garth', 'Gavin', 'Gerold', 'Glendon', 'Griffin',
              'Harbert', 'Hamish', 'Harlan', 'Harry', 'Henk', 'Herbert',
              'Hendry', 'Hilmar', 'Hod', 'Holger', 'Hugh', 'Jaime',
              'Jasper', 'Jon', 'Joss', 'Josua', 'Leo', 'Lester', 'Lew',
              'Lewis', 'Lorimer', 'Lothar', 'Lucas', 'Luke', 'Lyonel',
              'Malcolm', 'Manfred', 'Martyn', 'Matt', 'Maynard', 'Michael',
              'Mortimer', 'Myles', 'Nail', 'Ned', 'Ormond', 'Owen', 'Peter',
              'Poul', 'Alesander', 'Allard', 'Allar', 'Ambrose', 'Andrew',
              'Armond', 'Arthur', 'Burton', 'Colin', 'Creighton', 'Dermot',
              'Desmond', 'Dirk', 'Dolf', 'Elbert', 'Elmar', 'Elwood',
              'Franklyn', 'Garrison', 'Gerald', 'Gilbert', 'Harlen', 'Hugo',
              'Igon', 'Jarl', 'Jason', 'Lambert', 'Lyman', 'Maron',
              'Mathis', 'Morton', 'Norbert', 'Omer', 'Orland', 'Osmund',
              'Otho', 'Philip', 'Quincy', 'Ralf', 'Raynard', 'Raynald',
              'Royce', 'Rufus', 'Selwyn', 'Terrence', 'Theodore', 'Titus',
              'Triston', 'Wallace', 'Wendel', 'William', 'Zachery', 'Zia',
              'Anders', 'Arnolf', 'Clayton', 'Cleon', 'Eldon', 'Ethan',
              'Harmen', 'Iggo', 'Jared', 'Jory', 'Lyle', 'Nestor', 'Otto',
              'Preston', 'Ramsay', 'Andrik', 'Amory', 'Archibald', 'Arlan',
              'Aron', 'Bowen', 'Cletus', 'Danny', 'Donal', 'Gregor',
              'Justin', 'Kevan', 'Leyton', 'Mark', 'Marlon', 'Wyman',
              'Rolph', 'Damion', 'Roslin', 'Tycho', 'Barra', 'Temmo',
              'Symeon']),
)


# flattening the groups into a single lookup table; setdefault keeps the
# guess of the earliest group should a name ever be listed twice
first_name_to_guess = {}

for guess, first_names in gender_groups:
    for first_name in first_names:
        first_name_to_guess.setdefault(first_name, guess)


# placeholder list: one guess per character, keyed on the first word of the
# name; names that are not in the lookup fall back to 'Unknown'
placeholder_lst = [first_name_to_guess.get(name.split()[0], 'Unknown')
                   for name in GOT['name']]


# assigning the list directly so guesses are matched to rows by position
# (a Series would be aligned on index labels instead)
GOT['gender_guess'] = placeholder_lst


# checking results
GOT.head(n = 5)

# checking results
GOT['gender_guess'].value_counts(normalize = False,
                                 sort      = True,
                                 ascending = False)


In [None]:
# Collapsing the finer-grained guesses into plain male / female labels
# (every other value, e.g. 'Unknown', is left as it is)
gender_label_map = {'male'          : "male",
                    'mostly_male'   : "male",
                    'andy'          : "male",
                    'female'        : "female",
                    'mostly_female' : "female"}

GOT["gender_guess"] = GOT["gender_guess"].replace(to_replace = gender_label_map)

In [None]:
# checking results
GOT['gender_guess'].value_counts(normalize = False,
                                 sort      = True,
                                 ascending = False)

In [None]:
# Creating dummies for gender guess field
dummies_gender_guess = pd.get_dummies(GOT.gender_guess, prefix='gg')

dummies_gender_guess.head(n = 5)

In [None]:
# Concatenate the gender dummies to the dataframe, keeping only 'gg_Unknown'
# and 'gg_male' so that at least one dummy level is left out
GOT = pd.concat([GOT, dummies_gender_guess[['gg_Unknown', 'gg_male']]], axis=1)

### e. Age Imputation

In [None]:
# Median ages (overall and per guessed gender), rounded to one decimal;
# these are the values used to impute the age field
Median_Age        = round(GOT['age'].median(), 1)
Median_Male_Age   = round(GOT.loc[GOT['gender_guess'] == "male", 'age'].median(), 1)
Median_Female_Age = round(GOT.loc[GOT['gender_guess'] == "female", 'age'].median(), 1)

# reporting the three medians
print(f"""
Median Age:                     {Median_Age}
Median Male Age:                {Median_Male_Age}
Median Female Age:              {Median_Female_Age}
 """)

In [None]:
# Pre-imputation check
GOT['age'].head(n=5)

In [None]:
# imputing missing values for age

# rows whose age is missing before any imputation takes place
age_is_missing = GOT['age'].isnull()


# each gender_guess group receives its own median; characters with an
# 'Unknown' guess receive the overall median
for gender_label, median_age in (("female",  Median_Female_Age),
                                 ("male",    Median_Male_Age),
                                 ("Unknown", Median_Age)):

    # boolean-mask assignment fills the whole group at once instead of
    # visiting the rows one by one
    GOT.loc[age_is_missing & (GOT['gender_guess'] == gender_label), 'age'] = median_age


# ensuring all missing values for age are taken care of
print(f"Remaining missing values for age: {GOT.loc[ :, 'age'].isnull().sum()}")

In [None]:
# post-imputation check
GOT['age'].head(n = 20)

In [None]:
# GOT features
GOT.columns

### Histogram to identify skewness in the data

In [None]:
# one histogram per candidate variable: (column in GOT, text used for the
# title and x-axis so that raw dataset labels are not shown)
for column, description in (('numDeadRelations', "number of Dead Relations"),
                            ('popularity',       "popularity")):

    # central tendency markers for judging skewness
    mean = GOT[column].mean()
    median = GOT[column].median()
    mode = GOT[column].mode()[0]  # first value when several modes exist

    # labelling each marker line so the legend is built from the artists
    plt.axvline(mean, color='r', linestyle='--', label='Mean')
    plt.axvline(median, color='g', linestyle='-', label='Median')
    plt.axvline(mode, color='b', linestyle='-', label='Mode')
    plt.legend()

    sns.histplot(data  = GOT,
                 x     = column,
                 kde   = True)


    # title and axis labels
    plt.title(label   = f"Original Distribution of {description}")
    plt.xlabel(xlabel = description)
    plt.ylabel(ylabel = "count")

    # displaying the histogram
    plt.show()



### Applying log to skewed variables

In [None]:
# log-transforming the skewed variables (requires numpy imported as np);
# the 0.001 offset keeps zero values finite under the logarithm
for skewed_feature in ('numDeadRelations', 'popularity'):
    GOT['log_' + skewed_feature] = np.log(GOT[skewed_feature] + 0.001)

### f. Culture feature cleaning

In [None]:
# Culture feature cleaning: canonical culture name -> spellings merged into it
culture_aliases = {
    "Andals"         : ['Andal'],
    "Asshai"         : ["Asshai'i"],
    "Astapor"        : ['Astapori'],
    "Braavos"        : ['Braavosi'],
    "Dornish"        : ['Dornishmen', 'Dorne'],
    "Wildlings"      : ['Wildling', 'First men', 'Free Folk', 'free folk', 'Free folk'],
    "Ghiscari"       : ['Ghiscaricari', 'Ghis'],
    "Ironborn"       : ['Ironmen'],
    "Lhazareen"      : ['Lhazarene'],
    "Lysene"         : ['Lyseni'],
    "Meereen"        : ['Meereenese'],
    "Myrish"         : ['Myr', 'Myrmen'],
    "Northmen"       : ['Northern mountain clans'],
    "Norvos"         : ['Norvoshi'],
    "Qarth"          : ['Qartheen'],
    "Reach"          : ['The Reach', 'Reachmen'],
    "Riverlands"     : ['Rivermen'],
    "Stormlander"    : ['Stormlands'],
    "Summer Islands" : ['Summer Islander', 'Summer Isles'],
    "Vale"           : ['Valemen', 'Vale mountain clans'],
    "Westermen"      : ['Westerman', 'Westerlands'],
}

# inverting into alias -> canonical so a single replace covers every spelling
alias_to_canonical = {alias: canonical
                      for canonical, aliases in culture_aliases.items()
                      for alias in aliases}

GOT["culture"] = GOT["culture"].replace(to_replace = alias_to_canonical)

# Fill NaN with 'Unknown'
GOT['culture'] = GOT['culture'].fillna('Unknown')

GOT['culture'].value_counts(normalize = False,
                          sort      = True,
                          ascending = False)

# create dummies for culture variable
dummies_culture = pd.get_dummies(GOT.culture, prefix='c')

In [None]:
# Include the big culture groups to the dataframe
# (only the seven dummy columns listed here are kept; the dummies of all
#  other cultures are not added)
GOT = pd.concat([GOT, dummies_culture[['c_Unknown', 'c_Northmen', 'c_Ironborn','c_Wildlings','c_Valyrian','c_Braavos','c_Dornish']]], axis=1)

### g. Developing number of names features

In [None]:
# calling text_split_feature
text_split_feature(col = 'name',
                   df  = GOT)


# checking results
GOT['number_of_names'].value_counts(normalize = False,
                                        sort      = False,
                                        ascending = False).sort_index()

GOT.head(5)


### h. Title feature imputation

In [None]:
# Fill NaN with 'Unknown'
GOT['title'] = GOT['title'].fillna('Unknown')

GOT['title'].value_counts(normalize = False,
                          sort      = True,
                          ascending = False)


In [None]:
#generate dummies for the title variable
dummies_title = pd.get_dummies(GOT.title, prefix='t')

dummies_title.head(5)

In [None]:
# Include the titles with highest numbers to the dataframe
# (only the six dummy columns listed here are kept; the dummies of all
#  other titles are not added)
GOT = pd.concat([GOT, dummies_title[['t_Unknown', 't_Ser','t_Maester','t_Lord','t_Archmaester','t_Septon']]], axis=1)

### i. House feature imputation

In [None]:
# Fill NaN with 'Unknown'
GOT['house'] = GOT['house'].fillna('Unknown')

GOT['house'].value_counts(normalize = False,
                          sort      = True,
                          ascending = False)


In [None]:
#generate dummies for the house variable
dummies_house = pd.get_dummies(GOT.house, prefix='h')

dummies_house.head(5)

In [None]:
# Include the houses with highest numbers to the dataframe
# (only the five dummy columns listed here are kept; the dummies of all
#  other houses are not added)
GOT = pd.concat([GOT, dummies_house[['h_Unknown', "h_Night's Watch",'h_House Frey','h_House Stark','h_House Targaryen']]], axis=1)                       

### j. Missing value verification

In [None]:
# Check null value count
GOT.isnull().sum()


### k. Interactions

In [None]:
# short book tag -> book appearance column in GOT
book_columns = {'book1' : 'book1_A_Game_Of_Thrones',
                'book2' : 'book2_A_Clash_Of_Kings',
                'book3' : 'book3_A_Storm_Of_Swords',
                'book4' : 'book4_A_Feast_For_Crows',
                'book5' : 'book5_A_Dance_with_Dragons'}


# age x popularity (uses the untransformed popularity column)
GOT['interaction_age_popularity'] = GOT['age'] * GOT['popularity']


# book appearance x log popularity
for book, book_col in book_columns.items():
    GOT[f'interaction_{book}_popularity'] = GOT[book_col] * GOT['log_popularity']


# book appearance x log number of dead relations
for book, book_col in book_columns.items():
    GOT[f'interaction_{book}_log_numDeadRel'] = GOT[book_col] * GOT['log_numDeadRelations']


# guessed-male flag x log number of dead relations
GOT['interaction_male_log_numDeadRel'] = GOT['gg_male'] * GOT['log_numDeadRelations']


# Northmen culture flag x log popularity
GOT['interaction_c_Northmen_popularity'] = GOT['c_Northmen'] * GOT['log_popularity']


# three-way terms: each (book x log popularity) interaction multiplied by a
# third factor; created suffix by suffix to keep the column order unchanged
for suffix, third_factor in (('ddrela',  'log_numDeadRelations'),
                             ('married', 'isMarried'),
                             ('male',    'gg_male')):

    for book in book_columns:
        GOT[f'interaction_{book}_popularity_{suffix}'] = GOT[f'interaction_{book}_popularity'] * GOT[third_factor]

### l. Final columns after imputation and one-hot encoding

In [None]:
GOT.columns

### m. Dropping additional features

In [None]:
# columns removed before modelling, grouped by the reason for removal
columns_to_drop = (

    # text columns that have been encoded, plus the raw versions of the two
    # log-transformed variables
    ['name', 'title', 'culture',
     'house', 'gender_guess', 'popularity', 'numDeadRelations'],

    # 'S.No' and the family-related columns with insufficient data
    # (domain knowledge)
    ['S.No', 'mother', 'father', 'spouse',
     'heir', 'isAliveMother', 'isAliveFather', 'isAliveHeir', 'isAliveSpouse'],

    # missing-value flags of title, culture, house and age
    ['m_title', 'm_culture', 'm_house', 'm_age'],
)

# dropping one group at a time
for column_group in columns_to_drop:
    GOT = GOT.drop(columns = column_group)

# checking the results
GOT.columns

<h2>Part III - Logistic Regression</h2><br>
 
### a. Correlation Analysis
Correlations between the response variable and the explanatory variables.

In [None]:
df_corr = GOT.corr(method = 'pearson').round(decimals = 2)

df_corr['isAlive'].sort_values(ascending = False)

The original balance between those who are alive and those who are not alive in Game of thrones.

In [None]:
GOT.loc[ : ,'isAlive'].value_counts(normalize = True).round(decimals = 2)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h3>b. Preparing Explanatory and Response Data</h3>
Declare the explanatory variables as <strong>GOT_data</strong> and the response variable (isAlive) as <strong>GOT_target</strong>.

In [None]:
# declaring explanatory variables
# removing dateofbirth as it is highly correlated with age
# (isAlive is excluded as well because it is the response variable)
GOT_data = GOT.drop(['isAlive', 'dateOfBirth'], axis = 1)


# declaring response variable
GOT_target = GOT.loc[: ,'isAlive']

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h3>c. Prepare train-test split for statsmodels.</h3>
The stratify argument helps preserve the balance of the response variable in both the training and testing sets.

In [None]:
# train-test split with stratification
# (10% of the rows are held out for testing; the fixed random_state makes
#  the split reproducible)
x_train, x_test, y_train, y_test = train_test_split(
            GOT_data,
            GOT_target,
            test_size    = 0.10,
            random_state = 219,
            stratify     = GOT_target) # preserving balance


# merging training data for statsmodels
# (column-wise, so the response sits next to the explanatory variables)
GOT_train = pd.concat([x_train, y_train], axis = 1)

In [None]:
print(f"""

Response Variable Proportions (Training Set)
--------------------------------------------
{y_train.value_counts(normalize = True).round(decimals = 2)}



Response Variable Proportions (Testing Set)
--------------------------------------------
{y_test.value_counts(normalize = True).round(decimals = 2)}
""")



<h3>d. Build a logistic regression model in statsmodels using all of the explanatory variables.</h3>


In [None]:
for val in GOT_data:
    print(f" {val} + ")

In [None]:
# instantiating a logistic regression model object
logistic_full = smf.logit(formula = """  isAlive ~    
                                         book1_A_Game_Of_Thrones + 
                                         book2_A_Clash_Of_Kings + 
                                         book3_A_Storm_Of_Swords +
                                         book4_A_Feast_For_Crows + 
                                         book5_A_Dance_with_Dragons +
                                         log_numDeadRelations +     
                                         m_dateOfBirth + 
                                         m_mother + 
                                         m_father + 
                                         m_heir + 
                                         isMarried +
                                         isNoble +
                                         gg_Unknown +
                                         gg_male +
                                         age +
                                         log_popularity + 
                                         c_Unknown + 
                                         c_Northmen + 
                                         c_Ironborn + 
                                         c_Wildlings + 
                                         c_Valyrian + 
                                         c_Braavos + 
                                         c_Dornish + 
                                         number_of_names + 
                                         interaction_book1_log_numDeadRel + 
                                         interaction_book2_log_numDeadRel + 
                                         interaction_book3_log_numDeadRel + 
                                         interaction_book4_log_numDeadRel + 
                                         interaction_book5_log_numDeadRel + 
                                         interaction_book1_popularity +
                                         interaction_book2_popularity +
                                         interaction_book3_popularity +
                                         interaction_book4_popularity +
                                         interaction_book5_popularity +
                                         interaction_book1_popularity_ddrela +
                                         interaction_book2_popularity_ddrela +
                                         interaction_book3_popularity_ddrela +
                                         interaction_book4_popularity_ddrela +
                                         interaction_book5_popularity_ddrela +
                                         interaction_book1_popularity_male + 
                                         interaction_book2_popularity_male +                                         
                                         interaction_book3_popularity_male + 
                                         interaction_book4_popularity_male + 
                                         interaction_book5_popularity_male +
                                         interaction_male_log_numDeadRel 
                                         """,
                                         data    = GOT_train)


# fitting the model object
results_full = logistic_full.fit()


# checking the results SUMMARY
results_full.summary2()

<h3>e. Model where all features are significant based on their p-values.</h3><br>

In [None]:
# model specification: isAlive explained by the first set of features
# that remained significant (based on p-values)
logit_sig_formula = """  isAlive ~    
                                         book1_A_Game_Of_Thrones + 
                                         book3_A_Storm_Of_Swords +
                                         log_numDeadRelations +     
                                         log_popularity +
                                         interaction_book3_log_numDeadRel +  
                                         interaction_book3_popularity +
                                         interaction_book3_popularity_ddrela 
                                         """


# instantiating the logistic regression model object and fitting it in one
# chained step (the fitted results are stored under the name logit_sig)
logit_sig = smf.logit(formula = logit_sig_formula,
                      data    = GOT_train).fit()


# checking the results SUMMARY
logit_sig.summary2()

In [None]:
# model specification: isAlive explained by the second set of features
# that remained significant (based on p-values)
logit_sig_2_formula = """ isAlive ~      book1_A_Game_Of_Thrones + 
                                                     book4_A_Feast_For_Crows + 
                                                     log_numDeadRelations + 
                                                     log_popularity +
                                                     interaction_book4_log_numDeadRel                                                  
                                                     """


# instantiating the logistic regression model object and fitting it in one
# chained step (the fitted results are stored under the name logit_sig_2)
logit_sig_2 = smf.logit(formula = logit_sig_2_formula,
                        data    = GOT_train).fit()


# checking the results SUMMARY
logit_sig_2.summary()

## Part IV: Logistic Regression in scikit-learn
### a. Dictionary of each candidate model's explanatory variables.

In [None]:
# explanatory sets from last session

# dictionary of candidate models: each key is a model name and each value
# is the ordered list of explanatory variables that model uses
candidate_dict = {

    # full model: base features, then every book-level interaction
    # (grouped by interaction type, books 1-5 within each group), then the
    # gender / dead-relations interaction
    'logit_full': [
        'book1_A_Game_Of_Thrones',
        'book2_A_Clash_Of_Kings',
        'book3_A_Storm_Of_Swords',
        'book4_A_Feast_For_Crows',
        'book5_A_Dance_with_Dragons',
        'log_numDeadRelations',
        'm_dateOfBirth',
        'm_mother',
        'm_father',
        'm_heir',
        'isMarried',
        'isNoble',
        'gg_Unknown',
        'gg_male',
        'age',
        'log_popularity',
        'c_Unknown',
        'c_Northmen',
        'c_Ironborn',
        'c_Wildlings',
        'c_Valyrian',
        'c_Braavos',
        'c_Dornish',
        'number_of_names',
        *[f'interaction_book{book}_{suffix}'
          for suffix in ('log_numDeadRel',
                         'popularity',
                         'popularity_ddrela',
                         'popularity_male')
          for book in range(1, 6)],
        'interaction_male_log_numDeadRel',
    ],

    # significant variables only (set 1)
    'logit_sig': [
        'book1_A_Game_Of_Thrones',
        'book3_A_Storm_Of_Swords',
        'log_numDeadRelations',
        'log_popularity',
        'interaction_book3_log_numDeadRel',
        'interaction_book3_popularity',
        'interaction_book3_popularity_ddrela',
    ],

    # significant variables only (set 2)
    'logit_sig_2': [
        'book1_A_Game_Of_Thrones',
        'book4_A_Feast_For_Crows',
        'log_numDeadRelations',
        'log_popularity',
        'interaction_book4_log_numDeadRel',
    ],
}

In [None]:
# displaying the candidate model dictionary to verify its contents
candidate_dict

<h3>b. Dynamically printing each explanatory variable set.</h3><br>

In [None]:
# assembling the report of candidate variable sets first, then printing it
# (each set is pulled dynamically from candidate_dict)
variable_sets_report = f"""
/--------------------------\\
|Explanatory Variable Sets |
\\--------------------------/

Full Model:
-----------
{candidate_dict['logit_full']}


First Significant p-value Model:
--------------------------------
{candidate_dict['logit_sig']}


Second Significant p-value Model:
---------------------------------
{candidate_dict['logit_sig_2']}
"""

print(variable_sets_report)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>


<h3>c) Build a logistic regression model in scikit-learn</h3>
Building a logistic regression model in scikit-learn using the <strong>logit_sig_2</strong> explanatory variables and <strong>isAlive</strong> as the response variable.

In [None]:
# explanatory data (logit_sig_2 variable set) and response variable
GOT_data   = GOT[candidate_dict['logit_sig_2']]
GOT_target = GOT['isAlive']


# stratified train/test split (10% held out for testing)
x_train, x_test, y_train, y_test = train_test_split(GOT_data,
                                                    GOT_target,
                                                    test_size    = 0.10,
                                                    random_state = 219,
                                                    stratify     = GOT_target)


# INSTANTIATING a logistic regression model
logreg = LogisticRegression(solver       = 'lbfgs',
                            C            = 1,
                            random_state = 219)


# FITTING the training data
logreg_fit = logreg.fit(x_train, y_train)


# PREDICTING based on the testing set
logreg_pred = logreg_fit.predict(x_test)


# SCORING the results once and saving them for future use
logreg_train_score = logreg_fit.score(x_train, y_train).round(4) # accuracy
logreg_test_score  = logreg_fit.score(x_test, y_test).round(4)   # accuracy

print('Training ACCURACY:', logreg_train_score)
print('Testing  ACCURACY:', logreg_test_score)


# saving and displaying the gap between training and testing
logreg_test_gap = abs(logreg_train_score - logreg_test_score).round(4)
print('Logreg Train-Test Gap :', logreg_test_gap)

<br><br>
<h3>Part V: The Confusion Matrix</h3><br>
The confusion matrix in Python can be read as follows:<br><br>

~~~
                   |
  True Negatives   |  False Positives
  (correct)        |  (incorrect)
                   |
-------------------|------------------
                   |
  False Negatives  |  True Positives
  (incorrect)      |  (correct)
                   |
~~~

<br><br>
In terms of our model:<br><br>

~~~
                                                 |
  PREDICTED: IS NOT ALIVE (isAlive=0)            |  PREDICTED: IS ALIVE     (isAlive=1)
  ACTUAL:    IS NOT ALIVE (isAlive=0)            |  ACTUAL:    IS NOT ALIVE (isAlive=0)
                                                 |
-------------------------------------------------|-----------------------------------------------
                                                 |
  PREDICTED: IS NOT ALIVE (isAlive=0)            |  PREDICTED: IS ALIVE     (isAlive=1)
  ACTUAL:    IS ALIVE     (isAlive=1)            |  ACTUAL:    IS ALIVE     (isAlive=1)
                                                 |  
~~~


In [None]:
# building the confusion matrix for the logistic regression predictions
# (actual values first, predicted values second) and printing it
print(confusion_matrix(y_test, logreg_pred))

<br>

In [None]:
# flattening the 2x2 confusion matrix and unpacking its four cells
(logreg_tn,
 logreg_fp,
 logreg_fn,
 logreg_tp) = confusion_matrix(y_true = y_test, y_pred = logreg_pred).ravel()


# reporting each cell of the confusion matrix
print(f"""
True Negatives : {logreg_tn}
False Positives: {logreg_fp}
False Negatives: {logreg_fn}
True Positives : {logreg_tp}
""")

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<strong>Visualized Confusion Matrix</strong><br>


In [None]:
# calling the visual_cm function
# labels are listed in ascending class order: isAlive=0 (not alive) first,
# isAlive=1 (alive) second, matching the row/column order of sklearn's
# confusion matrix
# NOTE(review): assumes visual_cm applies labels in that order -- confirm
# against its definition
visual_cm(true_y = y_test,
          pred_y = logreg_pred,
          labels = ['Is Not Alive', 'Is Alive'])

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h3>Area Under The Curve (AUC)</h3><br>

In [None]:
# area under the roc curve (auc), saved for future use
# NOTE(review): the score is computed from hard class predictions rather
# than predicted probabilities -- confirm this is the intended metric
logreg_auc_score = roc_auc_score(y_true  = y_test,
                                 y_score = logreg_pred).round(decimals = 4)


# displaying the saved AUC score
print(logreg_auc_score)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

Run the code below to observe the model's coefficients.

In [None]:
# pairing each feature name with its (rounded) coefficient
feature_names = GOT[candidate_dict['logit_sig_2']].columns
coefficients  = logreg_fit.coef_.ravel().round(decimals = 2)

logreg_model_values = zip(feature_names, coefficients)


# starting the model list with the intercept, then adding every
# feature-coefficient pair
logreg_model_lst = [('intercept', logreg_fit.intercept_[0].round(decimals = 2))]
logreg_model_lst.extend(logreg_model_values)


# checking the results (one pair per line)
print(*logreg_model_lst, sep = '\n')

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h2>Part VI: Classification Trees (CART Models)</h2><br>

###  Load a user-defined function for CART model output.

In [None]:
########################################
# plot_feature_importances
########################################
def plot_feature_importances(model, train, export = False):
    """
    Plots the importance of features from a CART model as a horizontal
    bar chart.
    
    PARAMETERS
    ----------
    model  : fitted CART model exposing a feature_importances_ attribute
    train  : explanatory variable training data (its columns label the bars)
    export : whether or not to export as a .png image, default False
    """
    
    # number of features comes from the data that was passed in, not from
    # a notebook-level variable, so the function works for any model
    n_features = train.shape[1]
    
    # setting plot window
    fig, ax = plt.subplots(figsize=(12,9))
    
    # one bar per feature, labelled with the matching column name
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(range(n_features), train.columns)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    
    # optionally saving the figure to the working directory
    if export:
        plt.savefig('Tree_Leaf_50_Feature_Importance.png')

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

### 1.a. Developing a classification full tree model.

In [None]:
# INSTANTIATING a classification tree object
full_tree = DecisionTreeClassifier()


# FITTING the training data
full_tree_fit = full_tree.fit(x_train, y_train)


# PREDICTING on new data
full_tree_pred = full_tree_fit.predict(x_test)


# SCORING the model
print('Full Tree Training ACCURACY:', full_tree_fit.score(x_train,
                                                     y_train).round(4))

print('Full Tree Testing ACCURACY :', full_tree_fit.score(x_test,
                                                     y_test).round(4))

print('Full Tree AUC Score:', roc_auc_score(y_true  = y_test,
                                            y_score = full_tree_pred).round(4))


# saving scoring data for future use
full_tree_train_score = full_tree_fit.score(x_train, y_train).round(4) # accuracy
full_tree_test_score  = full_tree_fit.score(x_test, y_test).round(4)   # accuracy


# saving AUC
full_tree_auc_score   = roc_auc_score(y_true  = y_test,
                                      y_score = full_tree_pred).round(4) # auc

# displaying and saving the gap between training and testing
print('Full Tree Train-Test Gap :', abs(full_tree_train_score - full_tree_test_score).round(4))
full_tree_test_gap = abs(full_tree_train_score - full_tree_test_score).round(4)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h3>1.b. Confusion matrix.</h3>

In [None]:
# unpacking the confusion matrix
full_tree_tn, \
full_tree_fp, \
full_tree_fn, \
full_tree_tp = confusion_matrix(y_true = y_test, y_pred = full_tree_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {full_tree_tn}
False Positives: {full_tree_fp}
False Negatives: {full_tree_fn}
True Positives : {full_tree_tp}
""")

### 1.c. Run the following code to generate a visual tree output.

In [None]:
# # setting figure size
# plt.figure(figsize=(150,50))


# # developing a plotted tree
# plot_tree(decision_tree = full_tree_fit, 
#           feature_names = GOT.columns,
#           filled        = True, 
#           rounded       = True, 
#           fontsize      = 14)


# # rendering the plot
# plt.show()

### 2.a. Developing a pruned classification tree model.
Classification tree with a maximum depth of 5 and a minimum number of samples per leaf of 25. 

In [None]:
# INSTANTIATING a classification tree object
pruned_tree = DecisionTreeClassifier(max_depth = 5,
                    min_samples_leaf = 25,
                    random_state = 219)


# FITTING the training data
pruned_tree_fit = pruned_tree.fit(x_train, y_train)


# PREDICTING on new data
pruned_tree_pred = pruned_tree.predict(x_test)


# SCORING the model
print('Training ACCURACY:', pruned_tree_fit.score(x_train, y_train).round(4))
print('Testing  ACCURACY:', pruned_tree_fit.score(x_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = pruned_tree_pred).round(4))


# saving scoring data for future use
pruned_tree_train_score = pruned_tree_fit.score(x_train, y_train).round(4) # accuracy
pruned_tree_test_score  = pruned_tree_fit.score(x_test, y_test).round(4) # accuracy


# saving auc score
pruned_tree_auc_score   = roc_auc_score(y_true  = y_test,
                                        y_score = pruned_tree_pred).round(4) # auc

# displaying and saving the gap between training and testing
print('Pruned Train-Test Gap :', abs(pruned_tree_train_score - pruned_tree_test_score).round(4))
pruned_tree_test_gap = abs(pruned_tree_train_score - pruned_tree_test_score).round(4)

<h3>2.b. Confusion matrix.</h3>

In [None]:
# unpacking the confusion matrix
pruned_tree_tn, \
pruned_tree_fp, \
pruned_tree_fn, \
pruned_tree_tp = confusion_matrix(y_true = y_test, y_pred = pruned_tree_pred.ravel()
).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {pruned_tree_tn}
False Positives: {pruned_tree_fp}
False Negatives: {pruned_tree_fn}
True Positives : {pruned_tree_tp}
""")

<h3>2.c. Visual Tree output.</h3>

In [None]:
# setting figure size
plt.figure(figsize=(20, 10)) # adjusting to better fit the visual


# developing a plotted tree
plot_tree(decision_tree = pruned_tree_fit, # changing to pruned_tree_fit
          feature_names = GOT.columns,
          filled        = True, 
          rounded       = True, 
          fontsize      = 14)


# rendering the plot
plt.show()

### 2.d. Feature performance

In [None]:
# plotting feature importance
plot_feature_importances(pruned_tree_fit,
                         train = x_train,
                         export = False)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h3>Comparing Results</h3><br>

In [None]:
# comparing results
print(f"""
Model         AUC Score    Training Accuracy  Testing Accuracy TN, FP, FN, TP     Train-Test gap
-----         ---------    --------------     ---------------  --------------     --------------
Logistic      {logreg_auc_score}        {logreg_train_score}            {logreg_test_score}           {logreg_tn, logreg_fp, logreg_fn, logreg_tp}    {logreg_test_gap}
Full Tree     {full_tree_auc_score}        {full_tree_train_score}            {full_tree_test_score}           {full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp}   {full_tree_test_gap}
Pruned Tree   {pruned_tree_auc_score}        {pruned_tree_train_score}            {pruned_tree_test_score}           {pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp}    {pruned_tree_test_gap}
""")


<h2>Part VII: Classification Modeling with KNN</h2><br>
<h3>a. Running Optimal Neighbors function</h3>

In [None]:
# determining the optimal number of neighbors
opt_neighbors = optimal_neighbors(response_type = 'class',
                                  x_data        = GOT_data,
                                  y_data        = GOT_target)


### b. Scaling the explanatory data. Building a KNN classification model.

In [None]:
# INSTANTIATING StandardScaler()
scaler = StandardScaler()


# FITTING the data
scaler.fit(GOT_data)


# TRANSFORMING the data
x_scaled     = scaler.transform(GOT_data)


# converting to a DataFrame
x_scaled_df  = pd.DataFrame(x_scaled) 


# train-test split with the scaled data
x_train_scaled, x_test_scaled, y_train_scaled, y_test_scaled = train_test_split(
            x_scaled_df,
            GOT_target,
            random_state = 219,
            test_size    = 0.10,
            stratify     = GOT_target)


# INSTANTIATING a KNN classification model with optimal neighbors
knn_opt = KNeighborsClassifier(n_neighbors = opt_neighbors)


# FITTING the training data
knn_fit = knn_opt.fit(x_train_scaled, y_train_scaled)


# PREDICTING based on the testing set
knn_pred = knn_fit.predict(x_test_scaled)


# SCORING the results
print('Training ACCURACY:', knn_fit.score(x_train_scaled, y_train_scaled).round(4))
print('Testing  ACCURACY:', knn_fit.score(x_test_scaled, y_test_scaled).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = knn_pred).round(4))


# saving scoring data
knn_train_score = knn_fit.score(x_train_scaled, y_train_scaled).round(4)
knn_test_score  = knn_fit.score(x_test_scaled, y_test_scaled).round(4)


# saving AUC score
knn_auc_score   = roc_auc_score(y_true  = y_test,
                                          y_score = knn_pred).round(4)

# displaying and saving the gap between training and testing
print('knn Train-Test Gap :', abs(knn_train_score - knn_test_score).round(4))
knn_test_gap = abs(knn_train_score - knn_test_score).round(4)

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h3>c) Call the visual_cm function.</h3>

In [None]:
# calling the visual_cm function
# labels are listed in ascending class order: isAlive=0 (not alive) first,
# isAlive=1 (alive) second, matching the row/column order of sklearn's
# confusion matrix
# NOTE(review): assumes visual_cm applies labels in that order -- confirm
# against its definition
visual_cm(true_y = y_test_scaled, # response from the split that produced knn_pred
          pred_y = knn_pred,
          labels = ['Is Not Alive', 'Is Alive'])

<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

<h3>d) Confusion matrix.</h3>

In [None]:
# unpacking the confusion matrix
# flattening the 2x2 KNN confusion matrix and unpacking its four cells
knn_tree_tn, \
knn_tree_fp, \
knn_tree_fn, \
knn_tree_tp = confusion_matrix(y_true = y_test, y_pred = knn_pred).ravel()


# printing each result one-by-one
# (reports the KNN counts unpacked above, not the pruned tree's counts)
print(f"""
True Negatives : {knn_tree_tn}
False Positives: {knn_tree_fp}
False Negatives: {knn_tree_fn}
True Positives : {knn_tree_tp}
""")

## Part VIII: Hyperparameter Tuning

### a. Preparing Data

In [None]:
########################################
# importing packages
########################################

# new packages
from sklearn.model_selection import RandomizedSearchCV   # hyperparameter tuning
from sklearn.metrics import make_scorer                  # customizable scorer


########################################
# explanatory variable sets
########################################
# re-declaring the candidate models: each key is a model name and each
# value is the ordered list of explanatory variables that model uses

# full model
logit_full_vars = [
    'book1_A_Game_Of_Thrones', 'book2_A_Clash_Of_Kings',
    'book3_A_Storm_Of_Swords', 'book4_A_Feast_For_Crows',
    'book5_A_Dance_with_Dragons', 'log_numDeadRelations',
    'm_dateOfBirth', 'm_mother', 'm_father', 'm_heir',
    'isMarried', 'isNoble', 'gg_Unknown', 'gg_male', 'age',
    'log_popularity', 'c_Unknown', 'c_Northmen', 'c_Ironborn',
    'c_Wildlings', 'c_Valyrian', 'c_Braavos', 'c_Dornish',
    'number_of_names',
    'interaction_book1_log_numDeadRel', 'interaction_book2_log_numDeadRel',
    'interaction_book3_log_numDeadRel', 'interaction_book4_log_numDeadRel',
    'interaction_book5_log_numDeadRel',
    'interaction_book1_popularity', 'interaction_book2_popularity',
    'interaction_book3_popularity', 'interaction_book4_popularity',
    'interaction_book5_popularity',
    'interaction_book1_popularity_ddrela', 'interaction_book2_popularity_ddrela',
    'interaction_book3_popularity_ddrela', 'interaction_book4_popularity_ddrela',
    'interaction_book5_popularity_ddrela',
    'interaction_book1_popularity_male', 'interaction_book2_popularity_male',
    'interaction_book3_popularity_male', 'interaction_book4_popularity_male',
    'interaction_book5_popularity_male',
    'interaction_male_log_numDeadRel',
]

# significant variables only (set 1)
logit_sig_vars = [
    'book1_A_Game_Of_Thrones', 'book3_A_Storm_Of_Swords',
    'log_numDeadRelations', 'log_popularity',
    'interaction_book3_log_numDeadRel', 'interaction_book3_popularity',
    'interaction_book3_popularity_ddrela',
]

# significant variables only (set 2)
logit_sig_2_vars = [
    'book1_A_Game_Of_Thrones', 'book4_A_Feast_For_Crows',
    'log_numDeadRelations', 'log_popularity',
    'interaction_book4_log_numDeadRel',
]

candidate_dict = {
    'logit_full'  : logit_full_vars,
    'logit_sig'   : logit_sig_vars,
    'logit_sig_2' : logit_sig_2_vars,
}


########################################
# checking previous model performances
########################################
# assembling the model comparison table first, then printing it
previous_model_comparison = f"""
Model         AUC Score    Training Accuracy  Testing Accuracy TN, FP, FN, TP     Train-Test gap
-----         ---------    --------------     ---------------  --------------     --------------
Logistic      {logreg_auc_score}        {logreg_train_score}            {logreg_test_score}           {logreg_tn, logreg_fp, logreg_fn, logreg_tp}    {logreg_test_gap}
Full Tree     {full_tree_auc_score}        {full_tree_train_score}            {full_tree_test_score}           {full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp}   {full_tree_test_gap}
Pruned Tree   {pruned_tree_auc_score}        {pruned_tree_train_score}            {pruned_tree_test_score}           {pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp}    {pruned_tree_test_gap}
"""

print(previous_model_comparison)


### b. Split the dataset into training and testing sets using logit_sig_2

In [None]:
# explanatory data (logit_sig_2 variable set) and response variable
GOT_data   = GOT[candidate_dict['logit_sig_2']]
GOT_target = GOT['isAlive']


# stratified train/test split (10% held out for testing)
x_train, x_test, y_train, y_test = train_test_split(GOT_data,
                                                    GOT_target,
                                                    random_state = 219,
                                                    test_size    = 0.10,
                                                    stratify     = GOT_target)

### c. Logistic Regression with Default Hyperparameters

In [None]:
# INSTANTIATING the baseline logistic regression model; the hyperparameters
# that are tuned later (C, solver, warm_start) are written out explicitly
lr_default = LogisticRegression(C            = 1.0,
                                solver       = 'lbfgs',
                                warm_start   = False,
                                random_state = 219)

In [None]:
# FITTING the training data
lr_default_fit = lr_default.fit(x_train, y_train)


# PREDICTING based on the testing set
lr_default_pred = lr_default_fit.predict(x_test)


# SCORING the results once and saving them for future use
logreg_train_score = lr_default_fit.score(x_train, y_train).round(4) # accuracy
logreg_test_score  = lr_default_fit.score(x_test, y_test).round(4)   # accuracy


# saving AUC score
logreg_auc_score = roc_auc_score(y_true  = y_test,
                                 y_score = lr_default_pred).round(4)


# displaying the saved scores
print('Training ACCURACY:', logreg_train_score)
print('Testing  ACCURACY:', logreg_test_score)
print('AUC Score        :', logreg_auc_score)


# saving and displaying the gap between training and testing
# stored as logreg_test_gap (the name the comparison tables read) so the
# gap is refreshed together with the other logreg_* scores
logreg_test_gap = abs(logreg_train_score - logreg_test_score).round(4)
print('Logreg Train-Test Gap :', logreg_test_gap)

# keeping the previous (capitalized) name available for backward
# compatibility
Logreg_test_gap = logreg_test_gap

## Part IX: Hyperparameter Tuning with RandomizedSearchCV

In [None]:
########################################
# RandomizedSearchCV
########################################
import numpy as np # mathematical essentials (needed for np.arange below)


# declaring a hyperparameter space
C_range          = np.arange(0.1, 5.0, 0.1)
warm_start_range = [True, False]
solver_range     = ['newton-cg', 'sag', 'lbfgs']


# creating a hyperparameter grid
param_grid = {'C'          : C_range,
              'warm_start' : warm_start_range,
              'solver'     : solver_range}


# INSTANTIATING the model object without hyperparameters
lr_tuned = LogisticRegression(random_state = 219,
                              max_iter     = 1000) # increased for convergence


# RandomizedSearchCV object
# make_scorer scores hard class predictions by default, so the deprecated
# needs_threshold = False argument is omitted (same behavior, and it keeps
# working on scikit-learn versions that removed the argument)
lr_tuned_cv = RandomizedSearchCV(estimator           = lr_tuned,   # the model object
                                 param_distributions = param_grid, # parameters to tune
                                 cv                  = 3,          # how many folds in cross-validation
                                 n_iter              = 250,        # number of combinations of hyperparameters to try
                                 random_state        = 219,        # starting point for random sequence
                                 scoring = make_scorer(roc_auc_score)) # scoring criteria (AUC)


# FITTING to the FULL DATASET (due to cross-validation)
lr_tuned_cv.fit(GOT_data, GOT_target)


# PREDICT step is not needed


# printing the optimal parameters and best score
print("Tuned Parameters  :", lr_tuned_cv.best_params_)
print("Tuned CV AUC      :", lr_tuned_cv.best_score_.round(4))

### b. RandomizedSearch CV results

In [None]:
# checking the results of RandomizedSearch CV
# (cv_results_ holds the per-candidate results recorded by the fitted search)
lr_tuned_cv.cv_results_

### c. Estimator for the model

In [None]:
# checking the best estimator for the model
# (the estimator configured with the best hyperparameters found by the search)
lr_tuned_cv.best_estimator_

### d. Hyperparameter with RandomizedSearch CV tuning results

In [None]:
# building a model based on hyperparameter tuning results

# INSTANTIATING a logistic regression model with tuned values
# (values are hard-coded so the randomized search does not need to be
#  re-run each time)
lr_tuned = LogisticRegression(C            = 4.8,
                              warm_start   = False,
                              solver       = 'sag',
                              max_iter     = 1000,
                              random_state = 219)


# FITTING the model to the TRAINING data only
# fitting on the full dataset would let the model see the test
# observations, so the testing accuracy, AUC, and train-test gap below
# would no longer be out-of-sample measures
lr_tuned.fit(x_train, y_train)


# PREDICTING based on the testing set
lr_tuned_pred = lr_tuned.predict(x_test)


# SCORING the results once and saving them for future use
lr_tuned_train_score = lr_tuned.score(x_train, y_train).round(4) # accuracy
lr_tuned_test_score  = lr_tuned.score(x_test, y_test).round(4)   # accuracy


# saving the AUC score
lr_tuned_auc         = roc_auc_score(y_true  = y_test,
                                     y_score = lr_tuned_pred).round(4) # auc


# displaying the saved scores
print('LR Tuned Training ACCURACY:', lr_tuned_train_score)
print('LR Tuned Testing  ACCURACY:', lr_tuned_test_score)
print('LR Tuned AUC Score        :', lr_tuned_auc)


# saving and displaying the gap between training and testing
lr_tuned_test_gap = abs(lr_tuned_train_score - lr_tuned_test_score).round(4)
print('Logreg Tuned Train-Test Gap :', lr_tuned_test_gap)

### e. Confusion matrix

In [None]:
# flattening the 2x2 confusion matrix and unpacking its four cells
(lr_tuned_tn,
 lr_tuned_fp,
 lr_tuned_fn,
 lr_tuned_tp) = confusion_matrix(y_true = y_test, y_pred = lr_tuned_pred).ravel()


# reporting each cell of the confusion matrix
print(f"""
True Negatives : {lr_tuned_tn}
False Positives: {lr_tuned_fp}
False Negatives: {lr_tuned_fn}
True Positives : {lr_tuned_tp}
""")

### f. Comparing the results

In [None]:
# performance objects for the tuned logistic regression
lr_auc          = roc_auc_score(y_true  = y_test,
                                y_score = lr_tuned_pred).round(4)
lr_train_acc    = lr_tuned.score(x_train, y_train).round(4)
lr_test_acc     = lr_tuned.score(x_test, y_test).round(4)
lr_auc_test_gap = abs(lr_train_acc - lr_test_acc).round(4)


# assembling the model comparison table (now including the tuned logistic
# regression), then printing it
tuned_lr_comparison = f"""
Model         AUC Score    Training Accuracy  Testing Accuracy TN, FP, FN, TP     Train-Test gap
-----         ---------    --------------     ---------------  --------------     --------------
Logistic      {logreg_auc_score}        {logreg_train_score}            {logreg_test_score}           {logreg_tn, logreg_fp, logreg_fn, logreg_tp}    {logreg_test_gap}
Full Tree     {full_tree_auc_score}        {full_tree_train_score}            {full_tree_test_score}           {full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp}   {full_tree_test_gap}
Pruned Tree   {pruned_tree_auc_score}        {pruned_tree_train_score}            {pruned_tree_test_score}           {pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp}    {pruned_tree_test_gap}
Tuned LR      {lr_auc}        {lr_train_acc}            {lr_test_acc}           {lr_tuned_tn,lr_tuned_fp,lr_tuned_fn,lr_tuned_tp}    {lr_auc_test_gap}
"""

print(tuned_lr_comparison)


## Part X: Hyperparameter Tuning on Classification Trees

### a) Tune the hyperparameters for a classification tree model.

In [None]:
# # declaring a hyperparameter space
# criterion_range = ['gini', 'entropy']
# splitter_range  = ['best', 'random']
# depth_range     = np.arange(1, 25, 1)
# leaf_range      = np.arange(1, 100, 1)


# # creating a hyperparameter grid
# param_grid = {'criterion'        : criterion_range,
#               'splitter'         : splitter_range,
#               'max_depth'        : depth_range,
#               'min_samples_leaf' : leaf_range}


# # INSTANTIATING the model object without hyperparameters
# tuned_tree = DecisionTreeClassifier(random_state = 219)


# # RandomizedSearchCV object
# tuned_tree_cv = RandomizedSearchCV(estimator             = tuned_tree,
#                                    param_distributions   = param_grid,
#                                    cv                    = 3,
#                                    n_iter                = 1000,
#                                    random_state          = 219,
#                                    scoring = make_scorer(roc_auc_score,
#                                              needs_threshold = False))


# # FITTING to the FULL DATASET (due to cross-validation)
# tuned_tree_cv.fit(GOT_data, GOT_target)

# # printing the optimal parameters and best score
# print("Tuned Parameters  :", tuned_tree_cv.best_params_)
# print("Tuned Training AUC:", tuned_tree_cv.best_score_.round(4))

### b) Build a classification tree model based on the hyperparameter tuning results

In [None]:
# building a model based on hyperparameter tuning results

# INSTANTIATING a classification tree model with tuned values
# (values are hard-coded so the randomized search does not need to be
#  re-run each time)
tree_tuned = DecisionTreeClassifier(splitter         = 'random',
                                    min_samples_leaf = 6,
                                    max_depth        = 6,
                                    criterion        = 'gini',
                                    random_state     = 219)


# FITTING the model to the TRAINING data only
# fitting on the full dataset would let the tree see the test
# observations, so the testing accuracy, AUC, and train-test gap below
# would no longer be out-of-sample measures
tree_tuned_fit = tree_tuned.fit(x_train, y_train)


# PREDICTING based on the testing set
tree_tuned_pred = tree_tuned.predict(x_test)


# SCORING the results once and saving them for future use
tree_tuned_train_score = tree_tuned.score(x_train, y_train).round(4) # accuracy
tree_tuned_test_score  = tree_tuned.score(x_test, y_test).round(4)   # accuracy


# saving the AUC score
tree_tuned_auc         = roc_auc_score(y_true  = y_test,
                                       y_score = tree_tuned_pred).round(4) # auc


# displaying the saved scores
print('Training ACCURACY:', tree_tuned_train_score)
print('Testing  ACCURACY:', tree_tuned_test_score)
print('AUC Score        :', tree_tuned_auc)


# saving and displaying the gap between training and testing
# (lowercase name follows the convention of the other *_test_gap objects)
tree_tuned_test_gap = abs(tree_tuned_train_score - tree_tuned_test_score).round(4)
print('Tree Tuned Train-Test Gap :', tree_tuned_test_gap)

# keeping the previous (capitalized) name available for backward
# compatibility
Tree_tuned_test_gap = tree_tuned_test_gap

### c. Confusion Matrix

In [None]:
# flattening the 2x2 confusion matrix and unpacking its four cells
(tuned_tree_tn,
 tuned_tree_fp,
 tuned_tree_fn,
 tuned_tree_tp) = confusion_matrix(y_true = y_test, y_pred = tree_tuned_pred).ravel()


# reporting each cell of the confusion matrix
print(f"""
True Negatives : {tuned_tree_tn}
False Positives: {tuned_tree_fp}
False Negatives: {tuned_tree_fn}
True Positives : {tuned_tree_tp}
""")

### d. Comparing results

In [None]:
# performance objects for the tuned classification tree
tree_auc            = roc_auc_score(y_true  = y_test,
                                    y_score = tree_tuned_pred).round(4)
tree_train_acc      = tree_tuned.score(x_train, y_train).round(4)
tree_test_acc       = tree_tuned.score(x_test, y_test).round(4)
tuned_tree_test_gap = abs(tree_train_acc - tree_test_acc).round(4)


# assembling the model comparison table (now including the tuned tree),
# then printing it
tuned_tree_comparison = f"""
Model         AUC Score    Training Accuracy  Testing Accuracy TN, FP, FN, TP     Train-Test gap
-----         ---------    --------------     ---------------  --------------     --------------
Logistic      {logreg_auc_score}        {logreg_train_score}            {logreg_test_score}           {logreg_tn, logreg_fp, logreg_fn, logreg_tp}    {logreg_test_gap}
Full Tree     {full_tree_auc_score}        {full_tree_train_score}            {full_tree_test_score}           {full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp}   {full_tree_test_gap}
Pruned Tree   {pruned_tree_auc_score}        {pruned_tree_train_score}            {pruned_tree_test_score}           {pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp}    {pruned_tree_test_gap}
Tuned LR      {lr_auc}        {lr_train_acc}            {lr_test_acc}           {lr_tuned_tn,lr_tuned_fp,lr_tuned_fn,lr_tuned_tp}    {lr_auc_test_gap}
Tuned Tree    {tree_auc}        {tree_train_acc}            {tree_test_acc}           {tuned_tree_tn,tuned_tree_fp,tuned_tree_fn,tuned_tree_tp}   {tuned_tree_test_gap}
"""

print(tuned_tree_comparison)


### e. Visual Tree output

In [None]:
# # setting figure size
# plt.figure(figsize=(40, 10))


# # developing a plotted tree
# plot_tree(decision_tree = tree_tuned_fit, 
#           feature_names = GOT.columns,
#           filled        = True, 
#           rounded       = True, 
#           fontsize      = 14)


# # rendering the plot
# plt.show()

## Part XII: Ensemble Modelling

In [None]:
########################################
# importing packages
########################################
# essentials
import matplotlib.pyplot as plt # data visualization
import pandas            as pd  # data science essentials
import numpy             as np  # mathematical essentials


# machine learning
from sklearn.model_selection import train_test_split   # train-test split
from sklearn.metrics import roc_auc_score              # auc score
from sklearn.model_selection import RandomizedSearchCV # hyperparameter tuning
from sklearn.metrics import make_scorer                # customizable scorer
from sklearn.metrics import confusion_matrix           # confusion matrix


# new tools
from sklearn.ensemble import RandomForestClassifier     # random forest
from sklearn.ensemble import GradientBoostingClassifier # gbm


########################################
# explanatory variable sets
########################################
# candidate explanatory-variable sets: label -> ordered list of column names
candidate_dict = {

    # full model
    'logit_full': [
        'book1_A_Game_Of_Thrones',
        'book2_A_Clash_Of_Kings',
        'book3_A_Storm_Of_Swords',
        'book4_A_Feast_For_Crows',
        'book5_A_Dance_with_Dragons',
        'log_numDeadRelations',
        'm_dateOfBirth', 'm_mother', 'm_father', 'm_heir',
        'isMarried', 'isNoble',
        'gg_Unknown', 'gg_male',
        'age',
        'log_popularity',
        'c_Unknown', 'c_Northmen', 'c_Ironborn', 'c_Wildlings',
        'c_Valyrian', 'c_Braavos', 'c_Dornish',
        'number_of_names',

        # interaction families: one column per book, books 1-5 in order
        *[f'interaction_book{book}_log_numDeadRel'    for book in range(1, 6)],
        *[f'interaction_book{book}_popularity'        for book in range(1, 6)],
        *[f'interaction_book{book}_popularity_ddrela' for book in range(1, 6)],
        *[f'interaction_book{book}_popularity_male'   for book in range(1, 6)],
        'interaction_male_log_numDeadRel',
    ],

    # significant variables only (set 1)
    'logit_sig': [
        'book1_A_Game_Of_Thrones',
        'book3_A_Storm_Of_Swords',
        'log_numDeadRelations',
        'log_popularity',
        'interaction_book3_log_numDeadRel',
        'interaction_book3_popularity',
        'interaction_book3_popularity_ddrela',
    ],

    # significant variables only (set 2)
    'logit_sig_2': [
        'book1_A_Game_Of_Thrones',
        'book4_A_Feast_For_Crows',
        'log_numDeadRelations',
        'log_popularity',
        'interaction_book4_log_numDeadRel',
    ],

}


########################################
# checking previous model performances
########################################

# Re-prints the comparison table for the five models scored so far. None of
# the interpolated names is assigned in this cell, so the earlier modelling
# cells must already have been run. Each "TN, FP, FN, TP" placeholder holds a
# comma-separated expression and is therefore rendered as a tuple.
print(f"""
Model         AUC Score    Training Accuracy  Testing Accuracy TN, FP, FN, TP     Train-Test gap
-----         ---------    --------------     ---------------  --------------     --------------
Logistic      {logreg_auc_score}        {logreg_train_score}            {logreg_test_score}           {logreg_tn, logreg_fp, logreg_fn, logreg_tp}    {logreg_test_gap}
Full Tree     {full_tree_auc_score}        {full_tree_train_score}            {full_tree_test_score}           {full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp}   {full_tree_test_gap}
Pruned Tree   {pruned_tree_auc_score}        {pruned_tree_train_score}            {pruned_tree_test_score}           {pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp}    {pruned_tree_test_gap}
Tuned LR      {lr_auc}        {lr_train_acc}            {lr_test_acc}           {lr_tuned_tn,lr_tuned_fp,lr_tuned_fn,lr_tuned_tp}    {lr_auc_test_gap}
Tuned Tree    {tree_auc}        {tree_train_acc}            {tree_test_acc}           {tuned_tree_tn,tuned_tree_fp,tuned_tree_fn,tuned_tree_tp}   {tuned_tree_test_gap}
""")


#### User-Defined Functions

In [None]:
########################################
# plot_feature_importances
########################################
def plot_feature_importances(model, train, export = False):
    """
    Plots the feature importances of a fitted model as a horizontal bar
    chart, one bar per column of the training data.
    
    PARAMETERS
    ----------
    model  : fitted model exposing a feature_importances_ attribute
    train  : explanatory variable training data; its .columns label the bars
             and its second dimension sets the number of bars
    export : whether or not to export as a .png image, default False.
             The image is written to ./analysis_images/Feature_Importance.png
             and the folder is created when it does not exist yet.
    """
    import os  # local import: only needed for the export path
    
    # one bar position per explanatory variable
    n_features = train.shape[1]
    positions  = np.arange(n_features)
    
    # setting plot window
    fig, ax = plt.subplots(figsize=(12,9))
    
    # drawing on the axes created above rather than on pyplot's implicit state
    ax.barh(positions, model.feature_importances_, align='center')
    ax.set_yticks(positions)
    ax.set_yticklabels(train.columns)
    ax.set_xlabel("Feature importance")
    ax.set_ylabel("Feature")
    
    if export:
        export_path = './analysis_images/Feature_Importance.png'
        
        # savefig raises FileNotFoundError when the target folder is missing
        os.makedirs(os.path.dirname(export_path), exist_ok = True)
        fig.savefig(export_path)

### Split the dataset into training and testing sets

In [None]:
# explanatory data limited to the 'logit_sig_2' candidate set;
# isAlive is the response variable
GOT_data   = GOT[candidate_dict['logit_sig_2']]
GOT_target = GOT['isAlive']


# stratified 90/10 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(GOT_data,
                                                    GOT_target,
                                                    test_size    = 0.10,
                                                    stratify     = GOT_target,
                                                    random_state = 219)

### Part XIII: Random Forest Classifier

In [None]:
# INSTANTIATING a random forest model, with every setting spelled out
rf_default = RandomForestClassifier(
    random_state     = 219,      # fixed seed for reproducibility
    n_estimators     = 100,
    criterion        = 'gini',
    max_depth        = None,
    min_samples_leaf = 1,
    bootstrap        = True,
    warm_start       = False,
)

In [None]:
# FITTING on the training partition, then PREDICTING the testing partition
rf_default_fit      = rf_default.fit(x_train, y_train)
rf_default_fit_pred = rf_default_fit.predict(x_test)


# accuracy on each partition, computed once and reused for printing
rf_train_score = rf_default_fit.score(x_train, y_train).round(4)
rf_test_score  = rf_default_fit.score(x_test, y_test).round(4)

# absolute gap between training and testing accuracy
Random_forest_test_gap = abs(rf_train_score - rf_test_score).round(4)


# SCORING the results
print('Training ACCURACY:', rf_train_score)
print('Testing  ACCURACY:', rf_test_score)

# NOTE(review): y_score receives the hard class labels from .predict(),
# not predicted probabilities
print('AUC Score        :', roc_auc_score(y_test, rf_default_fit_pred).round(4))

print('Random Forest Train-Test Gap :', Random_forest_test_gap)

In [None]:
# feature importances of the default random forest (shown only, not exported)
plot_feature_importances(model  = rf_default_fit,
                         train  = x_train,
                         export = False)

In [None]:
# flattening the 2x2 confusion matrix into its four cells
(rf_tn,
 rf_fp,
 rf_fn,
 rf_tp) = confusion_matrix(y_test, rf_default_fit_pred).ravel()


# reporting each cell on its own line
print(f"""
True Negatives : {rf_tn}
False Positives: {rf_fp}
False Negatives: {rf_fn}
True Positives : {rf_tp}
""")

In [None]:
# default random forest: accuracy on the training and testing partitions,
# each rounded to four decimal places
rf_train_acc, rf_test_acc = (
    rf_default_fit.score(features, labels).round(4)
    for features, labels in ((x_train, y_train), (x_test, y_test))
)

# AUC from the stored test-set predictions, plus the absolute train-test gap
rf_auc      = roc_auc_score(y_test, rf_default_fit_pred).round(4)
rf_test_gap = abs(rf_train_acc - rf_test_acc).round(4)


# comparison table, now including the default random forest
print(f"""
Model                      AUC Score    Training Accuracy  Testing Accuracy TN, FP, FN, TP     Train-Test gap
-----                      ---------    --------------     ---------------  --------------     --------------
Logistic                  {logreg_auc_score}        {logreg_train_score}            {logreg_test_score}           {logreg_tn, logreg_fp, logreg_fn, logreg_tp}    {logreg_test_gap}
Full Tree                 {full_tree_auc_score}        {full_tree_train_score}            {full_tree_test_score}           {full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp}   {full_tree_test_gap}
Pruned Tree               {pruned_tree_auc_score}        {pruned_tree_train_score}            {pruned_tree_test_score}           {pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp}    {pruned_tree_test_gap}
Tuned LR                  {lr_auc}        {lr_train_acc}            {lr_test_acc}           {lr_tuned_tn,lr_tuned_fp,lr_tuned_fn,lr_tuned_tp}    {lr_auc_test_gap}
Tuned Tree                {tree_auc}        {tree_train_acc}            {tree_test_acc}           {tuned_tree_tn,tuned_tree_fp,tuned_tree_fn,tuned_tree_tp}   {tuned_tree_test_gap}
Random Forest (Full)      {rf_auc}        {rf_train_acc}            {rf_test_acc}           {rf_tn,rf_fp,rf_fn,rf_tp}   {rf_test_gap}
""")


In [None]:
# # FITTING the training data
# rf_default_fit = rf_default.fit(x_train, y_train)


# # PREDICTING based on the testing set
# rf_default_fit_pred = rf_default_fit.predict(x_test)


# # declaring a hyperparameter space
# estimator_range  = np.arange(100, 1100, 250)
# leaf_range       = np.arange(1, 31, 10)
# criterion_range  = ['gini', 'entropy']
# bootstrap_range  = [True, False]
# warm_start_range = [True, False]


# # creating a hyperparameter grid
# param_grid = {'n_estimators'     : estimator_range,
#               'min_samples_leaf' : leaf_range,
#               'criterion'        : criterion_range,
#               'bootstrap'        : bootstrap_range,
#               'warm_start'       : warm_start_range}


# # INSTANTIATING the model object without hyperparameters
# forest_grid = RandomForestClassifier(random_state = 219)


# # RandomizedSearchCV object
# forest_cv = RandomizedSearchCV(estimator           = forest_grid,
#                                param_distributions = param_grid,
#                                cv         = 3,
#                                n_iter     = 1000,
#                                scoring    = make_scorer(roc_auc_score,
#                                             needs_threshold = False))


# # FITTING to the FULL DATASET (due to cross-validation)
# forest_cv.fit(GOT_data, GOT_target)


# # PREDICT step is not needed


# # printing the optimal parameters and best score
# print("Tuned Parameters  :", forest_cv.best_params_)
# print("Tuned Training AUC:", forest_cv.best_score_.round(4))

In [None]:
# best estimators based on RandomizedSearchCV
# forest_cv.best_estimator_

#### Improving processing efficiency

In [None]:
# building a model based on hyperparameter tuning results

# INSTANTIATING with the hyperparameters kept from the tuning step
# (presumably copied from the randomized search's best estimator; that search
# is commented out, so the values cannot be confirmed from this notebook)
forest_tuned = RandomForestClassifier(n_estimators     = 350,
                                      min_samples_leaf = 1,
                                      warm_start       = True,
                                      criterion        = 'entropy',
                                      random_state     = 219)


# FITTING to the TRAINING set only. Fitting on GOT_data / GOT_target would put
# the x_test rows into the training data and inflate every testing metric
# reported below (only the cross-validated search should see the full dataset).
forest_tuned_fit = forest_tuned.fit(x_train, y_train)


# PREDICTING based on the testing set
forest_tuned_pred = forest_tuned_fit.predict(x_test)


# saving scoring data for future use (computed once, reused for printing)
forest_tuned_train_score = forest_tuned_fit.score(x_train, y_train).round(4) # accuracy
forest_tuned_test_score  = forest_tuned_fit.score(x_test, y_test).round(4)   # accuracy


# saving the AUC score
forest_tuned_auc = roc_auc_score(y_true  = y_test,
                                 y_score = forest_tuned_pred).round(4) # auc


# saving the gap between training and testing
Random_forest_tuned_test_gap = abs(forest_tuned_train_score - forest_tuned_test_score).round(4)


# SCORING the results
print('Forest Tuned Training ACCURACY:', forest_tuned_train_score)
print('Forest Tuned Testing  ACCURACY:', forest_tuned_test_score)
print('Forest Tuned AUC Score        :', forest_tuned_auc)
print('Random Forest Tuned Train-Test Gap :', Random_forest_tuned_test_gap)

In [None]:
# feature importances of the tuned random forest (shown only, not exported)
plot_feature_importances(forest_tuned_fit, x_train, export = False)

In [None]:
# flattening the 2x2 confusion matrix into its four cells
(tuned_rf_tn,
 tuned_rf_fp,
 tuned_rf_fn,
 tuned_rf_tp) = confusion_matrix(y_test, forest_tuned_pred).ravel()


# reporting each cell on its own line
print(f"""
True Negatives : {tuned_rf_tn}
False Positives: {tuned_rf_fp}
False Negatives: {tuned_rf_fn}
True Positives : {tuned_rf_tp}
""")

In [None]:
# tuned random forest: accuracy on the training and testing partitions,
# each rounded to four decimal places
tuned_rf_train_acc, tuned_rf_test_acc = (
    forest_tuned_fit.score(features, labels).round(4)
    for features, labels in ((x_train, y_train), (x_test, y_test))
)

# AUC from the stored test-set predictions, plus the absolute train-test gap
tuned_rf_auc      = roc_auc_score(y_test, forest_tuned_pred).round(4)
tuned_rf_test_gap = abs(tuned_rf_train_acc - tuned_rf_test_acc).round(4)


# comparison table, now including the tuned random forest
print(f"""
Model                        AUC Score    Training Accuracy  Testing Accuracy TN, FP, FN, TP     Train-Test gap
-----                        ---------    --------------     ---------------  --------------     --------------
Logistic                    {logreg_auc_score}        {logreg_train_score}            {logreg_test_score}           {logreg_tn, logreg_fp, logreg_fn, logreg_tp}    {logreg_test_gap}
Full Tree                   {full_tree_auc_score}        {full_tree_train_score}            {full_tree_test_score}           {full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp}   {full_tree_test_gap}
Pruned Tree                 {pruned_tree_auc_score}        {pruned_tree_train_score}            {pruned_tree_test_score}           {pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp}    {pruned_tree_test_gap}
Tuned LR                    {lr_auc}        {lr_train_acc}            {lr_test_acc}           {lr_tuned_tn,lr_tuned_fp,lr_tuned_fn,lr_tuned_tp}    {lr_auc_test_gap}
Tuned Tree                  {tree_auc}        {tree_train_acc}            {tree_test_acc}           {tuned_tree_tn,tuned_tree_fp,tuned_tree_fn,tuned_tree_tp}   {tuned_tree_test_gap}
Random Forest (Full)        {rf_auc}        {rf_train_acc}            {rf_test_acc}           {rf_tn,rf_fp,rf_fn,rf_tp}   {rf_test_gap}
Tuned Random Forest (Full)  {tuned_rf_auc}        {tuned_rf_train_acc}            {tuned_rf_test_acc}           {tuned_rf_tn,tuned_rf_fp,tuned_rf_fn,tuned_rf_tp}    {tuned_rf_test_gap}
""")

### Part XIV: Gradient Boosted Machines

In [None]:
# INSTANTIATING the baseline GBM.
# `loss` is deliberately left at the library default: the binomial log-loss was
# spelled 'deviance' before scikit-learn 1.1 and 'log_loss' afterwards, and
# 'deviance' was removed in 1.3, so naming either one breaks on some versions.
# NOTE: max_depth = None lets every tree grow without a depth limit; this is
# not the library default.
full_gbm_default = GradientBoostingClassifier(learning_rate = 0.1,
                                              n_estimators  = 100,
                                              criterion     = 'friedman_mse',
                                              max_depth     = None,
                                              warm_start    = False,
                                              random_state  = 219)


# FIT step is needed as we are not using .best_estimator
full_gbm_default_fit = full_gbm_default.fit(x_train, y_train)


# PREDICTING based on the testing set
full_gbm_default_pred = full_gbm_default_fit.predict(x_test)


# saving accuracy scores (computed once, reused for printing)
gbm_train_score = full_gbm_default_fit.score(x_train, y_train).round(4)
gbm_test_score  = full_gbm_default_fit.score(x_test, y_test).round(4)

# saving the gap between training and testing
gbm_test_gap = abs(gbm_train_score - gbm_test_score).round(4)


# SCORING the results
print('Training ACCURACY:', gbm_train_score)
print('Testing ACCURACY :', gbm_test_score)
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = full_gbm_default_pred).round(4))
print('GBM Train-Test Gap :', gbm_test_gap)

In [None]:
# flattening the 2x2 confusion matrix into its four cells
(gbm_default_tn,
 gbm_default_fp,
 gbm_default_fn,
 gbm_default_tp) = confusion_matrix(y_test, full_gbm_default_pred).ravel()


# reporting each cell on its own line
print(f"""
True Negatives : {gbm_default_tn}
False Positives: {gbm_default_fp}
False Negatives: {gbm_default_fn}
True Positives : {gbm_default_tp}
""")

In [None]:
# SCORING the default GBM: accuracy on both partitions, AUC and train-test gap
gbm_train_acc = full_gbm_default_fit.score(x_train, y_train).round(4)
gbm_test_acc  = full_gbm_default_fit.score(x_test, y_test).round(4)

# the AUC must be computed from the GBM's own test-set predictions; scoring
# forest_tuned_pred here would report the tuned random forest's AUC in the
# GBM row of the table
gbm_auc       = roc_auc_score(y_true  = y_test,
                              y_score = full_gbm_default_pred).round(4)
gbm_test_gap = abs(gbm_train_acc - gbm_test_acc).round(4)



# appending to print list
# comparing results
print(f"""
Model                        AUC Score    Training Accuracy  Testing Accuracy TN, FP, FN, TP     Train-Test gap
-----                        ---------    --------------     ---------------  --------------     --------------
Logistic                    {logreg_auc_score}        {logreg_train_score}            {logreg_test_score}           {logreg_tn, logreg_fp, logreg_fn, logreg_tp}    {logreg_test_gap}
Full Tree                   {full_tree_auc_score}        {full_tree_train_score}            {full_tree_test_score}           {full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp}   {full_tree_test_gap}
Pruned Tree                 {pruned_tree_auc_score}        {pruned_tree_train_score}            {pruned_tree_test_score}           {pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp}    {pruned_tree_test_gap}
Tuned LR                    {lr_auc}        {lr_train_acc}            {lr_test_acc}           {lr_tuned_tn,lr_tuned_fp,lr_tuned_fn,lr_tuned_tp}    {lr_auc_test_gap}
Tuned Tree                  {tree_auc}        {tree_train_acc}            {tree_test_acc}           {tuned_tree_tn,tuned_tree_fp,tuned_tree_fn,tuned_tree_tp}   {tuned_tree_test_gap}
Random Forest (Full)        {rf_auc}        {rf_train_acc}            {rf_test_acc}           {rf_tn,rf_fp,rf_fn,rf_tp}   {rf_test_gap}
Tuned Random Forest (Full)  {tuned_rf_auc}        {tuned_rf_train_acc}            {tuned_rf_test_acc}           {tuned_rf_tn,tuned_rf_fp,tuned_rf_fn,tuned_rf_tp}    {tuned_rf_test_gap}
GBM (Full)                  {gbm_auc}        {gbm_train_acc}            {gbm_test_acc}           {gbm_default_tn,gbm_default_fp,gbm_default_fn,gbm_default_tp}   {gbm_test_gap}
""")

In [None]:
# # declaring a hyperparameter space
# learn_range        = np.arange(0.1, 2.2, 0.5)
# estimator_range    = np.arange(100, 501, 25)
# depth_range        = np.arange(2, 11, 2)
# warm_start_range   = [True, False]

# # creating a hyperparameter grid
# param_grid = {'learning_rate' : learn_range,
#               'max_depth'     : depth_range,
#               'n_estimators'  : estimator_range,
#               'warm_start'    : warm_start_range}


# # INSTANTIATING the model object without hyperparameters
# full_gbm_grid = GradientBoostingClassifier(random_state = 219)


# # RandomizedSearchCV object
# full_gbm_cv = RandomizedSearchCV(estimator     = full_gbm_grid,
#                            param_distributions = param_grid,
#                            cv                  = 3,
#                            n_iter              = 500,
#                            random_state        = 219,
#                            scoring             = make_scorer(roc_auc_score,
#                                                  needs_threshold = False))


# # FITTING to the FULL DATASET (due to cross-validation)
# full_gbm_cv.fit(GOT_data, GOT_target)


# # PREDICT step is not needed


# # printing the optimal parameters and best score
# print("Tuned Parameters  :", full_gbm_cv.best_params_)
# print("Tuned Training AUC:", full_gbm_cv.best_score_.round(4))

In [None]:
# # checking the best estimator for the model
# full_gbm_cv.best_estimator_

In [None]:
# INSTANTIATING with the hyperparameters kept from the tuning step
# (presumably copied from the randomized search's best estimator; that search
# is commented out, so the values cannot be confirmed from this notebook)
gbm_tuned = GradientBoostingClassifier(learning_rate = 0.1,
                                       max_depth     = 2,
                                       n_estimators  = 175,
                                       warm_start    = True,
                                       random_state  = 219)


# FITTING to the TRAINING set only. Fitting on GOT_data / GOT_target would put
# the x_test rows into the training data and inflate every testing metric
# reported below (only the cross-validated search should see the full dataset).
gbm_tuned_fit = gbm_tuned.fit(x_train, y_train)


# PREDICTING based on the testing set
gbm_tuned_pred = gbm_tuned_fit.predict(x_test)


# saving accuracy scores (computed once, reused for printing)
gbm_tuned_train_score = gbm_tuned_fit.score(x_train, y_train).round(4)
gbm_tuned_test_score  = gbm_tuned_fit.score(x_test, y_test).round(4)

# saving the gap between training and testing
gbm_tuned_test_gap = abs(gbm_tuned_train_score - gbm_tuned_test_score).round(4)


# SCORING the results
print('Training ACCURACY:', gbm_tuned_train_score)
print('Testing  ACCURACY:', gbm_tuned_test_score)
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = gbm_tuned_pred).round(4))
print('GBM Tuned Train-Test Gap :', gbm_tuned_test_gap)

In [None]:
# flattening the 2x2 confusion matrix into its four cells
(gbm_tuned_tn,
 gbm_tuned_fp,
 gbm_tuned_fn,
 gbm_tuned_tp) = confusion_matrix(y_test, gbm_tuned_pred).ravel()


# reporting each cell on its own line
print(f"""
True Negatives : {gbm_tuned_tn}
False Positives: {gbm_tuned_fp}
False Negatives: {gbm_tuned_fn}
True Positives : {gbm_tuned_tp}
""")

In [None]:
# tuned GBM: accuracy on the training and testing partitions,
# each rounded to four decimal places
gbm_tuned_train_acc, gbm_tuned_test_acc = (
    gbm_tuned_fit.score(features, labels).round(4)
    for features, labels in ((x_train, y_train), (x_test, y_test))
)

# AUC from the stored test-set predictions, plus the absolute train-test gap
gbm_tuned_auc      = roc_auc_score(y_test, gbm_tuned_pred).round(4)
gbm_tuned_test_gap = abs(gbm_tuned_train_acc - gbm_tuned_test_acc).round(4)


# final comparison table across every model in the notebook
print(f"""
Model                        AUC Score    Training Accuracy  Testing Accuracy TN, FP, FN, TP     Train-Test gap
-----                        ---------    --------------     ---------------  --------------     --------------
Logistic                    {logreg_auc_score}        {logreg_train_score}            {logreg_test_score}           {logreg_tn, logreg_fp, logreg_fn, logreg_tp}    {logreg_test_gap}
Full Tree                   {full_tree_auc_score}        {full_tree_train_score}            {full_tree_test_score}           {full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp}   {full_tree_test_gap}
Pruned Tree                 {pruned_tree_auc_score}        {pruned_tree_train_score}            {pruned_tree_test_score}           {pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp}    {pruned_tree_test_gap}
Tuned LR                    {lr_auc}        {lr_train_acc}            {lr_test_acc}           {lr_tuned_tn,lr_tuned_fp,lr_tuned_fn,lr_tuned_tp}    {lr_auc_test_gap}
Tuned Tree                  {tree_auc}        {tree_train_acc}            {tree_test_acc}           {tuned_tree_tn,tuned_tree_fp,tuned_tree_fn,tuned_tree_tp}   {tuned_tree_test_gap}
Random Forest (Full)        {rf_auc}        {rf_train_acc}            {rf_test_acc}           {rf_tn,rf_fp,rf_fn,rf_tp}   {rf_test_gap}
Tuned Random Forest (Full)  {tuned_rf_auc}        {tuned_rf_train_acc}            {tuned_rf_test_acc}           {tuned_rf_tn,tuned_rf_fp,tuned_rf_fn,tuned_rf_tp}    {tuned_rf_test_gap}
GBM (Full)(Final Model)     {gbm_auc}        {gbm_train_acc}            {gbm_test_acc}           {gbm_default_tn,gbm_default_fp,gbm_default_fn,gbm_default_tp}   {gbm_test_gap}
Tuned GBM                   {gbm_tuned_auc}        {gbm_tuned_train_acc}            {gbm_tuned_test_acc}           {gbm_tuned_tn,gbm_tuned_fp,gbm_tuned_fn,gbm_tuned_tp}    {gbm_tuned_test_gap}
""")