# DnD Monster Data Wrangling


## Importation

In [701]:
import math
import numpy as np
import pandas as pd
import ast
import re
from src.features.build_features import clean_list
from word2number import w2n

%matplotlib inline

monster_df = pd.read_csv('../data/raw/Monster_Data_RAW.csv')

monster_df.head()

Unnamed: 0.1,Unnamed: 0,Monster Name,Size,Type,Alignment,Traits,Damage Resistances,Monster Tags:,Mythic Actions,Reactions,...,Proficiency Bonus,STR,DEX,CON,INT,WIS,CHA,Actions,Legendary Actions,Environment:
0,0,Adult Green Dragon,Huge,['dragon'],lawful evil,['Amphibious. The dragon can breathe air and w...,,,,,...,5,23,12,21,18,15,17,['Multiattack. The dragon can use its Frightfu...,"[""The dragon can take 3 legendary actions, cho...",['Forest']
1,1,Adult Silver Dragon,Huge,['dragon'],lawful good,['Legendary Resistance (3/Day). If the dragon ...,,,,,...,5,27,10,25,16,13,21,['Multiattack. The dragon can use its Frightfu...,"[""The dragon can take 3 legendary actions, cho...","['Mountain', 'Urban']"
2,2,Adult White Dragon,Huge,['dragon'],chaotic evil,"[""Ice Walk. The dragon can move across and cli...",,,,,...,5,22,10,22,8,12,12,['Multiattack. The dragon can use its Frightfu...,"[""The dragon can take 3 legendary actions, cho...",['Arctic']
3,3,Air Elemental,Large,['elemental'],neutral,"[""Air Form. The elemental can enter a hostile ...","Lightning, Thunder; Bludgeoning, Piercing, and...",,,,...,3,14,20,14,6,10,6,['Multiattack. The elemental makes two slam at...,,"['Desert', 'Mountain']"
4,4,Ape,Medium,['beast'],unaligned,[nan],,['Misc Creature'],,,...,2,16,14,14,6,12,7,['Multiattack. The ape makes two fist attacks....,,['Forest']


## Performing the Basic Cleanup
Removing columns we won't use, cleanining up feature titles, and checking out our dataset for datatypes and issues.

We know that Mythic Actions weren't introducted until a later version, so no monsters contain this category.

Unnamed:0 is useless, Monster Tags: is the same as Monster type (with a bit more specificity, which we don't need.)

Skills, Source, Launguages, and Senses are all unecessary for our MVP.

In [702]:
monster_df.drop(columns = {'Unnamed: 0', 'Mythic Actions', 'Monster Tags:', "Skills", 'Source', 'Languages', "Senses"}, inplace = True, axis = 1)
monster_df.rename(columns = {"Environment:":"Environment"}, inplace=True)

In [703]:
print(monster_df.columns)
print(monster_df.describe())

Index(['Monster Name', 'Size', 'Type', 'Alignment', 'Traits',
       'Damage Resistances', 'Reactions', 'Armor Class', 'Hit Points', 'Speed',
       'Saving Throws', 'Damage Vulnerabilities', 'Damage Immunities',
       'Condition Immunities', 'Challenge', 'Proficiency Bonus', 'STR', 'DEX',
       'CON', 'INT', 'WIS', 'CHA', 'Actions', 'Legendary Actions',
       'Environment'],
      dtype='object')
       Armor Class  Hit Points  Proficiency Bonus         STR         DEX  \
count   348.000000  348.000000         348.000000  348.000000  348.000000   
mean     13.985632   78.399425           2.712644   14.951149   12.706897   
std       3.155403   96.670352           1.296486    6.705018    3.078279   
min       5.000000    1.000000           2.000000    1.000000    1.000000   
25%      12.000000   17.000000           2.000000   11.000000   10.000000   
50%      13.000000   45.000000           2.000000   16.000000   13.000000   
75%      16.000000  110.000000           3.000000   19.00

## Challenge Rating (CR)
Currently Challenge Rating is a string, we want it to be an integer so we can use it. We need to remove the experience string attached and convert.

In [704]:
#split the string and only take the first part (Challenge Rating)
monster_df["Challenge"] = monster_df["Challenge"].str.split().str[0]

#turn fraction strings into floats
for indx, challenge in enumerate(monster_df["Challenge"]):
    if "/" in challenge:
       monster_df.loc[indx,'Challenge'] = pd.eval(challenge)
    else:
        monster_df.loc[indx,'Challenge'] = pd.to_numeric(challenge)

monster_df["Challenge"] = pd.to_numeric(monster_df["Challenge"])

## Monster Type
There are a large number of monster sub-types, this is unecessary for our analysis. we want to consolidate. 

In [705]:
for indx,Type in enumerate(monster_df['Type']):
    monster_df.loc[indx,"Type"] = Type.split(",")[0].strip("[']")

## Missing Values

All features with missing values make sense esxcept Actions, which I would have assumed every creature has an action, that may not be the case however. For the others, they are all optional features that lower level monsters won't have.

The NAs may still cause issues down the road, so I will replace them all with a string text like "NA"

In [706]:
monster_df.isna().any()

monster_df.fillna("['NA']",inplace=True)

## List Values
We current have features where the values are varying list of items. For example, a monster may be found in more than one environment such as [mountain, coastal, underdark]. Unfortuntaly, they are reading as strings right now so we will need to convert them to lists for easy of use.

In [707]:
#All lists columns are actually strings!
for i,j in enumerate(monster_df["Environment"]):
   print("list",i,"is",type(j))

list 0 is <class 'str'>
list 1 is <class 'str'>
list 2 is <class 'str'>
list 3 is <class 'str'>
list 4 is <class 'str'>
list 5 is <class 'str'>
list 6 is <class 'str'>
list 7 is <class 'str'>
list 8 is <class 'str'>
list 9 is <class 'str'>
list 10 is <class 'str'>
list 11 is <class 'str'>
list 12 is <class 'str'>
list 13 is <class 'str'>
list 14 is <class 'str'>
list 15 is <class 'str'>
list 16 is <class 'str'>
list 17 is <class 'str'>
list 18 is <class 'str'>
list 19 is <class 'str'>
list 20 is <class 'str'>
list 21 is <class 'str'>
list 22 is <class 'str'>
list 23 is <class 'str'>
list 24 is <class 'str'>
list 25 is <class 'str'>
list 26 is <class 'str'>
list 27 is <class 'str'>
list 28 is <class 'str'>
list 29 is <class 'str'>
list 30 is <class 'str'>
list 31 is <class 'str'>
list 32 is <class 'str'>
list 33 is <class 'str'>
list 34 is <class 'str'>
list 35 is <class 'str'>
list 36 is <class 'str'>
list 37 is <class 'str'>
list 38 is <class 'str'>
list 39 is <class 'str'>
list 40 is

### Attack, Spell Attack, Save DC
Actually, there is some information within these strings we can pull out easily with regex search, match, findall. Let's do that before converting into lists

In [708]:
# Create new columns for features
monster_df = monster_df.assign(Attack_Bonus= '', Spell_Bonus = '', Spell_Save_DC = '')

#Attack Bonus
for indx, action in enumerate(monster_df['Actions']):
    try:
        found = re.search("\+(.+?) to hit", action).group(0)
        monster_df.loc[indx,'Attack_Bonus'] = int(found.split()[0].lstrip('+'))
    except:
        monster_df.loc[indx,'Attack_Bonus'] = 0

#Spell Attack Bonus

for indx, trait in enumerate(monster_df['Traits']):
    try:
        found = re.search("\+(.+?) to hit", trait).group(0)
        monster_df.loc[indx,'Spell_Bonus'] = int(found.split()[0].lstrip('+'))
    except:
        monster_df.loc[indx,'Spell_Bonus'] = 0

#Spell Save DC

for indx, trait in enumerate(monster_df['Traits']):
    try:
        found = re.search("spell save DC [0-9]+", trait).group(0)
        monster_df.loc[indx,'Spell_Save_DC'] = int(found.split()[-1])
    except:
        monster_df.loc[indx,'Spell_Save_DC'] = 0

## Saving Throw Expansion
We want to be able to evaluate Saving Throw Numbers just like we due stats. Some monsters have bonuses to certain saving throws, which we will input first. Then we will use the stats to fill in the rest of the saving throws. Stat numbers have a base relationship to saving throw where every 2 stat increases is +1 into Saving throw. Example, 10 in Strength is a +0 in Str Saving Throw, but a 12 in Strength is a +1, and finally a 20 in Strength is a +5 in Strength saving throw.

In [709]:
monster_df["Saving Throws"] = monster_df['Saving Throws'].apply(clean_list)

#turn saving throw feature values into lists using literal_eval
monster_df["Saving Throws"] = monster_df["Saving Throws"].apply(ast.literal_eval)

#Saving Throw Exapanded features
saving_throw_df = pd.DataFrame(columns={"STR_SV","DEX_SV","CON_SV","INT_SV","WIS_SV","CHA_SV"})

for indx, saving_throw in enumerate(monster_df['Saving Throws']): 
    for string in saving_throw:
        if "DEX" in string:
            saving_throw_df.loc[indx,"DEX_SV"] = int(string.split()[1].lstrip('+'))
        elif "CON" in string:
            saving_throw_df.loc[indx,"CON_SV"] = int(string.split()[1].lstrip('+'))
        elif "STR" in string:
            saving_throw_df.loc[indx,"STR_SV"] = int(string.split()[1].lstrip('+'))
        elif "WIS" in string:
            saving_throw_df.loc[indx,"WIS_SV"] = int(string.split()[1].lstrip('+'))
        elif "INT" in string:
            saving_throw_df.loc[indx,"INT_SV"] = int(string.split()[1].lstrip('+'))
        elif "CHA" in string:
            saving_throw_df.loc[indx,"CHA_SV"] = int(string.split()[1].lstrip('+'))
            
monster_df = pd.concat([monster_df,saving_throw_df], axis=1)

In [710]:
#Using Stats to fill in missing saving throws
stat_modifiers ={('1') : -5, ('2','3') : -4, ('4','5') : -3, ('6','7') : -2, ('8','9') : -1, ('10','11') : 0, ('12','13') : 1, ('14','15') : 2, ('16','17') : 3, ('18','19') : 4, 
('20','21') : 5, ('22','23') : 6, ('24','25') : 7, ('26','27') : 8, ('28','29') : 9, ('30') : 10}

for clms in monster_df.iloc[:,29:34]:
    monster_stat = clms.split('_')[0]
    for indx, value in enumerate(monster_df[clms]):
        if math.isnan(value):
            for stat_num, modifier in stat_modifiers.items():
               if str(monster_df.loc[indx,monster_stat]) in stat_num:
                    monster_df.loc[indx,clms] = modifier
          

In [711]:

#evaluate string and turn into lists
column_lists = ["Environment", "Reactions", "Actions", "Legendary Actions"]
for columns in column_lists:
    monster_df[columns] = monster_df[columns].apply(ast.literal_eval)
            
#"Damage Resistances","Damage Vulnerabilities", "Damage Immunities", have wonky typing due to semicolon
# Traits, Condition immunities, saving throws create type error

#check that they are lists
for i,j in enumerate(monster_df["Environment"]):
   print("list",i,"is",type(j))

#create dummy variables for envinroment, which includes the list for variables
dummies = pd.get_dummies(monster_df['Environment'].explode()).reset_index().groupby(['index']).sum()
monster_df = pd.concat([monster_df,dummies], axis=1)

ValueError: malformed node or string on line 1: <ast.Name object at 0x000001A9BF9B80A0>

## Actions: Damage
While there is a ton of information in Actions that we may use for word clouds later, the most critical thing for our MVP is trying to pull out the potential damage of the monsters. This is difficult since monsters are so diverse, some have multiattack, which could mean many different things, some have spells, which we don't immediatelly have the damage for, some do secondary damage upon a failed saving throw. So distilling this down into a simple X damage per round will prove difficult. 

First, we can see that regular attacks follow a pattern of 'Hit: XX (XdX + X) """ """ damage.' This is important because we can pull out the average damage and use it for the monster. I'm thinking we may need to make a seperate dataframe to work with this information to start.

In [None]:
monster_actions = monster_df[['Monster Name', 'Actions']]

#Find out max number of attacks is 10
max_attacks = monster_actions['Actions'].explode()
max_attacks.groupby(max_attacks.index).count().max()

#Create 10 columns to work through attacks individually
monster_actions = monster_actions.assign(Attack_1 = "", Attack_2 = "",  Attack_3 = "", Attack_4 = "", Attack_5 = "", Attack_6 = "", Attack_7 = "", Attack_8 = "", Attack_9 = "", Attack_10 = "")

for indx,actions in enumerate(monster_actions['Actions']):
    n = 0
    for action in actions:
        monster_actions.iloc[indx, n+2] = action
        n+=1
        
#columns Attack_9 and Attack_10 can be removed since they are relating to dragon polymorph
monster_actions.drop(columns={"Actions", "Attack_9","Attack_10"},axis=1, inplace = True)

IndexError: iloc cannot enlarge its target object

In [None]:
#update column to show dictionary of types and number of attacks for multiattacks

def MultiAttackSearch(attack_value, search, replaced, replace, split):
    multiattack = re.search(search, attack_value).group(1)
    multiattack = multiattack.replace(replaced,replace)
    multiattack = re.split(split, multiattack)
    for indx1, item in enumerate(multiattack):
        if item !=" ":
            MA_number = {}
            multiattack[indx1] = item.split()
            value, key = multiattack[indx1][0], multiattack[indx1][1]
            MA_number[key] = w2n.word_to_num(value)
            multiattack[indx1] = MA_number
    return multiattack

for indx, attack in enumerate(monster_actions["Attack_1"]):
        if "Multiattack" in attack:
            if ": " in attack:
                monster_actions.loc[indx,"Attack_1"] = MultiAttackSearch(attack,"\: (.*?)\.", " with its "," ",'and |,')
            elif ("makes " in attack) and ("1d4" not in attack) and ("either" not in attack) and ("as" not in attack):   
                multiattack = re.search("makes (.*?) ", attack).group(1)
                multiattack = multiattack.replace("makes ","")
                multiattack = w2n.word_to_num(multiattack)
                MA_number = {}
                value, key = multiattack, "Attack"
                MA_number[key] = value
                Monster_list = []
                Monster_list.append(MA_number)
                monster_actions.loc[indx,"Attack_1"] = Monster_list
            elif ("medusa" in attack):
                monster_actions.loc[indx,"Attack_1"] = [{'snake hair':1},{'shortsword':2}]
            elif ("drider" in attack):
                monster_actions.loc[indx,"Attack_1"] = [{'longsword':3}]
            elif ("flameskull" in attack):
                monster_actions.loc[indx,"Attack_1"] = [{'Fire Ray':2}]
            elif ("oni" in attack):
                monster_actions.loc[indx,"Attack_1"] = [{'Glaive':2}]
            elif ("fungus" in attack):
                monster_actions.loc[indx,"Attack_1"] = [{"Rotting Touch": 4}]
            elif ("hydra" in attack):
                monster_actions.loc[indx,"Attack_1"] = [{"Bite": 5}]
            elif ("assassin" in attack):
                monster_actions.loc[indx,"Attack_1"] = [{"Bite": 5}]
            elif ("rakshasa" in attack):
                monster_actions.loc[indx,"Attack_1"] = [{"Bite": 5}]
            elif ("veteran" in attack):
                monster_actions.loc[indx,"Attack_1"] = [{"longsword": 2},{"shortsword":1}]





In [None]:
#The way split works if there is ', and' it causes a blank value in the list, so we will delete it
for indx, values in enumerate(monster_actions["Attack_1"]):
    if isinstance(values,list):
        if " " in values:
            values.remove(" ")

monster_actions["Multiattack"] = monster_actions["Attack_1"]

In [None]:
#remove the multiattack from Attack_1 so that it only will contain attacks for calculations
for indx, value in enumerate(monster_actions["Attack_1"]):
    if type(value) == list:
        monster_actions.loc[indx,"Attack_1"] = ""

#remove the attacks from multiattack so that it only will contain attacks for calculations
for indx, value in enumerate(monster_actions["Multiattack"]):
    if type(value) != list:
        monster_actions.loc[indx,"Multiattack"] = ""

In [None]:
# Several monsters have "if this monster takes damage than X happens", which makes finding and replacing attacks with "target takes X damage" difficult. We will replace these 5 spots manually

#replace "Hit:" attacks with just damage and bonus damage such as extra lightning damage
for col in monster_actions.iloc[:,1:9]:
    for indx, attack in enumerate(monster_actions[col]):
        try:
            if ("worm" in attack) or ("kraken takes" in attack) or ("remorhaz takes" in attack) or ("behir takes" in attack) or ("tarrasque takes" in attack): 
                monster_actions.loc[indx,col] = 0
            elif "plus" in attack:
                ext_damage = int(re.search("plus (.+?) ", attack).group(1))         
                if "Hit:" in attack:
                    prim_damage = int(re.search("Hit: (.+?) ", attack).group(1))
                monster_actions.loc[indx,col] = prim_damage + ext_damage
            else:
                if "Hit:" in attack:
                    monster_actions.loc[indx,col] = int(re.search("Hit: (.+?) ", attack).group(1))
                elif "taking " in attack:
                    monster_actions.loc[indx,col] = int(re.search("taking (.+?) ", attack).group(1))
                elif "take " in attack:
                    monster_actions.loc[indx,col] = int(re.search("take [0-9]+ ", attack).group(0).split()[1])
                elif "takes " in attack:
                    monster_actions.loc[indx,col] = int(re.search("takes [0-9]+ ", attack).group(0).split()[1])                                    
        except:
            continue

In [None]:
#replace remaining strings as 0
for col in monster_actions.iloc[:,1:11]:
    monster_actions[col].replace(to_replace='^', value=0, regex=True, inplace=True)

In [None]:
# Create Round 1 damage for multi attack and single attack monsters
for indx, lst in enumerate(monster_actions["Multiattack"]):
    if lst != 0:
        attack_total = 0
        for i in range(len(lst)):
            col = str("Attack_"+str(i+2))
            attack_num = monster_actions.loc[indx,col]
            multiplier = int(list(lst[i].values())[0])
            attack_total += attack_num * multiplier 
        monster_actions.loc[indx,"Round_1"] = attack_total
    else:
        monster_actions.loc[indx,"Round_1"] = monster_actions.loc[indx,"Attack_1"] 

monster_actions["Round_2"] = monster_actions["Round_1"] 

In [None]:
monster_actions["Round_3"] = monster_actions["Round_1"]
for col in monster_actions.iloc[:,1:9]:
    for indx, attack in enumerate(monster_actions[col]):
        if attack > monster_actions.loc[indx,"Round_1"]:
            monster_actions.loc[indx,"Round_3"] = attack


In [None]:
# Final Round Total
monster_actions["Total_Action Damage_3Rounds"] = (monster_actions["Round_1"] + monster_actions["Round_2"] +  monster_actions["Round_3"])

In [None]:
# Creatures with swallow all have additional Round damage
# Giant Toad 10
# Giant Frog 5
# Remorhaz 21
# Tarrasque 56
# Behir 21
# Old Croaker 10 atk2
monster_actions[monster_actions["Monster Name"] == "Giant Toad"]

swallow_monsters = ['Giant Toad', 'Giant Frog', 'Remorhaz', 'Tarrasque', 'Behir', 'Old Croaker']

for monster in swallow_monsters:
    indx = monster_actions[monster_actions["Monster Name"] == monster].index
    for col in monster_actions[['Attack_8','Attack_7','Attack_6','Attack_5','Attack_4','Attack_3','Attack_2']]:
        attack = int(monster_actions.loc[indx, col])
        if attack != 0:
            print(monster_actions.loc[indx, "Round_1"] )
            monster_actions.loc[indx, "Round_1"] += attack
            monster_actions.loc[indx, "Round_2"] += attack
            monster_actions.loc[indx, "Round_3"] += attack
            break                
                   


251    12.0
Name: Round_1, dtype: float64
302    4.0
Name: Round_1, dtype: float64
340    50.0
Name: Round_1, dtype: float64
341    148.0
Name: Round_1, dtype: float64
295    56.0
Name: Round_1, dtype: float64
314    12.0
Name: Round_1, dtype: float64


In [None]:
#While not a perfect set, its a great start for the MVP lets clean up for Adding new columns
monster_actions.drop(columns={"Attack_1","Attack_2","Attack_3","Attack_4","Attack_5","Attack_6","Attack_7","Attack_8","Multiattack", "Round_1", "Round_2","Round_3"}, axis = 1, inplace = True)

In [None]:
monster_actions.describe()

Unnamed: 0,Total_Action Damage_3Rounds
count,348.0
mean,58.922414
std,61.782432
min,0.0
25%,15.0
50%,36.0
75%,78.0
max,444.0


## Reactions and Legendary Actions
Now we need to add Reaction and Legendary Action damage to the total action damage for many of these monsters.

In [None]:
#Need to Add Legendary Actions and Reactions to Total Damage before calculating average
monster_reactions = monster_df[['Monster Name', 'Reactions']]

In [None]:
# None of the Reactions deal with damage. There are some AC based ones we may consider
for value in enumerate(monster_reactions["Reactions"]):
    print(value)

In [None]:
#Need to Add Legendary Actions and Reactions to Total Damage before calculating average
monster_leg_actions = monster_df[['Monster Name', 'Legendary Actions']]