## Web scrape the data

#### Personas

In [1]:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import csv
import json

In [2]:
# Download and parse the wiki page that lists every Persona 4 persona.
page_url = 'https://megamitensei.fandom.com/wiki/List_of_Persona_4_Personas'
# The with-block closes the HTTP response even if reading or parsing raises.
with uReq(page_url) as uClient:
    page_soup = soup(uClient.read(), "html.parser")

In [3]:
# Collect the text of every link inside the 'table p4' tables, skipping
# links whose third preceding parse-tree element is the marker '**'.
persona = []

arcana = page_soup('table', {'class':'table p4'})

for arcana_table in arcana:
    for anchor in arcana_table.findAll('a'):
        marker = anchor.previous_element.previous_element.previous_element
        if marker != '**':
            persona.append(anchor.text)

In [4]:
# Data cleaning:
# NOTE: the numeric indices below are positions in `persona` at the moment each
# statement runs, and remove() shifts later entries, so statement order matters.

# This entry should not be in the list at all; its markup has an extra span,
# which broke the previous_element check used when collecting the names.
persona.remove('Takehaya Susano-o')

# The link for this one differs from the pattern used for all the other links,
# because 'https://megamitensei.fandom.com/wiki/Seth' already existed before this collection.
persona[173] = 'Seth_(demon)'

# The Japanese-to-English translation of this persona's name was incorrect, and one
# website uses the incorrect name while the other uses the correct one. Because of
# this, the link does not match the name.
persona[177] = 'Feng_Huang'

# This one has a significantly different webpage, so it can't be scraped automatically.
# If it is needed, it could be appended at the end, by code or by hand. For now it is left out.
persona.remove('Magatsu-Izanagi')

# For some reason, this persona's name differs between the main page and its own page.
persona[198] = 'Kushinada-Hime'

In [5]:
# One wiki URL per persona name; spaces in the name become underscores.
links = ['https://megamitensei.fandom.com/wiki/' + p.replace(' ','_') for p in persona]

In [6]:
# Put the display names back (the link-specific spellings were only needed
# while building the URLs); the names are used again when writing the CSV.
for position, display_name in ((173, 'Seth'), (177, 'Phoenix'), (198, 'Kushinada')):
    persona[position] = display_name

In [7]:
# Scrape every persona page and write one CSV row per persona.
# One HTTP request is made per link, so this takes a few minutes.

headers = ['Name','Arcana','Base level','Strength','Magic','Endurance','Agility','Luck','Inherit','Reflects','Absorbs',
           'Block','Resists','Weak','List of Skills']

# Ids of the <span> that marks the Persona 4 section, tried in this order;
# the first id that is present on the page is used.
section_ids = ['Persona_4_3', 'Persona_4_2', 'Persona_4', 'Persona_4_Golden_2', 'Persona_4_Golden']

# newline='' is what the csv module asks for on files handed to csv.writer.
# The encoding is pinned so the output does not depend on the platform locale.
with open('persona4.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(headers)

    # `counter` indexes `persona` in step with `links`.
    for counter, link in enumerate(links):

        page_url = link
        # The with-block closes the response even if reading or parsing raises.
        with uReq(page_url) as uClient:
            page_soup = soup(uClient.read(), "html.parser")

        span = []
        for section_id in section_ids:
            span = page_soup('span', {'id': section_id})
            if span:
                break
        if not span:
            # Same exception type as the original span[0] lookup, but it names the page.
            raise IndexError('no Persona 4 section found on ' + page_url)

        # Walk forward in the parse tree from the section marker to the stats container.
        table = span[0].next_element.next_element.next_element.next_element.next_element.next_element.next_element.next_element

        # Look the nested tables up once instead of once per extracted cell.
        customtables = table.findAll('table',{'class':'customtable'})

        # The last two cells of the first table hold the arcana and the base level.
        summary_cells = customtables[0].findAll('td')
        arc = summary_cells[-2].text.strip()
        lvl = int(summary_cells[-1].text.strip())

        # Five stat values, read from every third cell starting at index 1.
        stat_cells = customtables[0].table.findAll('td')
        stat = [int(stat_cells[3*j+1].text.strip()) for j in range(5)]

        # First six cells of the second table fill the Inherit..Weak columns.
        affinity_cells = customtables[1].findAll('td')
        elem = [affinity_cells[j].text.strip() for j in range(6)]

        # Third table: the first two rows are skipped, then one row per skill.
        skill_set = []
        for row in customtables[2].findAll('tr')[2:]:
            skill = list(filter(None, row.text.split('\n')))

            learned_at = skill[3]
            if learned_at == 'Innate':
                # Innate skills are recorded at the persona's base level.
                level = lvl
            elif 'S' in learned_at:
                # Entries containing 'S' carry no numeric level.
                level = None
            else:
                level = int(learned_at)

            skill_set.append({'name': skill[0], 'level': level})

        # The skill list is stored as a JSON string in a single CSV cell.
        skill_set = json.dumps({'skills': skill_set})

        writer.writerow([persona[counter], arc, lvl, *stat, *elem, skill_set])

#### Skills

In [8]:
# Download and parse the wiki page that lists every Persona 4 skill.
page_url = 'https://megamitensei.fandom.com/wiki/List_of_Persona_4_Skills'
# The with-block closes the HTTP response even if reading or parsing raises.
with uReq(page_url) as uClient:
    page_soup = soup(uClient.read(), "html.parser")

In [9]:
# Scrape the skill tables into skills.csv (columns: Skill, Effect, Cost).
# The tables on the page have different widths, so each group of tables is
# described by hand below instead of being detected automatically.

table = page_soup.findAll('table',{'class':'table p4'})

headers = ['Skill','Effect','Cost']

# (table numbers, cells per row, position of the cost cell within a row).
# A cost position of None means no cost cell is read and the cost is
# recorded as '0 Passive'.
table_layouts = [
    (range(0, 2), 6, 5),
    (range(2, 6), 5, 4),
    (range(6, 8), 4, 3),
    (range(8, 9), 5, 4),
    (range(9, 12), 3, 2),
    (range(12, 13), 3, None),
    (range(13, 14), 2, None),
]


def _clean_cell(cell):
    """Return the cell text stripped, with ';' turned into ' ' and newlines into '|'."""
    return cell.text.strip().replace(';', ' ').replace('\n','|')


# newline='' is what the csv module asks for on files handed to csv.writer.
# The encoding is pinned so the output does not depend on the platform locale.
with open('skills.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(headers)

    for table_numbers, row_width, cost_offset in table_layouts:
        for j in table_numbers:
            tmp_table = table[j].findAll('td')
            # The cells come as one flat list; cut it into rows of row_width cells.
            for k in range(len(tmp_table) // row_width):
                row_start = row_width * k
                skill = _clean_cell(tmp_table[row_start])
                effect = _clean_cell(tmp_table[row_start + 1])
                if cost_offset is None:
                    cost = '0 Passive'
                else:
                    cost = _clean_cell(tmp_table[row_start + cost_offset])
                writer.writerow([skill,effect,cost])

## Cleaning

#### Personas

In [10]:
import pandas as pd
import numpy as np
import re

In [11]:
# Load the scraped persona table written by the web-scraping step.
p4 = pd.read_csv('persona4.csv')

In [12]:
# First, iron out inconsistent spellings in the affinity columns so that each
# element has exactly one name and multi-element cells are '/'-separated.

# (long form, abbreviation) pairs used by _expand_abbreviations below.
DARK = ('Darkness', 'Dark')
ELEC = ('Electricity', 'Elec')
PHYS = ('Physical', 'Phys')


def _replace_exact(column, corrections):
    """Overwrite cells of p4[column] that are exactly equal to a key of `corrections`."""
    for old, new in corrections.items():
        p4.loc[p4[column] == old, column] = new


def _use_slash_separator(column, separators=(', ',)):
    """Replace each listed separator inside p4[column] with '/'."""
    for separator in separators:
        # regex=False: plain text replacement, independent of the pandas default.
        p4[column] = p4[column].str.replace(separator, '/', regex=False)


def _expand_abbreviations(column, pairs):
    """Spell out abbreviated element names anywhere inside p4[column]."""
    for long_form, short_form in pairs:
        # Collapse to the abbreviation first: expanding directly would also hit
        # text that is already spelled out ('Darkness' -> 'Darknessness').
        p4[column] = p4[column].str.replace(long_form, short_form, regex=False)
        p4[column] = p4[column].str.replace(short_form, long_form, regex=False)


# In Inherit:
# astype(str) turns missing values into the text 'nan', which is mapped to '-' below.
p4['Inherit'] = p4['Inherit'].astype(str)
_replace_exact('Inherit', {
    'Dark': 'Darkness',
    'Elec': 'Electricity',
    'Negative stats': 'Negative Status',
    'Phys': 'Physical',
    'Support Skills': 'Support',
    'Status': 'Negative Status',
    'Bad Status': 'Negative Status',
    'nan': '-',
    'None': '-',
})
# In Reflects:
_use_slash_separator('Reflects')
_replace_exact('Reflects', {
    'Dark': 'Darkness',
    'Dark/Fire': 'Darkness/Fire',
    'Elec/Light': 'Electricity/Light',
})
_expand_abbreviations('Reflects', [ELEC, PHYS])
# In Absorbs:
_use_slash_separator('Absorbs')
_replace_exact('Absorbs', {'Elec': 'Electricity'})
# In Block:
_use_slash_separator('Block')
_expand_abbreviations('Block', [DARK, PHYS, ELEC])
# Put two-element cells into one canonical order.
_replace_exact('Block', {
    'Fire/Darkness': 'Darkness/Fire',
    'Ice/Electricity': 'Electricity/Ice',
    'Light/Fire': 'Fire/Light',
})
# In Resists:
_use_slash_separator('Resists', separators=(', ', ','))
_expand_abbreviations('Resists', [DARK, ELEC, PHYS])
# In Weak:
_use_slash_separator('Weak')
_expand_abbreviations('Weak', [DARK, ELEC, PHYS])
_replace_exact('Weak', {
    'Lightning': 'Electricity',
    'Fire/Darkness': 'Darkness/Fire',
})

In [13]:
# Next, fix the column types where needed: the skill list was stored as a
# JSON string, so decode each cell back into a Python object.
p4['List of Skills'] = p4['List of Skills'].map(json.loads)

In [14]:
# Sanity check: print the table dimensions and display the first rows.
print(p4.shape)
p4.head()

(211, 15)


Unnamed: 0,Name,Arcana,Base level,Strength,Magic,Endurance,Agility,Luck,Inherit,Reflects,Absorbs,Block,Resists,Weak,List of Skills
0,Izanagi,Fool,1,3,2,2,3,2,Electricity,-,-,Darkness,Electricity,Wind,"{'skills': [{'name': 'Zio', 'level': 1}, {'nam..."
1,Yomotsu-Shikome,Fool,7,2,7,7,8,4,Negative Status,-,-,-,Ice,Fire,"{'skills': [{'name': 'Poisma', 'level': 7}, {'..."
2,Obariyon,Fool,13,11,6,11,9,9,Physical,-,-,-,Physical/Fire,-,"{'skills': [{'name': 'Sonic Punch', 'level': 1..."
3,Legion,Fool,21,14,14,18,13,11,Ailment,-,-,-,Fire/Ice/Darkness,Light,"{'skills': [{'name': 'Tentarafoo', 'level': 21..."
4,Ose,Fool,31,28,12,20,25,15,Physical,-,-,Wind,Physical,Light,"{'skills': [{'name': 'Power Slash', 'level': 3..."


In [15]:
# Save it in a separate file before further changes
p4.to_csv('persona4.csv')

#### Skills

In [16]:
# Load the scraped skill table (columns: Skill, Effect, Cost).
skills = pd.read_csv('skills.csv')

In [17]:
# Split every raw cost string into its numeric part and its resource type.
# Both values are kept as the matched text (strings).
amount_pattern = re.compile('[0-9]+')
resource_pattern = re.compile('% HP|SP|Passive')

Costs = []
Resource = []

for raw_cost in skills['Cost']:
    # First run of digits in the cell.
    Costs.append(amount_pattern.search(raw_cost).group(0))
    # First of '% HP', 'SP' or 'Passive' found in the cell.
    Resource.append(resource_pattern.search(raw_cost).group(0))

In [18]:
# Put the parsed cost amount and resource into their own frame, one row per skill ...
tmp_d = pd.DataFrame(list(zip(Costs, Resource)), columns =['Costs', 'Resource'])
# ... and join it to `skills` row by row, using the skills index as the join key.
# The column selection drops the raw 'Cost' column and anything added by the merge.
skills = skills.merge(tmp_d, on=skills.index)[['Skill','Effect','Costs','Resource']]

In [19]:
# Sanity check: print the table dimensions and display the first rows.
print(skills.shape)
skills.head()

(296, 4)


Unnamed: 0,Skill,Effect,Costs,Resource
0,Bash,Deals light Phys damage to 1 foe.,6,% HP
1,Cleave,Deals light Phys damage to 1 foe.,5,% HP
2,Skewer,Deals light Phys damage to 1 foe.,5,% HP
3,Sonic Punch,Deals light Phys damage to 1 foe.,8,% HP
4,Double Fangs,Deals light Phys damage to 1 foe 2x.,8,% HP


## Normalization

#### skill_connection

In [20]:
# Flatten the nested skill lists into three parallel lists:
# one entry per (persona, skill) pair.
P_index = []
S_name = []
S_lvl = []

for persona_idx, skill_entry in zip(p4.index, p4['List of Skills']):
    for learned in skill_entry['skills']:
        P_index.append(persona_idx)
        S_name.append(learned['name'])
        S_lvl.append(learned['level'])

In [21]:
# Link table: one row per (persona index, skill name, level) triple collected above.
skill_connection = pd.DataFrame(list(zip(P_index, S_name,S_lvl)), columns = ['P_index', 'S_name','S_lvl'])

In [22]:
# Swap each skill name in the link table for the row index of that skill in
# `skills`. A temporary copy carries the index as a column, so `skills`
# itself is left without an 'S_index' column.
skill_lookup = skills.assign(S_index=skills.index)
skill_connection = skill_connection.merge(
    skill_lookup, left_on='S_name', right_on='Skill'
)[['P_index','S_index','S_lvl']]

In [23]:
# The nested skill lists are no longer needed on the persona table itself.
p4.drop(columns = 'List of Skills', inplace = True)

In [24]:
# Sanity check: print the link-table dimensions and display the first rows.
print(skill_connection.shape)
skill_connection.head()

(1496, 3)


Unnamed: 0,P_index,S_index,S_lvl
0,0,70,1.0
1,10,70,3.0
2,40,70,12.0
3,48,70,7.0
4,132,70,16.0


#### elements

In [25]:
# Single-column table of the distinct values found in 'Inherit', in sorted order.
distinct_inherit = p4['Inherit'].sort_values().unique()
elements = pd.DataFrame(distinct_inherit, columns=['Elements'])

In [26]:
# Build one link table per affinity column: a row (P_index, element) for every
# element listed in a persona's cell. Cells equal to '-' yield no rows.
# The columns are addressed by name rather than by position, so the result does
# not depend on how many columns precede them in `p4`.
affinity_columns = ['Inherit', 'Reflects', 'Absorbs', 'Block', 'Resists', 'Weak']

connections = []

for column in affinity_columns:
    pairs = []
    for k in range(len(p4)):
        value = p4[column][k]
        if value != '-':
            # split('/') returns a one-item list for single-element cells, so
            # the single and multi-element cases need no separate branch.
            pairs.extend((k, element) for element in value.split('/'))

    connections.append(pd.DataFrame(pairs, columns = ['P_index', column]))

In [27]:
# Dimensions of the last link table (index 5).
connections[5].shape

(207, 2)

In [28]:
# Print the dimensions of every link table next to the name of its source
# column, then display the first rows of the first table.
for position, column_name in enumerate(p4.columns[8:]):
    print(column_name, str(connections[position].shape), sep=': ')
connections[0].head()

Inherit: (203, 2)
Reflects: (57, 2)
Absorbs: (35, 2)
Block: (162, 2)
Resists: (111, 2)
Weak: (207, 2)


Unnamed: 0,P_index,Inherit
0,0,Electricity
1,1,Negative Status
2,2,Physical
3,3,Ailment
4,4,Physical


In [29]:
# The affinity data now lives in the `connections` tables, so drop these columns from p4.
p4.drop(columns = ['Inherit','Reflects','Absorbs','Block','Resists','Weak'], inplace = True)

In [30]:
# Sanity check: print the dimensions of the slimmed-down table and display the first rows.
print(p4.shape)
p4.head()

(211, 8)


Unnamed: 0,Name,Arcana,Base level,Strength,Magic,Endurance,Agility,Luck
0,Izanagi,Fool,1,3,2,2,3,2
1,Yomotsu-Shikome,Fool,7,2,7,7,8,4
2,Obariyon,Fool,13,11,6,11,9,9
3,Legion,Fool,21,14,14,18,13,11
4,Ose,Fool,31,28,12,20,25,15


## Final saves

In [31]:
# Persona table without the affinity and skill-list columns.
p4.to_csv('personas.csv')

In [32]:
# Skill table with the cost split into 'Costs' and 'Resource'.
# NOTE(review): this overwrites the scraped 'skills.csv', which had a single
# 'Cost' column; the cost-parsing cell reads skills['Cost'], so reloading this
# file and re-running it would presumably fail. Confirm the overwrite is intended.
skills.to_csv('skills.csv')

In [33]:
# Persona-to-skill link table (P_index, S_index, S_lvl).
skill_connection.to_csv('skill_connection.csv')

In [34]:
# Write each link table to its own file; the position in `connections`
# decides which file name it gets.
connection_files = (
    'c_inherit.csv',
    'c_reflects.csv',
    'c_absorbs.csv',
    'c_block.csv',
    'c_resists.csv',
    'c_weak.csv',
)
for position, target in enumerate(connection_files):
    connections[position].to_csv(target)