# Scrape FF14 items from https://na.finalfantasyxiv.com/lodestone/playguide/db/item 

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import re
import time

# Preview HTML

In [2]:
URL = 'https://na.finalfantasyxiv.com/lodestone/playguide/db/item'
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of parsing an error page.
req = requests.get(URL, timeout=30)
req.raise_for_status()
soup = bs(req.text, 'html.parser')

#grabs every <div class="db-table__wrapper"> (find_all returns a list of matches)
table = soup.find_all('div', attrs={'class':"db-table__wrapper"})

#grabs every <td> with the item-row class, can iterate through
rows = soup.find_all('td', attrs={'class':"db-table__body--light latest_patch__major__item"})

# Show the first row so the markup used by the later cells is visible.
rows[0]

<td class="db-table__body--light latest_patch__major__item">
<div class="db-list__item__icon latest_patch__major__box">
<div class="db-list__item__icon__inner">
<div class="staining"></div>
<img alt="" class="db-list__item__icon__item_image" height="40" src="https://img.finalfantasyxiv.com/lds/pc/global/images/itemicon/c3/c3d79cdeae09313642aee21ce715e64e01d816e4.png?n6.15" width="40"/> <a href="/lodestone/playguide/db/item/275d11e2087/">
<div class="db-list__item__icon__cover db_popup" data-ldst-href="/lodestone/playguide/db/item/275d11e2087/"></div>
</a>
</div>
</div>
<div class="db-table__link_txt">
<span class="db-table__txt--type">
<a href="/lodestone/playguide/db/item/?category2=1">Arms</a>
							&gt;
							<a href="/lodestone/playguide/db/item/?category2=1&amp;category3=2">Gladiator's Arm</a>
</span>
<a class="db_popup db-table__txt--detail_link" href="/lodestone/playguide/db/item/275d11e2087/">Asphodelos Longsword</a>
</div>
</td>

# Scrape category1, category2, item within Table

In [4]:
#Find only categories within the table (excludes sidebar). 

#Category 1 and 2 links are identified by matching their href with a regex:
#Category 1 href: "/lodestone/playguide/db/item/?category2=1"  <-- category2=n, where n changes per category
#Category 2 href: "/lodestone/playguide/db/item/?category2=1&amp;category3=2"  <-- category3=n, where n changes per category
#Item: <a> tag with class="db_popup db-table__txt--detail_link"

#Filter for category_one links: href includes "category2" and excludes "category3"
def not_cat_two(href):
    """Return True when `href` names a first-level category link.

    Intended as a BeautifulSoup attribute filter (``find_all(href=not_cat_two)``).

    Args:
        href: The tag's href value, or None when the tag has no href.

    Returns:
        bool: True if the href contains "category2" but not "category3";
        False otherwise, including for a missing or empty href.
    """
    if not href:
        # Tags without an href can never be category links.
        return False
    text = str(href)
    return 'category2' in text and 'category3' not in text
    
# Accumulate matches from every table wrapper. Reassigning inside the loop
# would keep only the last wrapper's matches and leave the names undefined
# when `table` is empty.
cat_two_href = re.compile("category3=")  # compiled once, reused per wrapper
category_one = []
category_two = []
items = []
for wrapper in table:
    category_one.extend(link.text for link in wrapper.find_all(href=not_cat_two))
    category_two.extend(link.text for link in wrapper.find_all(href=cat_two_href))
    items.extend(
        link.text
        for link in wrapper.find_all('a', attrs={'class':"db_popup db-table__txt--detail_link"})
    )

#Zip lists (pairs entries by position; stops at the shortest list)
item_list = list(zip(category_one, category_two, items))
item_list[0:5]

[('Arms', "Gladiator's Arm", 'Asphodelos Longsword'),
 ('Arms', "Gladiator's Arm", 'Ultimate Sword of the Heavens'),
 ('Arms', "Gladiator's Arm", "Augmented Radiant's Bastard Sword"),
 ('Arms', "Gladiator's Arm", 'Bluefeather Sword'),
 ('Arms', "Gladiator's Arm", "Radiant's Bastard Sword")]

# Scrape from each page

In [5]:
# Start the wall-clock timer for the full scrape.
start = time.time()
# Base URL; the page number is appended for each request in the loop below.
# No request is made here: the loop fetches and parses every page itself, so
# an extra fetch of the page-less URL would only be a wasted round trip.
URL = 'https://na.finalfantasyxiv.com/lodestone/playguide/db/item?page='  #item?page=685

def not_cat_two(href):
    """Return True when `href` names a first-level category link.

    Intended as a BeautifulSoup attribute filter (``find_all(href=not_cat_two)``).

    Args:
        href: The tag's href value, or None when the tag has no href.

    Returns:
        bool: True if the href contains "category2" but not "category3";
        False otherwise, including for a missing or empty href.
    """
    if not href:
        # Tags without an href can never be category links.
        return False
    text = str(href)
    return 'category2' in text and 'category3' not in text

LAST_PAGE = 685  # last listing page when this was scraped (was hard-coded as range(1,686))

# Compile the second-level category pattern once instead of once per page.
cat_two_href = re.compile("category3=")

cat_one = []
cat_two = []
item_list = []
# One Session for all pages so the connection can be reused between requests.
with requests.Session() as session:
    for page in range(1, LAST_PAGE + 1):
        # Bounded wait per request; stop on an HTTP error rather than
        # parsing an error page into the results.
        req = session.get(URL + str(page), timeout=30)
        req.raise_for_status()
        soup = bs(req.text, 'html.parser')
        table = soup.find_all('div', attrs={'class':"db-table__wrapper"})

        # Collect inside the loop over wrappers: every wrapper on the page is
        # kept, and a page with no wrapper adds nothing (instead of re-adding
        # the previous page's matches or raising NameError on the first page).
        for wrapper in table:
            cat_one.extend(link.text for link in wrapper.find_all(href=not_cat_two))
            cat_two.extend(link.text for link in wrapper.find_all(href=cat_two_href))
            item_list.extend(
                link.text
                for link in wrapper.find_all('a', attrs={'class':"db_popup db-table__txt--detail_link"})
            )

# Report the elapsed wall-clock time in seconds.
end = time.time()
print(end - start)

1535.1348598003387


In [6]:
# Combine the three parallel lists into (category1, category2, item) tuples;
# zip pairs entries by position and stops at the shortest list.
final_list = [*zip(cat_one, cat_two, item_list)]
print(len(final_list))
# Spot-check a slice near the end of the results.
final_list[32828:32835]

34227


[('Other', 'Other', 'Rolling Tankard Ignition Key'),
 ('Other', 'Other', 'Gabriel α Identification Key'),
 ('Other', 'Other', 'Megalotragus Horn'),
 ('Other', 'Other', 'Gwiber of Light Trumpet'),
 ('Other', 'Other', 'Jibanyan Couch Medal'),
 ('Other', 'Other', 'Incitatus Whistle'),
 ('Other', 'Other', 'Ehll Tou Whistle')]

# Write to csv 

In [7]:
# without csv module

def _csv_field(value):
    """Return `value` as text, quoted when it contains a comma, quote or line break."""
    text = str(value)
    if any(ch in text for ch in ',"\r\n'):
        # Enclose in double quotes and double any embedded quotes so the
        # field is read back as a single column.
        text = '"' + text.replace('"', '""') + '"'
    return text

cols = ['Category1', 'Category2','Item']
with open('ff14_items.csv','w',  encoding='utf-8') as f:
    # Header line first, then one comma-joined line per tuple in final_list.
    f.write(','.join(cols) + '\n')
    for row in final_list:
        f.write(','.join(_csv_field(value) for value in row) + '\n')

In [10]:
# Preview
# Read the file back as a list of lines and show the first ten.
with open('ff14_items.csv', 'r', encoding='utf-8') as f:
    fr = list(f)
fr[:10]

['Category1,Category2,Item\n',
 "Arms,Gladiator's Arm,Asphodelos Longsword\n",
 "Arms,Gladiator's Arm,Ultimate Sword of the Heavens\n",
 "Arms,Gladiator's Arm,Augmented Radiant's Bastard Sword\n",
 "Arms,Gladiator's Arm,Bluefeather Sword\n",
 "Arms,Gladiator's Arm,Radiant's Bastard Sword\n",
 "Arms,Gladiator's Arm,Augmented Classical Longsword\n",
 "Arms,Gladiator's Arm,Bastard Sword of Divine Light\n",
 "Arms,Gladiator's Arm,Classical Longsword\n",
 "Arms,Gladiator's Arm,Moonward Longsword\n"]

In [12]:
# with csv module

import csv

header_row = ('Category1', 'Category2', 'Item')
# newline="" lets the csv writer control line endings itself.
with open('ff14_items.csv','w',encoding='utf-8',newline="") as f:
    fr = csv.writer(f)
    # Header first, then every scraped tuple in a single call.
    fr.writerow(header_row)
    fr.writerows(final_list)