In [3]:
import random
import re
import time
from collections import Counter

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# Warcraft Logs

For this project, I will be using data scraped from [warcraftlogs](https://classic.warcraftlogs.com/) to build a linear regression model that can predict how long it will take a group to kill the WoW Classic raid boss Onyxia.

While the website contains information for all raid bosses in the game, I felt it would be best to focus on one for this project due to the very large differences in fight mechanics and relative difficulty. Models should likely be tuned for individual fights for improved accuracy.

The first step will be to collect a list of links to fight summaries for the specific raid boss we are looking at.

In [17]:
# Create an empty dictionary to store links and guild name information.
# Keys are the report links; values are [link, guild] pairs.
wow_logs = {}

# Because I wanted to get a larger time span of data, I will skip 40 pages at a time and pull all the reports from
# those pages, ex. (40, 80, 120...). This will give us data going back to about January. This should help us capture
# a larger number of groups, some who may be very familiar with the fight by now, and those who may only just be
# starting the content.
for i in range(0, 2000, 40):
    url = 'https://classic.warcraftlogs.com/zone/reports?zone=1001&page=' + str(i)

    # A timeout keeps a single stalled request from hanging the whole crawl, and raise_for_status() stops with a
    # clear HTTP error instead of trying to parse an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    page = response.text

    # Pull the relevant table of information which contains the link to the report as well as guild information.
    soup = BeautifulSoup(page, "lxml")
    table = soup.find('table')
    if table is None:
        # Nothing to read on this page; report it and move on to the next one.
        print("no report table found on page", i)
        continue
    rows = table.find_all('tr')

    # Iterate through the rows of the table and add them to the dictionary
    for row in rows:
        cells = row.find_all('td')

        # Rows without the expected cells (for example a header row) carry no report and are skipped.
        if len(cells) < 3:
            continue

        # gets report link
        anchor = cells[0].find('a')
        if anchor is None or not anchor.get('href'):
            continue
        link = anchor['href']

        # name of guild
        guild = cells[2].text

        # add to dictionary
        wow_logs[link] = [link, guild]


len(wow_logs)

5000

We now have 5000 links to run through. Keep in mind that rerunning the cell above will pull different data, as new logs are posted every day.

# Pulling the Fight Summary Data

First, I will start a Selenium webdriver, since we have to interact with certain elements of the page and load some JavaScript.

In [None]:
# Indicate where on my local machine the geckodriver is for using Firefox.
# NOTE(review): this is a machine-specific absolute path; change it before running on another machine.
driver = webdriver.Firefox(executable_path=r'C:/Users/atag3/Desktop/NYC DSA Python EDA/Untitled Folder/geckodriver.exe')

# We have to click through the first message accepting cookies. This should only need to be done once per open window.
driver.get('https://classic.warcraftlogs.com/reports/b2tKdk1xmj7pw43z')
butt = driver.find_elements_by_xpath('/html/body/div[2]/div/div/div/div[2]/div/button[3]')
if butt:
    butt[0].click()
else:
    # find_elements_* returns an empty list when nothing matches, so indexing it blindly would raise an IndexError.
    print("cookie button not found")

In [None]:
# A new empty dictionary to store our values, keyed by the relative report link.
# NOTE: re-running this cell starts again from an empty dictionary.
new_logs = {}

# Position in the list of links at which to start. It is advanced once for every processed link (scraped, skipped or
# failed), so after an interruption it holds the index of the first link that still has to be visited.
counter_place = 0

# There was some sort of memory leak issue that caused the firefox driver to crash after some amount of time. This
# counts the reports loaded in the current window so a new driver can be started after going through 100 reports.
subcounter = 0


def start_browser():
    """Open a new Firefox window and accept the cookie message if it is shown.

    Returns:
        The new webdriver instance.
    """
    # NOTE(review): machine-specific absolute path; change it before running on another machine.
    browser = webdriver.Firefox(executable_path=r'C:/Users/atag3/Desktop/NYC DSA Python EDA/Untitled Folder/geckodriver.exe')
    browser.get('https://classic.warcraftlogs.com/reports/b2tKdk1xmj7pw43z')

    # We have to click through the first message accepting cookies. This should only need to be done once per window.
    buttons = browser.find_elements_by_xpath('/html/body/div[2]/div/div/div/div[2]/div/button[3]')
    if buttons:
        buttons[0].click()
    return browser


def split_roles(chars):
    """Split the character-detail lines into (tank, dps, healer) sections.

    Everything before the first of the 'DPS' / 'Healers' header lines is treated as the tank section; each header
    starts its own section that runs to the next header or to the end of the list. The two headers may appear in
    either order.

    Raises:
        ValueError: if either header line is missing.
    """
    dps_at = chars.index('DPS')
    heals_at = chars.index('Healers')
    first, second = sorted((dps_at, heals_at))
    tanks, middle, last = chars[:first], chars[first:second], chars[second:]
    if dps_at < heals_at:
        return tanks, middle, last
    return tanks, last, middle


def mean_item_level(lines):
    """Average every integer found on the lines containing 'Item Level'; NaN when there are none."""
    values = [int(num) for line in lines if 'Item Level' in line for num in re.findall(r"\d+", line)]
    # np.mean of an empty list also yields NaN but emits a RuntimeWarning, so that case is handled explicitly.
    return np.mean(values) if values else np.nan


def scrape_onyxia_fight(browser):
    """Collect the fight summary from the report currently loaded in ``browser``.

    Returns:
        A dict with the boss check, kill/wipe status, fight time, average item level per role, world buff count
        and one count per "Class (Specialization)" line, or None when fewer than three boss fights are listed.
    """
    # Look for boss fights recorded in the log. The 3rd boss fight should always be onyxia.
    boss_captions = browser.find_elements_by_class_name("report-overview-boss-caption")

    # If the page did not load correctly or the third fight is not there, there is nothing to collect.
    if len(boss_captions) < 3:
        return None

    # click the onyxia fight to give us the fight report, then wait a short random time
    boss_captions[2].click()
    time.sleep(.5 + 2 * random.random())

    # First I want to make sure we are looking at Onyxia
    boss_icon = browser.find_elements_by_id("filter-fight-boss-icon")
    is_ony = 'Onyxia' if re.search(r'1084-icon\.jpg', boss_icon[0].get_attribute('src')) else 'Not Onyxia'

    # collect the fight time and status; the text is split as "<status> (<MM:SS>)"
    kill_wipe = browser.find_elements_by_id("filter-fight-details-text")
    details = kill_wipe[0].text

    # is this a wipe or kill?
    is_kill = details.split(" (")[0]

    # how long did it take? format in MM:SS
    time_f = details.split('(')[1].split(')')[0]

    # look at character details. The format is Class (Specialization), so we can use that to identify how many of
    # each class show up in the list.
    player_info = browser.find_elements_by_class_name("character-details-contents")
    chars = player_info[0].text.split('\n')

    # Average item level per role, using the 'DPS' and 'Healers' header lines to separate the groups of players.
    tank_lines, dps_lines, heal_lines = split_roles(chars)

    # World Buff Info. This returns how many world buffs are on the entire group.
    buff_info = browser.find_elements_by_xpath('/html/body/div[3]/div[2]/div[6]/div[3]/div[1]/div[7]/div[3]/div[2]/div[7]/div[2]/div[*]/div/table/tbody/tr[*]/td[3]/a[*]/img')

    report = {'boss': is_ony,
              'kill': is_kill,
              'fight_time': time_f,
              'tank_avg_ilvl': mean_item_level(tank_lines),
              'dps_avg_ilvl': mean_item_level(dps_lines),
              'heals_avg_ilvl': mean_item_level(heal_lines),
              'world_buffs': len(buff_info)
              }

    # if a line matches our Class (Specialization) format, count it and add the counts to the report
    report.update(Counter(line for line in chars if '(' in line))
    return report


# now we want to go through each link and pull the relevant info.
for links in list(wow_logs)[counter_place:]:

    # For every 100 reports we go through, close the window, open a new one and click the cookie agreement. This is
    # done outside the try block so a failed restart stops the run instead of being swallowed.
    if subcounter >= 100:
        driver.quit()
        driver = start_browser()
        print("reset browser")

        # reset the subcounter
        subcounter = 0

    # Navigate to the fight summary page
    link = 'https://classic.warcraftlogs.com' + links
    driver.get(link)
    subcounter += 1

    try:
        report = scrape_onyxia_fight(driver)
    except Exception as err:
        # We have enough data to iterate through that losing some reports from errors should be fine, but say which
        # report failed and why.
        print("oh no an exception", link, repr(err))
    else:
        if report is not None:
            # Add the information we have scraped to the dictionary.
            new_logs[links] = report

    counter_place += 1



We should now have our data in a dictionary, and we can simply convert it to a pandas dataframe and pickle it for analysis.

In [None]:
# Build one row per scraped report: the keys of new_logs become the index and the keys of each inner
# dictionary become the columns.
df = pd.DataFrame.from_dict(data=new_logs, orient='index')

# Save the frame to disk so the analysis can load it without scraping again.
df.to_pickle(path='wowlogs_data_v2.pkl')