# OVERLOG.GG 사이트 크롤링

## 1. import

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas
import json

## 2. 데이터 정리

* str2level : 'LV.123' 을 123인 int형으로 반환
* str2int : '1,234' 나 '36%' 를 1234, 36인 int형으로 반환
* str2int2 : '1,234 Games'나 '3 Hours' 를 1234, 3인 int형으로 반환 
* str2wld : ' 26W    36D   2L  '를 (승, 패, 무) 순서의 int 튜플 (26, 2, 36)으로 반환
* str2kda : '3.14:1'를 3.14인 float 형으로 반환
* str2sec : '2mins36secs'나 '2 mins 36 secs', '4 secs' 등을 모두 초 단위로 바꾸어 int형으로 반환
* str2hour : '9 Hours'나 '9 Mins' 나 '9s'를 시간 단위로 float로 반환

In [2]:
def str2level(levelform):
    """Convert a level label such as 'LV.123' into the integer 123.

    Everything after the first '.' is passed to int(), so a label without
    a '.' (or with a non-numeric tail) raises ValueError. An empty or None
    label yields 0.
    """
    if not levelform:
        return 0
    # Only the text following the first dot is the number.
    _prefix, _dot, number_text = levelform.partition('.')
    return int(number_text)

def str2int(numform):
    """Convert '1,357' or '36%' style text into an int (1357, 36).

    Thousands separators and percent signs are removed before int() is
    applied. An empty or None value yields 0.
    """
    if not numform:
        return 0
    without_commas = numform.replace(",", "")
    without_percent = without_commas.replace("%", "")
    return int(without_percent)

def str2wld(wldform):
    """Parse a record such as ' 26W   31D   2L ' into (win, lose, draw).

    Spaces, newlines and '/' separators are ignored. Note the return
    order: for the example above the result is (26, 2, 31) -- wins, then
    losses, then draws. Parsing stops at the first 'L'. Text that is empty
    after cleaning yields (0, 0, 0).
    """
    cleaned = wldform.replace(" ", "").replace("\n", "").replace("/", "")
    if not cleaned:
        return (0, 0, 0)

    tally = {'W': 0, 'D': 0, 'L': 0}
    pending = ""
    for symbol in cleaned:
        if symbol not in tally:
            # Still collecting the digits that precede the next marker.
            pending = pending + symbol
            continue
        tally[symbol] = int(pending)
        pending = ""
        if symbol == 'L':
            # The loss count ends the record; anything after it is ignored.
            break
    return (tally['W'], tally['L'], tally['D'])

def str2kda(kdaform):
    """Convert a ratio such as '3.16:1' into the float 3.16.

    Spaces are removed first; the text before the first ':' is converted
    with float(). Text without a ':' (including empty text) yields 0.0.
    """
    compact = kdaform.replace(" ", "")
    ratio_text, colon, _rest = compact.partition(':')
    if not colon:
        # No ':' found, so there is no ratio to read.
        return 0.0
    return float(ratio_text)

def str2int2(hourform):
    """Convert '9 Hours' or '1,234 Games' style text into an int (9, 1234).

    Commas are removed and only the text before the first space is
    converted. If that leading text is empty the result is 0.
    """
    leading, _space, _unit = hourform.replace(",", "").partition(' ')
    return int(leading) if leading else 0

def str2sec(secform):
    """Convert a duration label into whole seconds.

    Accepted forms (spaces, newlines and commas are ignored):
        '29 secs'          -> 29
        '2 mins 29 secs'   -> 149
        '2 mins'           -> 120
        '1 min 5 secs'     -> 65   (singular units are handled too)
        '-'                -> 0    (placeholder for "no value")

    Each run of digits is scaled by the first letter that follows it:
    'm' means minutes, 's' means seconds (case-insensitive). Digits with
    no unit after them are ignored. Empty or None input yields 0.

    Raises:
        ValueError: if a number is followed by anything other than an
            'm' or 's' unit (for example a decimal point).
    """
    if not secform:
        return 0
    compact = secform.replace(" ", "").replace("\n", "").replace(",", "")
    # '-' is shown in place of a value that was never recorded.
    if not compact or compact == '-':
        return 0

    total = 0
    digits = ""
    for ch in compact:
        if ch.isdigit():
            digits += ch
            continue
        if not digits:
            # Remaining letters of a unit word ("ins", "ecs", ...).
            continue
        # First character after a number identifies its unit.
        unit = ch.lower()
        if unit == 'm':
            total += int(digits) * 60
        elif unit == 's':
            total += int(digits)
        else:
            raise ValueError("unrecognised time unit in %r" % (secform,))
        digits = ""
    return total

def str2hour(hour):
    """Convert a play-time label into hours as a float.

    '2 Hours' -> 2.0
    '30 Mins' -> 0.5
    '3600s'   -> 1.0   (single token: trailing unit character is dropped
                        and the number is treated as seconds)

    Any falsy input (empty text, None, or the numeric 0 placeholder) and
    any unrecognised unit yield 0.0.
    """
    if not hour:
        # The original fell through to a bare `0` expression here and so
        # returned None; return a real zero instead.
        return 0.0

    parts = hour.split(' ')
    if len(parts) == 1:
        # Compact form such as '9s': strip the unit character, seconds -> hours.
        return int(parts[0][:-1]) * 1.0 / 60 / 60

    # Slice instead of index so an empty second token cannot raise IndexError.
    unit = parts[1][:1]
    if unit == 'H':
        return int(parts[0]) * 1.0
    if unit == 'M':
        return int(parts[0]) * 1.0 / 60
    return 0.0

## 3. parsing 코드

* get_rank_page(page_num) : 해당 숫자의 랭킹 페이지의 html을 반환
* get_user_id(html) : 랭킹 페이지 html에서 user-id들을 뽑아내 반환
* get_log_page(user_id) : 각 유저의 전적 페이지의 html을 반환
* parsing_detail_data(html) : 해당 전적 html에서 데이터들을 뽑아내 반환

In [3]:

def get_rank_page(page_num):
    """
    Download one page of the global leaderboard.

    input: page number of ranking site. 1: 1~100 rankers , 2:101~200 rankers ...
    output: html source from ranking site (the response body as text).
    """
    # str() replaces the original backtick repr syntax, which is a
    # SyntaxError on Python 3 and appends 'L' to Python 2 long values.
    url = 'http://overlog.gg/leaderboards/global/rank/' + str(page_num)
    r = requests.get(url)
    html = r.text

    return html

def get_user_id(html):
    """
    Collect the user ids listed on a ranking page.

    Every <tr> carrying a non-empty data-uid attribute contributes that
    attribute value, e.g. <tr data-uid="225204161111254161080204">.
    Returns the ids as a list in document order.
    """
    rows = BeautifulSoup(html, "lxml").find_all("tr")
    # Rows without a data-uid attribute are skipped.
    return [row.get('data-uid') for row in rows if row.get('data-uid')]

def get_log_page(user_id):
    """
    Download a player's personal log (detail overview) page.

    user_id is the data-uid string from the ranking page; the response
    body is returned as text.
    """
    base = 'http://overlog.gg/detail/overview/'
    response = requests.get(base + user_id + '/')
    return response.text

def parsing_detail_data(html):
    """
    Parse a personal log page into a nested dict.

    Top-level keys: id, level, score, win, lose, draw, winning_rate, kd,
    play_time, num_of_games, most1champ, most2champ, most3champ.

    Each mostNchamp entry is a dict with name, win, lose, winning_rate,
    kd, avg_burn_sec, avg_work_sec and play_hour, read from rows 1-3 of
    the ChampionStatsTable. (The original note lists these columns as:
    wins, losses, win rate, K/D, objective time, average on-fire time,
    play time.)
    """
    soup = BeautifulSoup(html, "lxml")

    # PlayerInfo: first inner div holds the level label, second one the name.
    info_divs = soup.find("div", "PlayerInfo").find_all("div")
    level = info_divs[0].contents[0]
    name = info_divs[1].find("span").contents[0]

    # PlayerSummaryInfo wraps score, win/lose record, win rate, K/D, play time.
    summary = soup.find("div", "PlayerSummaryInfo")

    # SkillRating: score text in '5,000' form.
    score = summary.find("div", "SkillRating").find("b").contents[0]

    # PlayerSummaryStats: win rate ('90%') and the record ('51W 3D 2L').
    stats_div = summary.find("div", "PlayerSummaryStats")
    winning_rate = stats_div.find("span").contents[0]
    record_text = stats_div.find("em").contents[0]

    # First PlayerSummaryStat-LeftLine has K/D, the second has play time
    # and the number of games.
    left_lines = summary.find_all("div", "PlayerSummaryStat-LeftLine")
    kda = left_lines[0].find("span").contents[0]
    play_time = left_lines[1].find("span").contents[0]
    game_num = left_lines[1].find("em").contents[0]

    # ChampionStatsTable: one <tr> per hero; rows 1..3 are the most played.
    rows = soup.find("div", "ChampionStatsTable").find_all("tr")

    hero_stats = []
    for row_idx in range(1, 4):
        row = rows[row_idx]
        if row.get('class')[0] == 'NotPlayed':
            # Placeholder record for a slot with no played hero.
            hero_stats.append({
                'name' : "Not_Played",
                'win' : 0,
                'lose' : 0,
                'winning_rate' : 0,
                'kd' : 0,
                'avg_burn_sec' : 0,
                'avg_work_sec' : 0,
                'play_hour' : 0,
            })
            continue

        cells = row.find_all("td")
        hero_name = cells[0].contents[2].replace(" ","").replace("\n","")
        hero_win = int(cells[1].find("b").contents[0])
        hero_lose = int(cells[2].find("b").contents[0])
        hero_rate = str2int(cells[3].contents[0])
        hero_kd = str2kda(cells[4].find("b").contents[0])
        work_sec = str2sec(cells[5].contents[0])
        burn_sec = str2sec(cells[6].find("b").contents[0])
        raw_play_hour = cells[7].contents[0]
        hero_stats.append({
            'name' : hero_name,
            'win' : hero_win,
            'lose' : hero_lose,
            'winning_rate' : hero_rate,
            'kd' : hero_kd,
            'avg_burn_sec' : burn_sec,
            'avg_work_sec' : work_sec,
            # Kept raw here; converted to hours once the summary is built.
            'play_hour' : raw_play_hour,
        })

    win, lose, draw = str2wld(record_text)
    inform = {
        'id' : name.replace(" ","").replace("\n",""),
        'level' : str2level(level),
        'score' : str2int(score),
        'win' : win,
        'lose' : lose,
        'draw' : draw,
        'winning_rate': str2int(winning_rate),
        'kd' : str2kda(kda),
        'play_time' : str2int2(play_time),
        'num_of_games' : str2int2(game_num),
    }
    for key, hero in zip(('most1champ', 'most2champ', 'most3champ'), hero_stats):
        hero['play_hour'] = str2hour(hero['play_hour'])
        inform[key] = hero
    return inform

### 예시 (현재 1위인 EVERMORE)

In [4]:
# Example: download and parse a single player's detail page by uid.
data = parsing_detail_data(get_log_page('62056152118194164203194'))

In [5]:
# Bare expression so the notebook echoes the parsed dict (shown below).
data

{'draw': 3,
 'id': u'EVERMORE',
 'kd': 6.43,
 'level': 428,
 'lose': 2,
 'most1champ': {'avg_burn_sec': 127,
  'avg_work_sec': 29,
  'kd': 6.42,
  'lose': 4,
  'name': u'Roadhog',
  'play_hour': 8.0,
  'win': 47,
  'winning_rate': 92},
 'most2champ': {'avg_burn_sec': 167,
  'avg_work_sec': 149,
  'kd': 8.88,
  'lose': 1,
  'name': u'D.Va',
  'play_hour': 0.31666666666666665,
  'win': 0,
  'winning_rate': 0},
 'most3champ': {'avg_burn_sec': 39,
  'avg_work_sec': 60,
  'kd': 3.14,
  'lose': 0,
  'name': u'L\xfacio',
  'play_hour': 0.26666666666666666,
  'win': 1,
  'winning_rate': 100},
 'num_of_games': 57,
 'play_time': 9,
 'score': 5000,
 'win': 52,
 'winning_rate': 96}

## 4. 실행

* main(1,2,"outputfile.json") : 1~200등 까지의 유저 조사

In [6]:
def main(spage=1,epage=2,filename='sample1.json'):
    """Crawl ranking pages spage..epage and save every listed player's stats.

    spage, epage: first and last leaderboard page to crawl (inclusive).
        Per get_rank_page, page 1 is ranks 1~100, page 2 is 101~200, ...
    filename: path of the JSON file the collected data is written to.

    Returns a dict mapping each user id to the dict produced by
    parsing_detail_data. Exits via sys.exit (SystemExit) when the page
    range is invalid.
    """
    # Local import: sys is not among the notebook's file-level imports, so
    # the original validation branches raised NameError instead of exiting.
    import sys

    # Pages start at 1; the original `spage < 0` test let page 0 through.
    if spage < 1:
        sys.exit("start page should be at least 1!!")
    elif epage < spage:
        sys.exit("end page should be bigger than start page!!")

    user_ids = []
    for page in range(spage, epage + 1):
        user_ids.extend(get_user_id(get_rank_page(page)))

    data = {user_id: parsing_detail_data(get_log_page(user_id))
            for user_id in user_ids}

    # Honour the filename argument (it was ignored in favour of a
    # hard-coded "sample1.json").
    with open(filename, "w") as json_file:
        json_file.write(json.dumps(data))

    return data

In [7]:
# Crawl leaderboard page 1 only (start page == end page == 1).
data = main(1,1,'sample1.json')
# Tabulate the result: one column per user id, one row per stat field.
pandas.DataFrame.from_records(data)

Unnamed: 0,100118137052093064070025,106116145125187169177143,109086163112108211020105,118136103145115002013031,120240168126194116120090,122182136070199174047095,124170035240131223086095,127156234038122117250176,131087203239184225084012,133074108000166100047129,...,82077213066006222211212,85119182188060028068249,86004043050096153231050,89009048241147223095125,91070255179161050131086,93223119232238095026111,94058034163198013124052,95167171196022074248220,96211065043005070199060,97039195154087003190121
draw,18,2,14,10,40,7,6,24,117,15,...,8,10,17,9,27,22,10,5,11,25
id,BKBERNAR,zza,Nikhcnum,ShaDowBurn,ROXTRY,SPREE,hyped,ToT,쪼물락낙지,REUNGREUNG,...,쪼낙,TheHell,shadder2k,Shaz,HaventMetYou,claris,dafran,Didier,Dante,siralc
kd,3.54,4.46,3.86,4,2.85,4.21,4.75,3.06,2.2,2.37,...,2.4,3.57,3.59,3.61,3.11,3.23,3.38,3.55,2.99,3.17
level,160,212,57,504,675,328,428,66,555,486,...,221,493,323,567,139,540,307,107,304,100
lose,85,20,38,102,165,46,21,52,311,40,...,25,26,135,119,118,97,97,74,56,67
most1champ,"{u'winning_rate': 59, u'avg_work_sec': 60, u'n...","{u'winning_rate': 70, u'avg_work_sec': 120, u'...","{u'winning_rate': 78, u'avg_work_sec': 41, u'n...","{u'winning_rate': 59, u'avg_work_sec': 55, u'n...","{u'winning_rate': 61, u'avg_work_sec': 40, u'n...","{u'winning_rate': 65, u'avg_work_sec': 60, u'n...","{u'winning_rate': 81, u'avg_work_sec': 60, u'n...","{u'winning_rate': 52, u'avg_work_sec': 60, u'n...","{u'winning_rate': 55, u'avg_work_sec': 30, u'n...","{u'winning_rate': 51, u'avg_work_sec': 36, u'n...",...,"{u'winning_rate': 62, u'avg_work_sec': 29, u'n...","{u'winning_rate': 66, u'avg_work_sec': 60, u'n...","{u'winning_rate': 60, u'avg_work_sec': 60, u'n...","{u'winning_rate': 72, u'avg_work_sec': 42, u'n...","{u'winning_rate': 61, u'avg_work_sec': 41, u'n...","{u'winning_rate': 57, u'avg_work_sec': 39, u'n...","{u'winning_rate': 67, u'avg_work_sec': 60, u'n...","{u'winning_rate': 67, u'avg_work_sec': 60, u'n...","{u'winning_rate': 61, u'avg_work_sec': 36, u'n...","{u'winning_rate': 52, u'avg_work_sec': 36, u'n..."
most2champ,"{u'winning_rate': 53, u'avg_work_sec': 32, u'n...","{u'winning_rate': 57, u'avg_work_sec': 60, u'n...","{u'winning_rate': 56, u'avg_work_sec': 35, u'n...","{u'winning_rate': 61, u'avg_work_sec': 60, u'n...","{u'winning_rate': 43, u'avg_work_sec': 35, u'n...","{u'winning_rate': 72, u'avg_work_sec': 60, u'n...","{u'winning_rate': 80, u'avg_work_sec': 60, u'n...","{u'winning_rate': 60, u'avg_work_sec': 60, u'n...","{u'winning_rate': 25, u'avg_work_sec': 17, u'n...","{u'winning_rate': 58, u'avg_work_sec': 60, u'n...",...,"{u'winning_rate': 33, u'avg_work_sec': 36, u'n...","{u'winning_rate': 73, u'avg_work_sec': 46, u'n...","{u'winning_rate': 50, u'avg_work_sec': 38, u'n...","{u'winning_rate': 55, u'avg_work_sec': 60, u'n...","{u'winning_rate': 57, u'avg_work_sec': 39, u'n...","{u'winning_rate': 63, u'avg_work_sec': 22, u'n...","{u'winning_rate': 59, u'avg_work_sec': 36, u'n...","{u'winning_rate': 67, u'avg_work_sec': 60, u'n...","{u'winning_rate': 62, u'avg_work_sec': 60, u'n...","{u'winning_rate': 68, u'avg_work_sec': 17, u'n..."
most3champ,"{u'winning_rate': 64, u'avg_work_sec': 43, u'n...","{u'winning_rate': 57, u'avg_work_sec': 60, u'n...","{u'winning_rate': 60, u'avg_work_sec': 43, u'n...","{u'winning_rate': 67, u'avg_work_sec': 40, u'n...","{u'winning_rate': 60, u'avg_work_sec': 60, u'n...","{u'winning_rate': 68, u'avg_work_sec': 38, u'n...","{u'winning_rate': 83, u'avg_work_sec': 60, u'n...","{u'winning_rate': 63, u'avg_work_sec': 40, u'n...","{u'winning_rate': 33, u'avg_work_sec': 18, u'n...","{u'winning_rate': 50, u'avg_work_sec': 43, u'n...",...,"{u'winning_rate': 100, u'avg_work_sec': 52, u'...","{u'winning_rate': 71, u'avg_work_sec': 60, u'n...","{u'winning_rate': 55, u'avg_work_sec': 60, u'n...","{u'winning_rate': 63, u'avg_work_sec': 24, u'n...","{u'winning_rate': 51, u'avg_work_sec': 38, u'n...","{u'winning_rate': 58, u'avg_work_sec': 44, u'n...","{u'winning_rate': 53, u'avg_work_sec': 28, u'n...","{u'winning_rate': 78, u'avg_work_sec': 120, u'...","{u'winning_rate': 62, u'avg_work_sec': 60, u'n...","{u'winning_rate': 66, u'avg_work_sec': 57, u'n..."
num_of_games,228,64,139,281,445,154,144,178,849,112,...,86,114,347,317,306,288,265,212,165,209
play_time,50,14,29,64,96,33,31,35,187,25,...,18,25,79,68,63,64,59,46,37,43


In [8]:
#f = open('sample1.json')
#json_str = f.read()
#f.close()
with open('sample1.json') as data_file:    
    data = json.load(data_file)
#data = json.loads(json_str)

In [9]:
# Wrap the reloaded dict in a DataFrame (user ids become the columns).
data = pandas.DataFrame(data)
# NOTE(review): feeding a DataFrame back into from_records drops the row
# labels -- the output below shows 0..9 instead of the stat names. Displaying
# `data` directly would keep them; confirm whether that was intended.
pandas.DataFrame.from_records(data)

Unnamed: 0,100118137052093064070025,106116145125187169177143,109086163112108211020105,118136103145115002013031,120240168126194116120090,122182136070199174047095,124170035240131223086095,127156234038122117250176,131087203239184225084012,133074108000166100047129,...,82077213066006222211212,85119182188060028068249,86004043050096153231050,89009048241147223095125,91070255179161050131086,93223119232238095026111,94058034163198013124052,95167171196022074248220,96211065043005070199060,97039195154087003190121
0,18,2,14,10,40,7,6,24,117,15,...,8,10,17,9,27,22,10,5,11,25
1,BKBERNAR,zza,Nikhcnum,ShaDowBurn,ROXTRY,SPREE,hyped,ToT,쪼물락낙지,REUNGREUNG,...,쪼낙,TheHell,shadder2k,Shaz,HaventMetYou,claris,dafran,Didier,Dante,siralc
2,3.54,4.46,3.86,4,2.85,4.21,4.75,3.06,2.2,2.37,...,2.4,3.57,3.59,3.61,3.11,3.23,3.38,3.55,2.99,3.17
3,160,212,57,504,675,328,428,66,555,486,...,221,493,323,567,139,540,307,107,304,100
4,85,20,38,102,165,46,21,52,311,40,...,25,26,135,119,118,97,97,74,56,67
5,"{u'avg_work_sec': 60, u'name': u'Zarya', u'kd'...","{u'avg_work_sec': 120, u'name': u'D.Va', u'kd'...","{u'avg_work_sec': 41, u'name': u'McCree', u'kd...","{u'avg_work_sec': 55, u'name': u'Genji', u'kd'...","{u'avg_work_sec': 40, u'name': u'Ana', u'kd': ...","{u'avg_work_sec': 60, u'name': u'Zarya', u'kd'...","{u'avg_work_sec': 60, u'name': u'D.Va', u'kd':...","{u'avg_work_sec': 60, u'name': u'Reinhardt', u...","{u'avg_work_sec': 30, u'name': u'Ana', u'kd': ...","{u'avg_work_sec': 36, u'name': u'Ana', u'kd': ...",...,"{u'avg_work_sec': 29, u'name': u'Ana', u'kd': ...","{u'avg_work_sec': 60, u'name': u'Roadhog', u'k...","{u'avg_work_sec': 60, u'name': u'Genji', u'kd'...","{u'avg_work_sec': 42, u'name': u'Ana', u'kd': ...","{u'avg_work_sec': 41, u'name': u'McCree', u'kd...","{u'avg_work_sec': 39, u'name': u'McCree', u'kd...","{u'avg_work_sec': 60, u'name': u'Tracer', u'kd...","{u'avg_work_sec': 60, u'name': u'Zarya', u'kd'...","{u'avg_work_sec': 36, u'name': u'Soldier:76', ...","{u'avg_work_sec': 36, u'name': u'McCree', u'kd..."
6,"{u'avg_work_sec': 32, u'name': u'McCree', u'kd...","{u'avg_work_sec': 60, u'name': u'Pharah', u'kd...","{u'avg_work_sec': 35, u'name': u'Soldier:76', ...","{u'avg_work_sec': 60, u'name': u'D.Va', u'kd':...","{u'avg_work_sec': 35, u'name': u'Soldier:76', ...","{u'avg_work_sec': 60, u'name': u'D.Va', u'kd':...","{u'avg_work_sec': 60, u'name': u'Zarya', u'kd'...","{u'avg_work_sec': 60, u'name': u'Genji', u'kd'...","{u'avg_work_sec': 17, u'name': u'Genji', u'kd'...","{u'avg_work_sec': 60, u'name': u'Lúcio', u'kd'...",...,"{u'avg_work_sec': 36, u'name': u'Roadhog', u'k...","{u'avg_work_sec': 46, u'name': u'Mei', u'kd': ...","{u'avg_work_sec': 38, u'name': u'Soldier:76', ...","{u'avg_work_sec': 60, u'name': u'Genji', u'kd'...","{u'avg_work_sec': 39, u'name': u'Soldier:76', ...","{u'avg_work_sec': 22, u'name': u'Soldier:76', ...","{u'avg_work_sec': 36, u'name': u'Soldier:76', ...","{u'avg_work_sec': 60, u'name': u'D.Va', u'kd':...","{u'avg_work_sec': 60, u'name': u'Tracer', u'kd...","{u'avg_work_sec': 17, u'name': u'Soldier:76', ..."
7,"{u'avg_work_sec': 43, u'name': u'Ana', u'kd': ...","{u'avg_work_sec': 60, u'name': u'Roadhog', u'k...","{u'avg_work_sec': 43, u'name': u'Roadhog', u'k...","{u'avg_work_sec': 40, u'name': u'Pharah', u'kd...","{u'avg_work_sec': 60, u'name': u'Lúcio', u'kd'...","{u'avg_work_sec': 38, u'name': u'Soldier:76', ...","{u'avg_work_sec': 60, u'name': u'Reinhardt', u...","{u'avg_work_sec': 40, u'name': u'Soldier:76', ...","{u'avg_work_sec': 18, u'name': u'Roadhog', u'k...","{u'avg_work_sec': 43, u'name': u'Mercy', u'kd'...",...,"{u'avg_work_sec': 52, u'name': u'Zarya', u'kd'...","{u'avg_work_sec': 60, u'name': u'Tracer', u'kd...","{u'avg_work_sec': 60, u'name': u'Roadhog', u'k...","{u'avg_work_sec': 24, u'name': u'Soldier:76', ...","{u'avg_work_sec': 38, u'name': u'Ana', u'kd': ...","{u'avg_work_sec': 44, u'name': u'Ana', u'kd': ...","{u'avg_work_sec': 28, u'name': u'Torbjörn', u'...","{u'avg_work_sec': 120, u'name': u'Lúcio', u'kd...","{u'avg_work_sec': 60, u'name': u'Roadhog', u'k...","{u'avg_work_sec': 57, u'name': u'Roadhog', u'k..."
8,228,64,139,281,445,154,144,178,849,112,...,86,114,347,317,306,288,265,212,165,209
9,50,14,29,64,96,33,31,35,187,25,...,18,25,79,68,63,64,59,46,37,43
