# [빅데이터 5조] 01. 데이터 수집 
### 목차
- [1.1 분석 대상 게임 목록 선별](#1.1-분석-대상-게임-목록-선별)
- [1.2 리뷰 데이터 수집](#1.2-리뷰-데이터-수집)

In [1]:
from datetime import datetime
import gc
import json
import matplotlib.pyplot as plt 
import os
import pandas as pd
import re
import requests
import time
from tqdm import tqdm, trange
import urllib.parse
import numpy as np

---
### 1.1 분석 대상 게임 목록 선별

In [2]:
# 월별 스팀 이용자 통계 데이터셋 (https://www.kaggle.com/connorwynkoop/steam-monthly-player-data)
# mu is Monthly User
mu_df = pd.read_csv("./dataset/AllSteamData.csv", encoding="utf-8-sig")
display(mu_df.head(5))

# 4만여 가지의 스팀 게임 정보 데이터셋 (https://www.kaggle.com/trolukovich/steam-games-complete-dataset)
# gi is Game Info
gi_df = pd.read_csv("./dataset/steam_games.csv", encoding="utf-8-sig")
display(gi_df.head(5))

# 해당 데이터셋들은 최신의 데이터는 아니지만, 해당 데이터 셋들로 데이터 수집의 방향성을 정하는 지표 정도로 취급

Unnamed: 0,Name,Month,Avg. Players,Gain,% Gain,Peak Players
0,Counter-Strike,Last 30 Days,8488.74,27.6,0.33%,15065
1,Counter-Strike,Sep-21,8461.12,-390.07,-4.41%,14559
2,Counter-Strike,Aug-21,8851.19,-286.57,-3.14%,14064
3,Counter-Strike,Jul-21,9137.76,-359.69,-3.79%,14972
4,Counter-Strike,Jun-21,9497.45,-1243.5,-11.58%,16391


Unnamed: 0,url,types,name,desc_snippet,recent_reviews,all_reviews,release_date,developer,publisher,popular_tags,game_details,languages,achievements,genre,game_description,mature_content,minimum_requirements,recommended_requirements,original_price,discount_price
0,https://store.steampowered.com/app/379720/DOOM/,app,DOOM,Now includes all three premium DLC packs (Unto...,"Very Positive,(554),- 89% of the 554 user revi...","Very Positive,(42,550),- 92% of the 42,550 use...","May 12, 2016",id Software,"Bethesda Softworks,Bethesda Softworks","FPS,Gore,Action,Demons,Shooter,First-Person,Gr...","Single-player,Multi-player,Co-op,Steam Achieve...","English,French,Italian,German,Spanish - Spain,...",54.0,Action,"About This Game Developed by id software, the...",,"Minimum:,OS:,Windows 7/8.1/10 (64-bit versions...","Recommended:,OS:,Windows 7/8.1/10 (64-bit vers...",$19.99,$14.99
1,https://store.steampowered.com/app/578080/PLAY...,app,PLAYERUNKNOWN'S BATTLEGROUNDS,PLAYERUNKNOWN'S BATTLEGROUNDS is a battle roya...,"Mixed,(6,214),- 49% of the 6,214 user reviews ...","Mixed,(836,608),- 49% of the 836,608 user revi...","Dec 21, 2017",PUBG Corporation,"PUBG Corporation,PUBG Corporation","Survival,Shooter,Multiplayer,Battle Royale,PvP...","Multi-player,Online Multi-Player,Stats","English,Korean,Simplified Chinese,French,Germa...",37.0,"Action,Adventure,Massively Multiplayer",About This Game PLAYERUNKNOWN'S BATTLEGROUND...,Mature Content Description The developers de...,"Minimum:,Requires a 64-bit processor and opera...","Recommended:,Requires a 64-bit processor and o...",$29.99,
2,https://store.steampowered.com/app/637090/BATT...,app,BATTLETECH,Take command of your own mercenary outfit of '...,"Mixed,(166),- 54% of the 166 user reviews in t...","Mostly Positive,(7,030),- 71% of the 7,030 use...","Apr 24, 2018",Harebrained Schemes,"Paradox Interactive,Paradox Interactive","Mechs,Strategy,Turn-Based,Turn-Based Tactics,S...","Single-player,Multi-player,Online Multi-Player...","English,French,German,Russian",128.0,"Action,Adventure,Strategy",About This Game From original BATTLETECH/Mec...,,"Minimum:,Requires a 64-bit processor and opera...","Recommended:,Requires a 64-bit processor and o...",$39.99,
3,https://store.steampowered.com/app/221100/DayZ/,app,DayZ,The post-soviet country of Chernarus is struck...,"Mixed,(932),- 57% of the 932 user reviews in t...","Mixed,(167,115),- 61% of the 167,115 user revi...","Dec 13, 2018",Bohemia Interactive,"Bohemia Interactive,Bohemia Interactive","Survival,Zombies,Open World,Multiplayer,PvP,Ma...","Multi-player,Online Multi-Player,Steam Worksho...","English,French,Italian,German,Spanish - Spain,...",,"Action,Adventure,Massively Multiplayer",About This Game The post-soviet country of Ch...,,"Minimum:,OS:,Windows 7/8.1 64-bit,Processor:,I...","Recommended:,OS:,Windows 10 64-bit,Processor:,...",$44.99,
4,https://store.steampowered.com/app/8500/EVE_On...,app,EVE Online,EVE Online is a community-driven spaceship MMO...,"Mixed,(287),- 54% of the 287 user reviews in t...","Mostly Positive,(11,481),- 74% of the 11,481 u...","May 6, 2003",CCP,"CCP,CCP","Space,Massively Multiplayer,Sci-fi,Sandbox,MMO...","Multi-player,Online Multi-Player,MMO,Co-op,Onl...","English,German,Russian,French",,"Action,Free to Play,Massively Multiplayer,RPG,...",About This Game,,"Minimum:,OS:,Windows 7,Processor:,Intel Dual C...","Recommended:,OS:,Windows 10,Processor:,Intel i...",Free,


In [3]:
# mu_df의 열 이름 통일화
mu_df.columns = ["name", "month", "avg_players", "gain", "gain_percentage", "peak_players"]

# mu_df의 시계열 데이터 정규화
mu_df = mu_df.drop(mu_df[mu_df["month"] == "Last 30 Days"].index) # 불필요한 행 삭제
mu_df["month"] = mu_df[["month"]].apply(
    lambda x: datetime.strptime(x["month"], "%b-%y").strftime("%Y-%m"), axis=1
)

# gi_df의 시계열 데이터 정규화
def normalize_release_date(release_date):
    try:
        return datetime.strptime(release_date, "%b %d, %Y").strftime("%Y-%m-%d")
    except:
        return np.nan
        
gi_df["release_date"] = gi_df[["release_date"]].apply(
    lambda x: normalize_release_date(x["release_date"]), axis=1
)

display(mu_df[["name", "month"]].head(3))
display(gi_df[["name", "release_date"]].head(3))

Unnamed: 0,name,month
1,Counter-Strike,2021-09
2,Counter-Strike,2021-08
3,Counter-Strike,2021-07


Unnamed: 0,name,release_date
0,DOOM,2016-05-12
1,PLAYERUNKNOWN'S BATTLEGROUNDS,2017-12-21
2,BATTLETECH,2018-04-24


In [4]:
# 분석 대상 리뷰 데이터의 기간 설정
# COVID-19의 최초 발병 : 2019년 12월, WHO에 의한 국제적 공중보건 비상사태 선포 : 2020년 1월
# 전세계 사람들에게 COVID-19의 인식이 심어진 2020년 1월을 기준으로 잡은 뒤 동일한 개월수를 설정

pheic_date = datetime(2020, 1, 31) # 국제적 공중보건 비상사태(Public Health Emergency of International Concern) 선포일
latest_date = datetime.strptime(mu_df[["month"]].max()["month"], "%Y-%m") # mu_df의 최신 데이터의 일자
num_month = (latest_date.year - pheic_date.year) * 12 + (latest_date.month - pheic_date.month)
start_date = datetime(latest_date.year - (num_month * 2 // 12), latest_date.month - (num_month * 2 % 12), 1) # COVID-19 전/후 비교 시 분석 시작일이 되는 일자

# 분석 대상이 되는 데이터는 2018-05-01 00:00:00부터 작성된 리뷰부터 2021-09-30 23:59:59 까지의 기간만 해당됨
print(f"분석 데이터의 시작 기준일 : {start_date.strftime('%Y-%m-%d')}")
print(f"월별 이용자수 데이터의 최신 기간 : {latest_date.strftime('%Y-%m')}")

분석 데이터의 시작 기준일 : 2018-05-01
월별 이용자수 데이터의 최신 기간 : 2021-09


In [5]:
# mu_df의 미사용 행 제거
mu_df = mu_df.drop(["gain", "gain_percentage", "peak_players"], axis=1)

display(mu_df)

Unnamed: 0,name,month,avg_players
1,Counter-Strike,2021-09,8461.12
2,Counter-Strike,2021-08,8851.19
3,Counter-Strike,2021-07,9137.76
4,Counter-Strike,2021-06,9497.45
5,Counter-Strike,2021-05,10740.95
...,...,...,...
877907,我的侠客 试玩版,2021-09,21.66
877912,The Planet Crafter Demo,2021-09,35.16
878062,Haunted Hotel: The Axiom Butcher Collector's E...,2021-09,3.32
878078,Karryn's Prison Demo,2021-09,3.98


In [6]:
# 분석 기간 이전의 이용자 수 데이터 제거
def is_not_range(month):
    if datetime.strptime(month, "%Y-%m") < start_date:
        return True
    else:
        return False

mu_df["not_range"] = mu_df[["month"]].apply(
    lambda x: is_not_range(x["month"]), axis = 1
)
mu_df = mu_df[mu_df["not_range"] == False].drop(["not_range"], axis=1).reset_index(drop=True)

display(mu_df)

Unnamed: 0,name,month,avg_players
0,Counter-Strike,2021-09,8461.12
1,Counter-Strike,2021-08,8851.19
2,Counter-Strike,2021-07,9137.76
3,Counter-Strike,2021-06,9497.45
4,Counter-Strike,2021-05,10740.95
...,...,...,...
486359,我的侠客 试玩版,2021-09,21.66
486360,The Planet Crafter Demo,2021-09,35.16
486361,Haunted Hotel: The Axiom Butcher Collector's E...,2021-09,3.32
486362,Karryn's Prison Demo,2021-09,3.98


In [7]:
# 분석 기간 사이의 이용자 수 데이터가 부족한 게임 제거
# 게임 별 이용자 수 데이터 개수를 나타내는 tmp_df
tmp_df = mu_df.groupby("name").count().reset_index()[["name", "month"]] 
tmp_df.columns = ["name", "num_of_data"]

display(tmp_df)

Unnamed: 0,name,num_of_data
0,! That Bastard Is Trying To Steal Our Gold !,29
1,"!""Time Lock VR-1""!",27
2,"!""We Are The Dwarves""!",40
3,!4RC4N01D! 2: Retro Edition,16
4,!4RC4N01D! 3: Cold Space 5000 Achievements!,17
...,...,...
26002,龙魂时刻,33
26003,그녀가 공작저로 가야 했던 사정,1
26004,이 AI를 복구해주세요.,2
26005,이라,1


In [8]:
# 데이터프레임 병합을 통해 mu_df에 게임 별 이용자 수 데이터 추가
mu_df = pd.merge(mu_df, tmp_df, how="inner", on="name") 

# 분석 기간 (2018-05 ~ 2021-09) 사이의 이용자 수 데이터가 충분하지 않은 게임 제거
mu_df = mu_df[mu_df["num_of_data"] > (num_month * 2)].drop(["num_of_data"], axis=1)

display(mu_df)

Unnamed: 0,name,month,avg_players
0,Counter-Strike,2021-09,8461.12
1,Counter-Strike,2021-08,8851.19
2,Counter-Strike,2021-07,9137.76
3,Counter-Strike,2021-06,9497.45
4,Counter-Strike,2021-05,10740.95
...,...,...,...
390012,Caffeine,2018-09,7.18
390013,Caffeine,2018-08,10.19
390014,Caffeine,2018-07,11.20
390015,Caffeine,2018-06,11.12


In [9]:
# 해당 열의 month 데이터가 COVID-19 이전인지 이후인지를 구분하기 위한 period 열 추가
def is_before_after(month):
    if datetime.strptime(month, "%Y-%m") >= pheic_date:
        return "after covid-19"
    else:
        return "before covid-19"
    
mu_df["period"] = mu_df[["month"]].apply(
    lambda x: is_before_after(x["month"]), axis=1
)

display(mu_df.sample(10))

Unnamed: 0,name,month,avg_players,period
256262,Solitaire,2020-09,1.43,after covid-19
80583,Obscure 2,2019-09,1.4,before covid-19
80411,FortressCraft Evolved,2019-02,108.87,before covid-19
297719,Dishonored®: Death of the Outsider™,2019-01,126.37,before covid-19
269008,Tanki Online,2021-02,84.67,after covid-19
75231,Cabela's Big Game Hunter Pro Hunts,2019-02,3.93,before covid-19
7397,Dreamfall: The Longest Journey,2020-06,6.34,after covid-19
180643,Vikings - Wolves of Midgard,2019-08,34.34,before covid-19
173608,Hearts of Iron IV,2018-10,12595.39,before covid-19
246228,Uncharted Waters,2019-11,1.69,before covid-19


In [10]:
# COVID-19 이전과 이후의 평균 이용자 수를 비교하기 위한 tmp_df
tmp_df = mu_df[["name", "avg_players", "period"]].groupby(["name", "period"]).sum().reset_index()

display(tmp_df)

Unnamed: 0,name,period,avg_players
0,#killallzombies,after covid-19,102.21
1,#killallzombies,before covid-19,17.60
2,#monstercakes,after covid-19,434.16
3,#monstercakes,before covid-19,1013.44
4,- Arcane RERaise -,after covid-19,974.25
...,...,...,...
9635,永遠消失的幻想鄉 ～ The Disappearing of Gensokyo,before covid-19,390.68
9636,神舞幻想 Faith of Danschant,after covid-19,1007.44
9637,神舞幻想 Faith of Danschant,before covid-19,439.30
9638,鸿源战纪 - Tales of Hongyuan,after covid-19,41.33


In [11]:
# gl is Game List
gl_df = mu_df.drop_duplicates(["name"], ignore_index=True)[["name"]]

display(gl_df)

Unnamed: 0,name
0,Counter-Strike
1,Team Fortress Classic
2,Day of Defeat
3,Deathmatch Classic
4,Half-Life: Opposing Force
...,...
4815,LOGistICAL: ABC Islands
4816,Ultimate Sudoku Collection
4817,FPV Freerider
4818,SMITE - Public Test


In [12]:
# COVID-19 이후(pheic_date 이후) 평균 게임 이용자 수가 증가한 게임 선별
def is_covid_up(name):
    if name.iloc[0, 2] > name.iloc[1, 2]:
        return True
    else:
        return False

gl_df["covid_up"] = gl_df[["name"]].apply(
    lambda x: is_covid_up(tmp_df[tmp_df["name"] == x["name"]]), axis = 1
)

gl_df = gl_df[gl_df["covid_up"]][["name"]].reset_index(drop=True)

display(gl_df)

Unnamed: 0,name
0,Counter-Strike
1,Team Fortress Classic
2,Day of Defeat
3,Deathmatch Classic
4,Half-Life: Opposing Force
...,...
2454,Fish Tycoon 2: Virtual Aquarium
2455,The friends of Ringo Ishikawa
2456,YYNote
2457,FPV Freerider


In [13]:
# gl_df에 선별된 게임 목록에 속성을 병합하기 위한 gi_df 전처리
gi_df = gi_df[gi_df["name"].isin(list(gl_df["name"]))].reset_index(drop=True)

display(gi_df[["name"]])

Unnamed: 0,name
0,DayZ
1,EVE Online
2,Human: Fall Flat
3,For The King
4,Danganronpa V3: Killing Harmony
...,...
1571,Vector
1572,Flashback
1573,Hacker Evolution
1574,Black Ink


In [14]:
# gi_df의 url 열로 부터 app_id 추출
def get_app_id(url):
    pattern = "app/(.+?)/"
    result = re.search(pattern, url)
    if result:
        return result.group(1)
    else:
        # bundle app의 경우 url 패턴이 달라 app_id 대신 None이 리턴됨 
        return None

gi_df["app_id"] = gi_df[["url"]].apply(
    lambda x: get_app_id(x["url"]), axis = 1
)

display(gi_df[["name", "app_id"]])

Unnamed: 0,name,app_id
0,DayZ,221100
1,EVE Online,8500
2,Human: Fall Flat,477160
3,For The King,527230
4,Danganronpa V3: Killing Harmony,567640
...,...,...
1571,Vector,248970
1572,Flashback,245730
1573,Hacker Evolution,70100
1574,Black Ink,233680


In [15]:
# all_reviews 열로 부터 num_reviews 추출
def get_num_reviews(text):
    pattern = "(\d+)"
    result = re.search(pattern, str(text).replace(",", ""))
    if result:
        return int(result.group(0))
    else:
        # bundle app의 경우 url 패턴이 달라 app_id 대신 None이 리턴됨 
        return None
    

gi_df["num_reviews"] = gi_df[["all_reviews"]].apply(
    lambda x: get_num_reviews(x["all_reviews"]), axis = 1
)

display(gi_df[["name", "app_id", "num_reviews"]])

Unnamed: 0,name,app_id,num_reviews
0,DayZ,221100,167115.0
1,EVE Online,8500,11481.0
2,Human: Fall Flat,477160,23763.0
3,For The King,527230,4600.0
4,Danganronpa V3: Killing Harmony,567640,3547.0
...,...,...,...
1571,Vector,248970,448.0
1572,Flashback,245730,352.0
1573,Hacker Evolution,70100,309.0
1574,Black Ink,233680,319.0


In [16]:
# gi_df에서 사용할 데이터를 선별하여 gl_df와 병합(natural join)
gi_df = gi_df[["name", "app_id", "num_reviews", "release_date", "popular_tags", "genre"]]

display(gi_df)

Unnamed: 0,name,app_id,num_reviews,release_date,popular_tags,genre
0,DayZ,221100,167115.0,2018-12-13,"Survival,Zombies,Open World,Multiplayer,PvP,Ma...","Action,Adventure,Massively Multiplayer"
1,EVE Online,8500,11481.0,2003-05-06,"Space,Massively Multiplayer,Sci-fi,Sandbox,MMO...","Action,Free to Play,Massively Multiplayer,RPG,..."
2,Human: Fall Flat,477160,23763.0,2016-07-22,"Funny,Multiplayer,Co-op,Puzzle,Physics,Local C...","Adventure,Indie"
3,For The King,527230,4600.0,2018-04-19,"RPG,Turn-Based Combat,Adventure,Online Co-Op,C...","Adventure,Indie,RPG,Strategy"
4,Danganronpa V3: Killing Harmony,567640,3547.0,2017-09-25,"Story Rich,Anime,Visual Novel,Detective,Myster...",Adventure
...,...,...,...,...,...,...
1571,Vector,248970,448.0,2013-12-04,"Parkour,Action,Indie,Platformer,Singleplayer,2...","Action,Indie"
1572,Flashback,245730,352.0,2013-10-01,"Action,Adventure,Platformer,RPG,Remake,Cyberpu...","Action,Adventure,RPG"
1573,Hacker Evolution,70100,309.0,2010-09-14,"Simulation,Hacking,Puzzle,Singleplayer",Simulation
1574,Black Ink,233680,319.0,2016-02-08,"Design & Illustration,Utilities,Photo Editing,...","Animation & Modeling,Design & Illustration,Pho..."


In [17]:
# 리뷰 개수에 대한 오름차순으로 정렬
gi_df = gi_df.sort_values(by="num_reviews", ascending=True, na_position="last").reset_index(drop=True)

display(gi_df)

Unnamed: 0,name,app_id,num_reviews,release_date,popular_tags,genre
0,Nightshade,1028500,1.0,2019-03-07,"Adventure,Action","Action,Adventure"
1,Black Rose,464510,1.0,2016-06-02,Action,Action
2,GameMaker Studio 2 UWP,585610,1.0,2017-03-08,"Game Development,Design & Illustration,Utiliti...","Animation & Modeling,Design & Illustration,Edu..."
3,Loot Collection: Mahjong,718350,2.0,2017-10-05,"Casual,Simulation,Strategy,Adventure,Indie","Adventure,Casual,Indie,Simulation,Strategy"
4,ChessBase 13 Academy,377340,2.0,2016-08-04,Chess,
...,...,...,...,...,...,...
1571,Total War: SHOGUN 2,201270,,2011-03-15,"Strategy,Historical,Turn-Based Strategy,RTS,Gr...",Strategy
1572,Total War: NAPOLEON - Definitive Edition,,,,,Strategy
1573,Total War: MEDIEVAL II - Definitive Edition,,,,,Strategy
1574,Aliens vs. Predator,,,,,Action


In [19]:
# 데이터 수집을 위한 분석 대상 게임들의 목록인 gi_df를 결측치 처리 및 num_review열을 정수형으로 변환 후 중간 데이터로 저장
gi_df = gi_df.dropna(axis=0, how="any")
gi_df[["num_reviews"]] = gi_df[["num_reviews"]].astype(int)

display(gi_df)
gi_df.to_csv("./dataset/target_games.csv", encoding="utf-8-sig", index=False)

Unnamed: 0,name,app_id,num_reviews,release_date,popular_tags,genre
0,Nightshade,1028500,1,2019-03-07,"Adventure,Action","Action,Adventure"
1,Black Rose,464510,1,2016-06-02,Action,Action
2,GameMaker Studio 2 UWP,585610,1,2017-03-08,"Game Development,Design & Illustration,Utiliti...","Animation & Modeling,Design & Illustration,Edu..."
3,Loot Collection: Mahjong,718350,2,2017-10-05,"Casual,Simulation,Strategy,Adventure,Indie","Adventure,Casual,Indie,Simulation,Strategy"
5,ControlMyJoystick,773210,2,2014-12-01,Utilities,Utilities
...,...,...,...,...,...,...
1566,PAYDAY 2,218620,215078,2013-08-13,"Co-op,Action,FPS,Heist,Online Co-Op,Stealth,Mu...","Action,RPG"
1567,Unturned,304930,325675,2017-07-07,"Free to Play,Survival,Zombies,Multiplayer,Open...","Action,Adventure,Casual,Free to Play,Indie"
1568,Grand Theft Auto V,271590,407706,2015-04-14,"Open World,Action,Multiplayer,Third Person,Fir...","Action,Adventure"
1569,Team Fortress 2,440,553458,2007-10-10,"Free to Play,Multiplayer,FPS,Shooter,Action,Cl...","Action,Free to Play"


---
### 1.2 리뷰 데이터 수집

In [2]:
# 수집 대상 목록 데이터셋 불러오기
# tg is Target
tg_df = pd.read_csv("./dataset/target_games.csv", encoding="utf-8-sig")

display(tg_df)

Unnamed: 0,name,app_id,num_reviews,release_date,popular_tags,genre
0,Nightshade,1028500,1,2019-03-07,"Adventure,Action","Action,Adventure"
1,Black Rose,464510,1,2016-06-02,Action,Action
2,GameMaker Studio 2 UWP,585610,1,2017-03-08,"Game Development,Design & Illustration,Utiliti...","Animation & Modeling,Design & Illustration,Edu..."
3,Loot Collection: Mahjong,718350,2,2017-10-05,"Casual,Simulation,Strategy,Adventure,Indie","Adventure,Casual,Indie,Simulation,Strategy"
4,ControlMyJoystick,773210,2,2014-12-01,Utilities,Utilities
...,...,...,...,...,...,...
1551,PAYDAY 2,218620,215078,2013-08-13,"Co-op,Action,FPS,Heist,Online Co-Op,Stealth,Mu...","Action,RPG"
1552,Unturned,304930,325675,2017-07-07,"Free to Play,Survival,Zombies,Multiplayer,Open...","Action,Adventure,Casual,Free to Play,Indie"
1553,Grand Theft Auto V,271590,407706,2015-04-14,"Open World,Action,Multiplayer,Third Person,Fir...","Action,Adventure"
1554,Team Fortress 2,440,553458,2007-10-10,"Free to Play,Multiplayer,FPS,Shooter,Action,Cl...","Action,Free to Play"


In [3]:
# Steam API를 이용하여 리뷰 데이터를 받아오는 함수 선언
def get_review(app_id, app_name):            
    # 필요 변수 선언
    normalized_review_list = []
    cursor_list = []

    # get request 관련
    params = {
        "filter": "recent",
        "language": "all",
        # "day_range": day_range,
        "cursor": "*",
        "review_type": "all",
        "purchase_type": "all",
        "num_per_page": 100
    }
    url = f"http://store.steampowered.com/appreviews/{app_id}?json=1&{urllib.parse.urlencode(params)}"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
              "Accept": "application/json"}  
    
    # cursor를 받아오기 위한 최초 1회의 request
    try:
        result = requests.get(url, headers=headers)
    except Exception as e:
        time.sleep(2.0)
        result = requests.get(url, headers=headers)
    
    response = json.loads(result.text.replace("\n", " "))
    # response = result.json() # UTF-8 BOM error?
    # response = json.loads(emoji_pattern.sub(r'', result.text).encode().decode("utf-8-sig"))

    if int(response["success"]) != 1 or result.status_code != 200:
        print(f"[System] Request 요청 실패, {app_id}")
        return False

    total_reviews = response["query_summary"]["total_reviews"]
    
    # 리뷰 데이터를 저장하는 비어있는 데이터프레임 생성
    reviews_df = pd.DataFrame(columns = ["app_id", "app_name", "recommendationid", "language", "review", "timestamp_created", "timestamp_updated",
                                         "voted_up", "votes_up", "votes_funny", "weighted_vote_score", "comment_count", "steam_purchase", "received_for_free",
                                         "written_during_early_access", "author.steamid", "author.num_games_owned",
                                         "author.num_reviews", "author.playtime_forever", "author.playtime_last_two_weeks", "author.playtime_at_review", "author.last_played"])
    
    while True:
        # 종료 조건 (무한루프 방지)
        if reviews_df.shape[0] >= total_reviews or params["cursor"] in cursor_list:
            break
            
        url = f"http://store.steampowered.com/appreviews/{app_id}?json=1&{urllib.parse.urlencode(params)}"
        
        # request 오류가 자주 발생하는 부분을 중복적으로 예외처리 (1차 재시도, 2차 오류발생 부분 출력)
        try:
            result = requests.get(url, headers=headers)
            response = json.loads(result.text.replace("\n", " "))
        except Exception as e:
            time.sleep(2.0)
            result = requests.get(url, headers=headers)
            try:
                response = json.loads(result.text.replace("\n", " "))
            except:
                print(f"[Error] Request 오류 발생")
                print(url)
                print(result.text)
                
                return False

        if int(response["success"]) != 1 or result.status_code != 200:
            print(f"[Error] Request 요청 실패, {app_id}")
            continue
            
        reviews = response["reviews"]

        # next cursor - 다음 리뷰 페이지로 연결되는 파라매터
        cursor_list.append(params["cursor"])
        params["cursor"] = response["cursor"].encode()
        
        try:
            for review in reviews:
                # author 값 정규화
                keys = review["author"].keys()
                values = review["author"].values()
                normalized_author = dict(zip(map(lambda x: "author."+x, keys), values))
                review.update(normalized_author)
                del(review["author"])

                # app_id, app_name 추가
                review.update({"app_id": app_id, "app_name": app_name})
                        
                # 데이터프레임에 리뷰 데이터 추가  
                reviews_df = reviews_df.append(review, ignore_index=True)
                    
        except Exception as e:
            print(f"[Error] {e}")
            
            return False
    
    reviews_df.to_csv(f"./dataset/new_review/{app_id}.csv", index=False, encoding="utf-8-sig") 
    
    # 명시적 메모리 관리
    del [[reviews_df]]
    gc.collect()
    reviews_df = pd.DataFrame()
    
    return True

In [4]:
# 리뷰 다운로드 디렉토리 생성
try:
    if not os.path.exists("./dataset/new_review"):
        os.makedirs("./dataset/new_review")
except OSError as e:
        print(e)

# API를 호출하기 위해 ["app_id", "name"] 쌍을 이루는 game_list 선언
game_list = list(zip([app_id for app_id in list(tg_df["app_id"])], [app_name for app_name in list(tg_df["name"])]))

# 완료된 리뷰 데이터를 건너 뛰기 위한 complete_list 선언
complete_list = [int(x.replace(".csv", "")) for x in os.listdir("./dataset/new_review") if ".csv" in x]

# 받아올 리뷰의 임의 개수로, 해당 변수를 조정해가며 적절한 규모의 리뷰 데이터를 수집
limit = 1500

print("리뷰 데이터 다운로드 시작")

# 뒤 리뷰 개수가 낮은 게임의 리뷰 데이터를 수집(리뷰가 너무 많은 게임으로 인한 단어의 편향 우려)
# 리뷰 개수의 내림차순으로 리뷰 데이터를 받아오면서 오류가 발생하는 게임의 경우는 건너뛰도록 함
for count, game_data in enumerate(game_list):
    if count+1 > limit:
        print("리뷰 데이터 다운로드 완료")
        break
        
    app_id, app_name = game_data
    
    # 진행 상황 표시
    print(f"[{str(count+1).zfill(4)}] {app_id}\t- ", end="")
    
    # 해당 게임 리뷰가 이미 존재하는 경우
    if app_id in complete_list:
        print(" Done!")
        continue
        
    # 리뷰 다운로드가 완료된 경우
    if get_review(app_id, app_name):
        print(" Done!")
        complete_list.append(app_id)

리뷰 데이터 다운로드 시작
[0001] 1028500	-  Done!
[0002] 464510	-  Done!
[0003] 585610	-  Done!
[0004] 718350	-  Done!
[0005] 773210	-  Done!
[0006] 573340	-  Done!
[0007] 540360	-  Done!
[0008] 854250	-  Done!
[0009] 600430	-  Done!
[0010] 29540	-  Done!
[0011] 573740	-  Done!
[0012] 567860	-  Done!
[0013] 812930	-  Done!
[0014] 776920	-  Done!
[0015] 748540	-  Done!
[0016] 32120	-  Done!
[0017] 793940	-  Done!
[0018] 701290	-  Done!
[0019] 718080	-  Done!
[0020] 763270	-  Done!
[0021] 570580	-  Done!
[0022] 411060	-  Done!
[0023] 385960	-  Done!
[0024] 583850	-  Done!
[0025] 376760	-  Done!
[0026] 600420	-  Done!
[0027] 38130	-  Done!
[0028] 38120	-  Done!
[0029] 813530	-  Done!
[0030] 320970	-  Done!
[0031] 630830	-  Done!
[0032] 701870	-  Done!
[0033] 38170	-  Done!
[0034] 376770	-  Done!
[0035] 16040	-  Done!
[0036] 275810	-  Done!
[0037] 467430	-  Done!
[0038] 435040	-  Done!
[0039] 539300	-  Done!
[0040] 433290	-  Done!
[0041] 259960	-  Done!
[0042] 585620	-  Done!
[0043] 521880	-  Done!
[