#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2018 Andrea Esuli (andrea@esuli.it)
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import csv
import datetime
import json
import os
import re
import sys
from bs4 import BeautifulSoup
# Patterns applied to the text of the individual review sub-elements.
# The numeric groups accept ',' so that values rendered with thousands
# separators ("1,234 people found ...") are captured whole instead of being
# truncated to the digits after the last comma; the commas are stripped by
# _matched_number() before the value is written out.
_HELPFUL_RE = re.compile(r'([0-9][0-9,]*) [a-z]+? found this review helpful')
_FUNNY_RE = re.compile(r'([0-9][0-9,]*) [a-z]+? found this review funny')
_OWNED_RE = re.compile(r'([0-9][0-9,]*) product')
_NUMREV_RE = re.compile(r'([0-9][0-9,]*) review')
_TIME_RE = re.compile(r'([0-9][0-9.,]*) hrs on record')
_POSTED_RE = re.compile(r'Posted: (.+)')
# The game id is taken from a directory name ending in "app-<digits>".
_APP_ID_RE = re.compile(r'app-([0-9]+)$')
# A posted date that already ends with a four-digit year.
_HAS_YEAR_RE = re.compile(r'.* [0-9][0-9][0-9][0-9]$')
_USER_RE = re.compile(r'/(profiles|id)/(.+?)/')


def _find_div(parent, css_class):
    """Return the first <div> under *parent* with the given class, or None."""
    return parent.find('div', attrs={'class': css_class})


def _matched_number(pattern, elem):
    """Return group 1 of *pattern* in *elem*'s text without commas, else 0."""
    if elem is None:
        return 0
    m = pattern.search(elem.text)
    if m is None:
        return 0
    return m.group(1).replace(',', '')


def _load_page(fullpath):
    """Parse the HTML stored under the 'html' key of a JSON page file.

    Returns the parsed document, or None (after reporting on stderr) when the
    file is not valid JSON/UTF-8 or does not contain an 'html' entry.
    Errors raised while opening the file are not caught.
    """
    try:
        with open(fullpath, encoding='utf8') as f:
            html = json.loads(f.read())['html']
    except (ValueError, KeyError, TypeError):
        print('error on ', fullpath, file=sys.stderr)
        return None
    return BeautifulSoup(html, "html.parser")


def _review_row(reviewdiv, id_):
    """Build the CSV row for one 'review_box' element of game *id_*."""
    vote_info = _find_div(reviewdiv, 'vote_info')
    helpful = _matched_number(_HELPFUL_RE, vote_info)
    funny = _matched_number(_FUNNY_RE, vote_info)

    username = '__anon__'
    persona = _find_div(reviewdiv, 'persona_name')
    if persona is not None:
        # The user key is read from the profile link nested in the element.
        m = _USER_RE.search(str(persona.contents))
        if m:
            username = m.group(2)

    owned = _matched_number(_OWNED_RE, _find_div(reviewdiv, 'num_owned_games'))
    numrev = _matched_number(_NUMREV_RE, _find_div(reviewdiv, 'num_reviews'))

    # 1 = recommended, -1 = any other title, 0 = title element missing.
    recco = 0
    title = _find_div(reviewdiv, 'title ellipsis')
    if title is not None:
        recco = 1 if title.text == 'Recommended' else -1

    time = _matched_number(_TIME_RE, _find_div(reviewdiv, 'hours ellipsis'))

    posted = 0
    posted_div = _find_div(reviewdiv, 'postedDate')
    if posted_div is not None:
        m = _POSTED_RE.search(posted_div.text)
        if m:
            posted = m.group(1).strip()
            if not _HAS_YEAR_RE.match(posted):
                # NOTE(review): a date without a year gets the year in which
                # this script runs, which may differ from the year the page
                # was downloaded -- confirm this is acceptable.
                posted = posted + ", %s" % datetime.date.today().year

    content = ''
    content_div = _find_div(reviewdiv, 'content')
    if content_div is not None:
        content = content_div.text.strip()

    return (id_, helpful, funny, username, owned, numrev, recco, time, posted, content)


def extract_reviews(basepath, outputfile_name):
    """Extract reviews from downloaded review pages into a CSV file.

    Walks *basepath*; every directory whose path ends in ``app-<digits>`` is
    treated as a game and each file in it is read as a JSON document whose
    ``'html'`` entry holds review markup. One CSV row is written per
    ``review_box`` element with the columns:
    game id, helpful votes, funny votes, user name, owned products,
    number of reviews, recommendation (1 / -1 / 0), hours on record,
    posted date, review text.

    Directories that do not match are reported on stderr and skipped. Files
    that cannot be parsed are reported on stderr and skipped, so a broken
    page neither aborts the run nor re-emits the rows of the previous page.

    :param basepath: root directory of the downloaded pages.
    :param outputfile_name: path of the CSV file to (over)write.
    """
    with open(outputfile_name, mode="w", encoding="utf-8", newline="") as outputfile:
        writer = csv.writer(outputfile)
        for root, _, files in os.walk(basepath):
            m = _APP_ID_RE.search(root)
            if m is None:
                print('skipping non-game path ', root, file=sys.stderr)
                continue
            id_ = m.group(1)
            for file in files:
                fullpath = os.path.join(root, file)
                print('processing', fullpath)
                soup = _load_page(fullpath)
                if soup is None:
                    continue
                for reviewdiv in soup.find_all('div', attrs={'class': 'review_box'}):
                    writer.writerow(_review_row(reviewdiv, id_))
def main():
    """Command-line entry point: read the options and run the extraction."""
    cli = argparse.ArgumentParser(description='Extractor of Steam reviews')
    # Both options are optional and fall back to the paths under ./data.
    option_specs = (
        (('-i', '--input'), {
            'help': 'Input file or path (all files in subpath are processed)',
            'default': "./data/pages/reviews",
        }),
        (('-o', '--output'), {
            'help': 'Output file',
            'default': './data/reviews.csv',
        }),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    options = cli.parse_args()
    extract_reviews(options.input, options.output)


if __name__ == '__main__':
    main()