Skip to content

Commit

Permalink
整理代码
Browse files Browse the repository at this point in the history
  • Loading branch information
awolfly9 committed Mar 6, 2017
1 parent 02daa93 commit 4f78ab5
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 170 deletions.
52 changes: 15 additions & 37 deletions steam/spiders/gameinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def init(self):
"`metacritic_score` FLOAT DEFAULT NULL,"
"`user_reviews_count` INT(6) NOT NULL,"
"`positive_user_reviews_count` INT(6) NOT NULL,"
"`positive_percent` FLOAT NOT NULL ,"
"`negative_user_reviews_count` INT(6) NOT NULL,"
'`steam_user_reviews_count` INT(6) NOT NULL,'
'`non_steam_user_reviews_count` INT(6) NOT NULL,'
Expand Down Expand Up @@ -84,7 +85,7 @@ def start_requests(self):
)

def parse_game(self, response):
self.log('parse_game:\n%s' % response.url)
self.log('parse_game url:%s' % response.url)
id = response.meta.get('id')

# file_name = '%s/%s.html' % (self.dir_game, id)
Expand All @@ -106,9 +107,6 @@ def parse_game(self, response):
},
callback = self.parse_game
)
elif u'Content in this product may not be appropriate for all ages' in response.body:
self.log('need valicate meta:%s' % response.meta)
return

soup = BeautifulSoup(response.body, 'lxml')
sel = Selector(text = response.body)
Expand All @@ -125,6 +123,7 @@ def parse_game(self, response):
except:
price = -1

# 该游戏在 metacritic 上的评分
metacritic_score = sel.xpath('//div[@class="score high"]/text()').extract_first()
try:
metacritic_score = int(metacritic_score)
Expand All @@ -139,6 +138,12 @@ def parse_game(self, response):
positive_user_reviews_count = sel.xpath('//label[@for="review_type_positive"]/span/text()').extract_first()
positive_user_reviews_count = self.count_to_int(positive_user_reviews_count)

# 好评的百分比
if user_reviews_count != -1 and positive_user_reviews_count != -1:
positive_percent = positive_user_reviews_count * 1.0 / user_reviews_count * 100
else:
positive_percent = 0

# 差评的用户数量
negative_user_reviews_count = sel.xpath('//label[@for="review_type_negative"]/span/text()').extract_first()
negative_user_reviews_count = self.count_to_int(negative_user_reviews_count)
Expand Down Expand Up @@ -204,29 +209,24 @@ def parse_game(self, response):
save_time = None

msg = (id, name, price, response.url, metacritic_score, user_reviews_count, positive_user_reviews_count,
negative_user_reviews_count, steam_user_reviews_count, non_steam_user_reviews_count,
positive_percent, negative_user_reviews_count, steam_user_reviews_count, non_steam_user_reviews_count,
english_user_reviews_count, non_english_user_reviews_count, tag_list, achievements_count, category,
genre, developer, publisher, release_date, language_number, description, save_time)

command = ("INSERT IGNORE INTO {} "
"(id, name, price, url, metacritic_score, user_reviews_count, positive_user_reviews_count, "
"negative_user_reviews_count, steam_user_reviews_count, non_steam_user_reviews_count, "
"english_user_reviews_count, non_english_user_reviews_count, tag_list, achievements_count, "
"category, genre, developer, publisher, release_date, language_number, description, save_time)"
"VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
"positive_percent, negative_user_reviews_count, steam_user_reviews_count, "
"non_steam_user_reviews_count, english_user_reviews_count, non_english_user_reviews_count, "
"tag_list, achievements_count, category, genre, developer, publisher, release_date, "
"language_number, description, save_time)"
"VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
"%s)".format(config.steam_game_info_table))

self.sql.insert_data(command, msg)

command = "UPDATE {0} SET is_crawled=\'yes\' WHERE id=\'{1}\'".format(config.steam_game_urls_table, id)
self.sql.execute(command)

def validate(self, response):
    """Persist the body of *response* under "<id>.html" for manual review."""
    page_id = self.get_id(response.url)
    self.save_page('%s.html' % page_id, response.body)

def error_parse(self, faiture):
request = faiture.request
utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))
Expand Down Expand Up @@ -267,28 +267,6 @@ def count_to_int(self, data):
except:
return -1

def parse_steam(self, response):
    """Append the URL of a crawled steam page to log/steam_urls.txt.

    One URL per line; the file is opened in append mode so repeated calls
    accumulate. Review fixes: the explicit ``f.close()`` was redundant inside
    a ``with`` block and has been removed, and the mode is ``'a'`` instead of
    ``'a+'`` since the file is never read here.
    """
    file_name = 'log/steam_urls.txt'
    with open(file_name, 'a') as f:
        f.write(response.url + '\n')

def parse_game_by_number(self, response):
    """Log the URL handled by this callback; no parsing is done yet."""
    message = 'parse_game_by_number:\n%s' % response.url
    self.log(message)

def parse_player(self, response):
    """Save a player profile page body to players/<last-url-segment>.html.

    The file name is the last '/'-separated component of the response URL.
    Review fixes: removed a dead ``str(url).split('/')`` statement whose
    result was discarded, and the redundant ``f.close()`` inside ``with``.
    """
    self.log('parse_player:\n%s' % response.url)

    # Last path segment of the profile URL becomes the file name.
    name = str(response.url).split('/')[-1]

    with open('players/%s.html' % name, 'w') as f:
        f.write(response.body)

def save_page(self, file_name, data):
with open(file_name, 'w') as f:
f.write(data)
Expand Down
135 changes: 2 additions & 133 deletions steam/spiders/gameurls.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,9 @@
import config
import utils

from scrapy.spiders import CrawlSpider, Spider
from scrapy.spiders import Rule
from scrapy import Request, FormRequest
from scrapy.linkextractors.sgml import SgmlLinkExtractor as sle
from scrapy.spiders import Spider
from scrapy import Request
from scrapy.selector import Selector
from bs4 import BeautifulSoup
from sqlhelper import SqlHelper


Expand All @@ -27,8 +24,6 @@ def __init__(self, *a, **kw):

utils.make_dir(self.dir_game)

self.game_count = 0

def init(self):
command = (
"CREATE TABLE IF NOT EXISTS {} ("
Expand Down Expand Up @@ -87,11 +82,6 @@ def parse_all(self, response):
"VALUES(%s, %s, %s, %s, %s, %s)".format(config.steam_game_urls_table))

self.sql.insert_data(command, msg)
count = count + 1

self.game_count = self.game_count + count
utils.log('game_list length:%s insert count:%s self.game_count:%s meta:%s' %
(str(len(game_list)), str(count), str(self.game_count), response.meta))

def error_parse(self, faiture):
request = faiture.request
Expand Down Expand Up @@ -121,127 +111,6 @@ def get_id(self, url):
utils.log('get_id error url:%s' % url)
return 0, 'error'

def parse_game(self, response):
    """Parse a steam game detail page and persist basic fields (id, name, price, url).

    The raw page body is also saved to disk for offline processing. Many
    additional review/metadata fields are extracted below but are not yet
    written to the database (see the NOTE near the end).
    """
    self.log('parse_game:\n%s' % response.url)

    url = response.url
    # The game id is embedded in the URL as /app/<id>/.
    # BUG FIX: regex is now a raw string so '\d' is not an escape hazard.
    pattern = re.compile(r'/app/(\d+)/', re.S)
    id = re.search(pattern, url)
    if id:
        id = id.group(1)
    # NOTE(review): if the URL does not match, `id` stays None and the page
    # is saved as "None.html" below — confirm whether that is intended.

    file_name = '%s/%s.html' % (self.dir_game, id)
    self.save_page(file_name, response.body)

    soup = BeautifulSoup(response.body, 'lxml')
    sel = Selector(text = response.body)

    # Basic information from the page header / purchase section.
    name = sel.xpath('//div[@class="apphub_AppName"]/text()').extract_first()
    price = sel.xpath('//div[@class="game_purchase_action"]/div/div/text()').extract_first()

    # metacritic aggregate score, if the page shows one.
    metacritic_score = sel.xpath('//div[@class="score high"]/text()').extract_first()

    # Recent / overall user-review summary widgets.
    reviews_recent_positive = sel.xpath('//div[@class="user_reviews"]/div[2]/div[2]/span[2]/text()').extract_first()
    reviews_recent_positive_percent = sel.xpath(
        '//div[@class="user_reviews"]/div[2]/div[2]/span[3]/text()').extract_first()
    # BUG FIX: this expression originally started with '////', which is not
    # valid XPath and makes lxml raise instead of returning a value.
    reviews_overall_positive = sel.xpath(
        '//div[@class="user_reviews"]/div[3]/div[2]/span[2]/text()').extract_first()
    reviews_overall_positive_percent = sel.xpath(
        '//div[@class="user_reviews"]/div[3]/div[2]/span[3]/text()').extract_first()

    # Popular user-assigned tags block.
    tags = soup.find(attrs = {'class': 'glance_tags popular_tags'})

    # Review counts by filter type in the reviews widget.
    review_all = sel.xpath('//label[@for="review_type_all"]/span/text()').extract_first()
    review_positive = sel.xpath('//label[@for="review_type_positive"]/span/text()').extract_first()
    review_negative = sel.xpath('//label[@for="review_type_negative"]/span/text()').extract_first()
    review_purchase_steam = sel.xpath('//label[@for="purchase_type_steam"]/span/text()').extract_first()
    review_purchase_cd_key = sel.xpath('//label[@for="purchase_type_cd_key"]/span/text()').extract_first()
    review_chinese_language = sel.xpath('//label[@for="review_language_mine"]/span/text()').extract_first()

    # Achievement count block.
    achievements = sel.xpath('//div[@id="achievement_block"]/div/text()').extract_first()

    curators = ''

    # Store breadcrumbs / details block.
    category = soup.find(name = 'div', attrs = {'class': 'breadcrumbs'})
    genre = sel.xpath('//div[@class="block_content"]/div/div/a/text()').extract_first()
    developer = sel.xpath('//div[@class="block_content"]/div/div/a[2]/text()').extract_first()
    publisher = sel.xpath('//div[@class="block_content"]/div/div/a[3]/text()').extract_first()
    release_date = sel.xpath('//div[@class="block_content"]/div/div/b[5]/text()').extract_first()
    description = sel.xpath('//div[@class="game_description_snippet"]/text()').extract_first()

    # Placeholders for fields not scraped yet.
    dlc_number = ''
    dlc_names = ''
    dlc_prices = ''
    language_number = ''
    languages = ''

    # NOTE(review): only (id, name, price, url) are persisted for now; the
    # remaining fields extracted above are collected but not yet stored.
    msg = (id, name, price, url)
    command = utils.get_insert_data_command(config.steam_game_table)

    self.sql.insert_data(command, msg)

def parse_steam(self, response):
    """Append the URL of a crawled steam page to log/steam_urls.txt.

    One URL per line; the file is opened in append mode so repeated calls
    accumulate. Review fixes: the explicit ``f.close()`` was redundant inside
    a ``with`` block and has been removed, and the mode is ``'a'`` instead of
    ``'a+'`` since the file is never read here.
    """
    file_name = 'log/steam_urls.txt'
    with open(file_name, 'a') as f:
        f.write(response.url + '\n')

def parse_game_by_number(self, response):
    """Log the URL handled by this callback; no parsing is done yet."""
    self.log('parse_game_by_number:\n%s' % response.url)

def parse_player(self, response):
    """Save a player profile page body to players/<last-url-segment>.html.

    The file name is the last '/'-separated component of the response URL.
    Review fixes: removed a dead ``str(url).split('/')`` statement whose
    result was discarded, and the redundant ``f.close()`` inside ``with``.
    """
    self.log('parse_player:\n%s' % response.url)

    # Last path segment of the profile URL becomes the file name.
    name = str(response.url).split('/')[-1]

    with open('players/%s.html' % name, 'w') as f:
        f.write(response.body)

def save_page(self, file_name, data):
with open(file_name, 'w') as f:
f.write(data)
Expand Down

0 comments on commit 4f78ab5

Please sign in to comment.