This repository has been archived by the owner on Mar 26, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
awwnimeScraper.py
91 lines (79 loc) · 3.29 KB
/
awwnimeScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#
# /r/awwnime scraper that downloads the images linked from a subreddit's hot listing
# (subreddit and item count come from the command line; defaults are 'all' and 50)
#
# NOTE: reddit limits requests to at most 1000 items of anything
# However, praw will split requests up into 100 item batches
#
import sys
import praw # Reddit python wrapper
import time # Used to get the time the image was DL'ed
import urllib # Used to download images
import urllib.request # ^^^
from image import image # Custom image class
import os # Folder operations
# Constants for connecting to reddit.
PLATFORM = 'windows'
VERSION = '0.5'
APP_ID = 'TopImageScraper'
USERNAME = 'I_Collect_Images'
# Reddit asks API clients to identify themselves with a user agent shaped like
#   <platform>:<app ID>:<version string> (by /u/<reddit username>)
# The space and parentheses around the "by" part keep the version readable.
USER_AGENT = PLATFORM + ':' + APP_ID + ':' + VERSION + ' (by /u/' + USERNAME + ')'
# Root folder that every download is stored under (real user name removed for GitHub).
TOP_FOLDER = 'C:\\Users\\<USER>\\Pictures\\Anime\\Scraped'
# Other constants.
# Image extensions the scraper is willing to download.
FILE_TYPES = ['.jpg', '.png']
# Shared praw client, created once when the module is loaded and used by
# testConnection() and scrapeImages().
redditConnection = praw.Reddit(user_agent=USER_AGENT)
# defaults to be changed by the user (setup() overwrites them from sys.argv)
subreddit = 'all'  # name handed to get_subreddit() by scrapeImages()
count = 50  # handed to get_hot() as limit= by scrapeImages()
includeGifs = False  # not read anywhere in this file; its command-line switch is commented out in setup()
def setup():
if len(sys.argv) > 1:
global subreddit, count, includeGifs
subreddit = sys.argv[1]
count = int(sys.argv[2])
# Temporarily removing this feature
# includeGifs = sys.argv[3].upper() == 'TRUE'
def testConnection():
    """
    Tests out the connection to reddit by grabbing the
    top 3 submissions in /r/all and displaying their
    URL along with the submitter

    Only the first 35 characters of each URL are shown.
    Takes no arguments and returns nothing; the result
    is printed to stdout.
    """
    print('------------------------------------')
    print('Testing connection...')
    testSub = redditConnection.get_subreddit('all')
    print('Top 3 submissions in /r/all right now')
    for listing, submission in enumerate(testSub.get_hot(limit=3), start=1):
        # praw hands back the author as an object rather than a str, so it
        # has to be converted before it can be concatenated onto the line.
        uprint('#' + str(listing) + ': ' + submission.url[:35] + ' posted by: ' + str(submission.author))
    print('Test Complete!')
    print('------------------------------------')
def scrapeImages():
    """
    Main entry point to scrape images off the given subreddit

    Requests the first `count` hot submissions of `subreddit` and wraps
    each one in an `image` object. Submissions whose imageType is not
    listed in FILE_TYPES are reported and skipped. Every other one is
    saved under TOP_FOLDER, in the sub-folder named by data.folderName,
    as data.fileName. Files that already exist are not downloaded again.

    A download that fails with an OSError (which includes urllib's
    URLError and HTTPError) is reported and skipped, so one dead link
    does not abort the rest of the run.
    """
    listings = redditConnection.get_subreddit(subreddit).get_hot(limit=count)
    for idx, submission in enumerate(listings, start=1):
        uprint('#' + str(idx) + ' getting: ' + submission.url[:35] + ' titled: ' + submission.title[:25])
        data = image(submission.title, submission.url)
        # Membership test keeps working if FILE_TYPES gains more extensions.
        if data.imageType not in FILE_TYPES:
            print('^ERROR! Could not download the file. No proper extension was given^')
            continue
        folder = os.path.join(TOP_FOLDER, data.folderName)
        os.makedirs(folder, exist_ok=True)
        target = os.path.join(folder, data.fileName)
        if os.path.isfile(target):
            # Already downloaded on an earlier run.
            continue
        try:
            urllib.request.urlretrieve(data.url, target)
        except OSError as err:
            uprint('^ERROR! Could not download the file. ' + str(err) + '^')
# http://stackoverflow.com/questions/14630288/unicodeencodeerror-charmap-codec-cant-encode-character-maps-to-undefined
# Unicode print function. This is to ensure that unicode strings in post titles don't break stdout
def uprint(*objects, sep=' ', end='\n', file=sys.stdout):
    """
    print() replacement that does not fail on characters the
    target stream cannot encode

    Takes the same arguments as print(). Each object is converted
    with str() and any character that the stream's encoding cannot
    represent is written as a backslash escape (for example \\xe9)
    instead of raising UnicodeEncodeError.

    Streams that report no encoding (such as io.StringIO) are
    written to directly, since there is nothing to escape for.
    """
    enc = getattr(file, 'encoding', None)
    if enc is None:
        print(*objects, sep=sep, end=end, file=file)
        return
    # Round-trip through the stream's own encoding. Text it can represent
    # comes back unchanged, so UTF-8 streams need no special case and the
    # result does not depend on how the encoding name is capitalised.
    safe = (str(obj).encode(enc, errors='backslashreplace').decode(enc) for obj in objects)
    print(*safe, sep=sep, end=end, file=file)
if __name__ == "__main__":
    # Apply the command line overrides, then run the scrape.
    setup()
    # testConnection()  # uncomment to check the reddit connection before scraping
    scrapeImages()
    # sys.exit() rather than the bare exit(): the latter is added by the site
    # module for interactive use and is not available when site is disabled.
    sys.exit()