## 레딧 크롤러

글을 저장할 시간 범위를 `hourly` 혹은 `daily` basis로 지정 가능.

- hourly : 실행 시각(UTC)을 정시로 내림한 시점을 끝으로 하는 직전 1시간 범위 (예: 10:25에 실행하면 09:00 이상 10:00 미만)
- daily : 실행 시각(UTC)의 당일 00:00을 끝으로 하는 직전 하루(24시간) 범위

해당 데이터를 DataFrame으로 만든 뒤 JSON 파일로 저장함 (`orient='split'` 형식: 컬럼 이름·인덱스·데이터를 분리해 기록).

파일 이름은 `${subreddit_name}_${start_datetime}_${end_datetime}.json` 형식이며, `start_datetime`과 `end_datetime`에는 시간 범위의 시작·끝 UTC 타임스탬프(초 단위)가 들어감.

### Prerequisites

In [1]:
import praw
from dotenv import dotenv_values   
import pandas as pd
from datetime import datetime, timezone
import numpy as np

In [2]:
def get_time_range(basis='daily'):
    """Return the most recently completed UTC day or hour as a timestamp window.

    The window ends at the current UTC time truncated to the start of the day
    (``'daily'``) or of the hour (``'hourly'``) and starts exactly one day or
    one hour earlier.

    Args:
        basis: ``'daily'`` or ``'hourly'``.

    Returns:
        A ``(start_timestamp, end_timestamp, basis)`` tuple; both timestamps
        are POSIX seconds as floats.

    Raises:
        ValueError: if ``basis`` is neither ``'daily'`` nor ``'hourly'``.
    """
    now = datetime.now(timezone.utc)

    if basis == 'daily':
        window_end = now.replace(hour=0, minute=0, second=0, microsecond=0)
        window_seconds = 3600 * 24
    elif basis == 'hourly':
        window_end = now.replace(minute=0, second=0, microsecond=0)
        window_seconds = 3600
    else:
        # ValueError is the built-in for "right type, unacceptable value"; the
        # f-string also copes with a non-string basis such as None.
        raise ValueError(f"Invalid Basis type: {basis}")

    end_timestamp = window_end.timestamp()
    start_timestamp = end_timestamp - window_seconds
    return (start_timestamp, end_timestamp, basis)

In [3]:
def get_last_element_of(iterable):
    """Return the final item produced by *iterable*, consuming it completely.

    Any iterable is accepted (list, string, generator, iterator, ...).

    Args:
        iterable: the object to exhaust.

    Returns:
        The last item yielded.

    Raises:
        StopIteration: if *iterable* yields no items.
    """
    # iter() lets plain iterables such as lists work too; calling next()
    # directly on them raises TypeError because they are not iterators.
    iterator = iter(iterable)
    # On an empty input this next() raises StopIteration, which is
    # intentionally left to propagate to the caller.
    last_element = next(iterator)
    for last_element in iterator:
        pass
    return last_element

In [18]:
class SubredditScraper:
    """Collect the submissions of one subreddit, with their comments, into a DataFrame.

    The class only needs ``reddit_instance.subreddit(name)`` to return an object
    whose ``new(limit=...)`` yields submissions, newest first.
    """

    # Submission attributes copied verbatim into the output frame.
    columns = [
        'title', 'created_utc', 'distinguished', 'id', 'is_self', 'name', 'score', 'selftext', 'upvote_ratio'
    ]

    # Attributes read from every submission, in the order they are read.
    _submission_fields = tuple(columns) + ('num_comments',)
    # Column order of the frame returned by get_submissions().
    _output_columns = _submission_fields + ('comments',)

    def __init__(self, reddit_instance, subreddit_name: str):
        """Bind the scraper to ``reddit_instance.subreddit(subreddit_name)``."""
        self.__subreddit = reddit_instance.subreddit(subreddit_name)

    def __get_search_limit(self, start_time, start=100, step=100):
        """Find a ``limit`` for ``new()`` that reaches back to *start_time*.

        The listing is re-requested with a growing limit until its last
        (oldest) item is not newer than *start_time*, or until the listing
        returns fewer items than requested.

        Args:
            start_time: POSIX timestamp marking the start of the window.
            start: first limit to try.
            step: how much the limit grows per attempt. The default is 100
                because that is the increment the loop has always applied.

        Returns:
            The limit to pass to ``new()``.

        Raises:
            ValueError: if ``step`` is not positive (the loop could not advance).
        """
        if step < 1:
            raise ValueError("step must be a positive integer")

        search_limit = start

        while True:
            fetched = 0
            oldest = None
            for oldest in self.__subreddit.new(limit=search_limit):
                fetched += 1

            # Fewer items than requested means the listing is exhausted (small
            # subreddit or a cap on the listing); a larger limit cannot reach
            # older posts, so growing it again would loop forever.
            if fetched < search_limit or oldest is None:
                break
            if oldest.created_utc <= start_time:
                break
            search_limit += step

        return search_limit

    def get_submissions(self, time_range):
        """Return the submissions created inside *time_range* as a DataFrame.

        Args:
            time_range: sequence whose first two items are the start and end
                POSIX timestamps; the start is inclusive and the end exclusive.
                Further items (such as the basis name) are ignored.

        Returns:
            A ``pandas.DataFrame`` with one row per submission and the columns
            ``title, created_utc, distinguished, id, is_self, name, score,
            selftext, upvote_ratio, num_comments, comments``. ``comments`` holds
            a list of ``{'id', 'score', 'body', 'created_utc'}`` dicts. The
            frame has zero rows when nothing falls inside the window.
        """
        start_time, end_time = time_range[0], time_range[1]

        search_limit = self.__get_search_limit(start_time)

        rows = []
        for item in self.__subreddit.new(limit=search_limit):
            if not (start_time <= item.created_utc < end_time):
                continue

            row = {field: getattr(item, field) for field in self._submission_fields}

            # NOTE(review): with limit=0 this presumably just drops the
            # "load more comments" placeholders so list() yields real comments
            # only - confirm against the PRAW replace_more documentation.
            item.comments.replace_more(limit=0)
            row['comments'] = [
                {
                    'id': comment.id,
                    'score': comment.score,
                    'body': comment.body,
                    'created_utc': comment.created_utc,
                }
                for comment in item.comments.list()
            ]
            rows.append(row)

        # Build column-wise so that an empty result still carries every column.
        return pd.DataFrame({
            column: [row[column] for row in rows]
            for column in self._output_columns
        })
        
        

## Main

In [12]:
# Read the settings stored in "../.env" (path relative to the working directory);
# the result is indexed by variable name when the Reddit client is created.
config = dotenv_values("../.env")

In [13]:
# Create the Reddit API client. The client id and secret come from the loaded
# .env settings, so both REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET must be set.
reddit = praw.Reddit(
    client_id=config['REDDIT_CLIENT_ID'],
    client_secret=config['REDDIT_CLIENT_SECRET'],
    user_agent="testscript by u/No_Masterpiece_9985"
)

In [14]:
# (start_timestamp, end_timestamp, basis) for the last completed UTC day.
time_range = get_time_range(basis = 'daily')

In [20]:
# Subreddits to scrape. Order matters: the save step pairs these names with
# the scraped frames by position.
subreddit_lists = ["television", "netflix", "NetflixBestOf"]

In [16]:
# Collects one DataFrame per scraped subreddit, in subreddit_lists order.
# Re-run this cell before re-running the scraping loop, otherwise the loop
# appends a second set of frames and the positions no longer match the names.
dataFrames = []

In [19]:
# Scrape every configured subreddit for the chosen time window and keep the
# resulting frames in the same order as subreddit_lists.
for name in subreddit_lists :
    instance = SubredditScraper(reddit, name)
    # One row per submission created inside time_range, comments included.
    df = instance.get_submissions(time_range)
    dataFrames.append(df)

(4, 11)


KeyboardInterrupt: 

In [None]:
from pathlib import Path

# Write one JSON file per scraped subreddit into ../data, named
# <subreddit>_<start_timestamp>_<end_timestamp>.json.
output_dir = Path('../data')
# to_json does not create missing directories, so make sure the target exists.
output_dir.mkdir(parents=True, exist_ok=True)

# zip pairs each frame with its subreddit name by position and stops at the
# shorter sequence, so a partially completed scrape still saves what it has.
for subreddit_name, df in zip(subreddit_lists, dataFrames):

    file_name = f'{subreddit_name}_{time_range[0]}_{time_range[1]}'

    df.to_json(output_dir / f'{file_name}.json', orient='split', compression='infer')
    