In [None]:
from googleapiclient.discovery import build
import pandas as pd
import os
import json
import time
import googleapiclient.errors

import logging

from tqdm.notebook import tqdm
import random

In [None]:
# Read the YouTube Data API key from the environment; the value is None when
# the variable is not set.
api_key = os.getenv('YOUTUBE_DATA_API_KEY')

In [None]:
# Client for the YouTube Data API v3, authenticated with the API key above.
youtube = build(
    serviceName='youtube',
    version='v3',
    developerKey=api_key,
)

In [None]:
# Load the ground-truth table; the file is JSON Lines (one JSON object per line).
comments = pd.read_json(
    "../pseudoscience-paper-data/groundtruth_videos_comments_ids.json",
    lines=True,
)

In [None]:
# Preview the first five rows of the loaded table.
comments.head(n=5)

In [None]:
# The 'id' column holds the video ids that will be requested from the API.
video_ids = comments.loc[:, 'id']

In [None]:
# function to add to JSON
def write_json(comment_response, filename='raw_comment_responses.json'):
    """Append one response to the ``"comments"`` list stored in a JSON file.

    The existing store is read in full, ``comment_response`` is appended to
    the list under the ``"comments"`` key, and the file is rewritten with
    4-space indentation.

    Args:
        comment_response: JSON-serialisable object to append (the scraping
            loop passes a ``{video_id: response}`` dict).
        filename: Path of the JSON store. If the file is missing or empty,
            a new store with an empty ``"comments"`` list is started.

    Raises:
        json.JSONDecodeError: If the existing file is not valid JSON.
        KeyError: If the existing JSON object has no ``"comments"`` key.
        TypeError: If ``comment_response`` is not JSON-serialisable.
    """
    # Start a fresh store on the first run instead of failing inside open().
    if os.path.exists(filename) and os.path.getsize(filename) > 0:
        with open(filename, 'r') as file:
            file_data = json.load(file)
    else:
        file_data = {"comments": []}

    file_data["comments"].append(comment_response)

    # Write to a sibling temp file and swap it in. Overwriting in place could
    # leave stale trailing bytes when the new text is shorter than the old,
    # and an interrupt in the middle of the dump would corrupt every response
    # saved so far.
    tmp_name = f"{filename}.tmp"
    with open(tmp_name, 'w') as file:
        json.dump(file_data, file, indent=4)
    os.replace(tmp_name, filename)

In [None]:
# Append INFO-and-above records to log.txt. Each record is prefixed with an
# HH:MM:SS timestamp followed by the millisecond part.
logging.basicConfig(
    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
    datefmt='%H:%M:%S',
    level=logging.INFO,
    filemode='a',
    filename='log.txt',
)

In [None]:
# Load the responses saved by earlier runs so the scrape can resume.
# NOTE: this rebinds `comments` (until now the DataFrame read from
# groundtruth_videos_comments_ids.json) to the saved-response dict.
# A missing or empty store means nothing has been scraped yet.
if (os.path.exists("raw_comment_responses.json")
        and os.path.getsize("raw_comment_responses.json") > 0):
    # The with-block closes the handle, which a bare json.load(open(...)) leaks.
    with open("raw_comment_responses.json") as saved_file:
        comments = json.load(saved_file)
else:
    comments = {"comments": []}

In [None]:
# Collect the first key of every saved entry. The scraping loop stores each
# entry as a {video_id: response} dict, so that key is the video id.
completed_requests = [
    list(saved_entry)[0]
    for saved_entry in comments['comments']
]

In [None]:
# Number of entries already present in the saved-response store.
len(completed_requests)

In [None]:
# Error reasons that mean "stop scraping", not "this video has no comments".
QUOTA_ERROR_REASONS = (b"quotaExceeded", b"dailyLimitExceeded", b"rateLimitExceeded")


def _is_quota_error(error):
    """Return True if an HttpError body names a quota or rate-limit reason."""
    body = getattr(error, "content", b"") or b""
    if isinstance(body, str):
        body = body.encode("utf-8", errors="replace")
    return any(reason in body for reason in QUOTA_ERROR_REASONS)


max_count = len(video_ids)
print(len(completed_requests))

logging.info("\n")
p_bar = tqdm(range(max_count))
already_completed = 0
total_time = 0
previous_time = 0
previous_id = "START"
# Set copy for O(1) membership checks; the list itself is still appended to.
completed_lookup = set(completed_requests)

for number in p_bar:
    start = time.time()
    # Positional access: `number` counts rows, it is not an index label.
    video_id = video_ids.iloc[number]
    p_bar.set_description(f'Working on {video_id}... Average Time: {total_time/(number + 1 - already_completed)}... Total Time: {total_time}... Last Write: {previous_time} seconds for {previous_id}')
    if video_id in completed_lookup:
        logging.info(f"{video_id} already scraped")
        already_completed += 1
        continue

    try:
        # NOTE: only the first page (up to 100 threads) is requested; no
        # nextPageToken is followed.
        video_response = youtube.commentThreads().list(
            part='snippet,replies',
            videoId=video_id,
            maxResults=100
        ).execute()
    except googleapiclient.errors.HttpError as e:
        if _is_quota_error(e):
            # Writing a "noComments" placeholder here would mark this video,
            # and every later one, as done, so a resumed run would never
            # retry them. Stop and leave them unrecorded.
            logging.error(f"Stopping at {video_id}: {e}")
            break
        # Any other HTTP error is recorded as a video without comments.
        logging.info(f"Encountered {e}")
        video_response = {"noComments": []}

    write_json({video_id: video_response})
    completed_requests.append(video_id)
    completed_lookup.add(video_id)
    logging.info(f"Wrote {video_id} to JSON")
    end = time.time()
    previous_time = end - start
    previous_id = video_id
    total_time += previous_time

# Finalise the bar when the loop stopped early; a no-op after normal completion.
p_bar.close()
