In [None]:
import json
import time
from pathlib import Path

from googleapiclient import discovery
from tqdm import tqdm

from constants import DATA_DIR, TEXTS_DIR, PERSPECTIVE_API_RESPONSE_DIR, PERSPECTIVE_API_KEY


In [None]:
# Attribute names used as the keys of 'requestedAttributes' in every analyze
# request built by request_toxicity.
ATTRIBUTES = [
    'TOXICITY',
    'SEVERE_TOXICITY',
    'IDENTITY_ATTACK',
    'INSULT',
    'THREAT',
    'PROFANITY',
    'SEXUALLY_EXPLICIT',
    'FLIRTATION'
]

# Generates API client object dynamically based on service name and version.
# The client is created once at module level and shared by every request.
# NOTE(review): recent google-api-python-client releases may additionally need
# discoveryServiceUrl=... and static_discovery=False for this API -- confirm
# against the installed client version.
service = discovery.build('commentanalyzer', 'v1alpha1', developerKey=PERSPECTIVE_API_KEY)

# All attributes can be found here: 
# https://github.com/conversationai/perspectiveapi/blob/master/api_reference.md#toxicity-models
def request_toxicity(text):
    """Ask the comment analyzer service to score ``text``.

    Every name in ``ATTRIBUTES`` is requested, each with an empty (default)
    configuration.

    Args:
        text: The comment text to analyse.

    Returns:
        Whatever ``execute()`` on the analyze request returns; the request
        loop in this notebook serialises it with ``json``.
    """
    # One fresh, empty config dict per requested attribute.
    requested_attributes = {}
    for attribute_name in ATTRIBUTES:
        requested_attributes[attribute_name] = {}

    payload = {
        'comment': {'text': text},
        'requestedAttributes': requested_attributes,
    }

    pending_call = service.comments().analyze(body=payload)
    return pending_call.execute()

In [None]:
def response_file_for(text_file, chunk_num=None):
    """Return the path of the JSON response file belonging to ``text_file``.

    Args:
        text_file: Path-like object exposing a ``name`` attribute (this
            notebook passes entries of ``TEXTS_DIR``).
        chunk_num: Index of the chunk the response belongs to, or ``None``
            when the text was sent as a single request. The request loop
            passes indices from ``enumerate``, so they start at 0.

    Returns:
        ``PERSPECTIVE_API_RESPONSE_DIR / '<name>.json'`` when ``chunk_num`` is
        ``None``, otherwise
        ``PERSPECTIVE_API_RESPONSE_DIR / '<name>.chunk-<chunk_num>.json'``.
    """
    # Identity check rather than ``==``: only the None singleton itself means
    # "no chunk", regardless of how the argument implements equality.
    if chunk_num is None:
        response_filename = text_file.name + '.json'
    else:
        response_filename = f'{text_file.name}.chunk-{chunk_num}.json'
    return PERSPECTIVE_API_RESPONSE_DIR / response_filename

In [None]:
# Log of text file names whose request failed, one name per line.
PERSPECTIVE_API_FAILURES = DATA_DIR / 'perspective_api_failures.txt'

# A text is pending when no response has been saved for it yet. Texts that
# were split into several requests are saved as '<name>.chunk-<i>.json' and
# never as '<name>.json', so the first chunk's file is checked as well;
# otherwise every chunked text would be requested again on each run.
# NOTE(review): only chunk 0 is checked -- a run interrupted part-way through
# a chunked text leaves the later chunks missing without being noticed here.
pending_files = {
    text_file
    for text_file in TEXTS_DIR.iterdir()
    if not response_file_for(text_file).exists()
    and not response_file_for(text_file, chunk_num=0).exists()
}

# In this notebook the failure log is only written when a request fails, so
# it may not exist yet (e.g. on a first run); treat that as "no failures".
if PERSPECTIVE_API_FAILURES.exists():
    # One name per line; splitting on lines (not on any whitespace) keeps
    # file names that contain spaces intact. Blank lines are ignored.
    failure_text_filenames = [
        line.strip()
        for line in PERSPECTIVE_API_FAILURES.read_text().splitlines()
        if line.strip()
    ]
else:
    failure_text_filenames = []
failure_text_files = set(TEXTS_DIR / filename for filename in failure_text_filenames)

# Remove all failed downloads from pending files
pending_files -= failure_text_files

In [None]:
from typing import List
from math import ceil

def chunk_text(text: str, chunk_len: int) -> List[str]:
    """Split ``text`` into consecutive pieces of at most ``chunk_len`` characters.

    Args:
        text: The string to split.
        chunk_len: Maximum number of characters per piece; must be positive.

    Returns:
        The pieces in their original order. Every piece except possibly the
        last has exactly ``chunk_len`` characters, and concatenating the
        pieces gives back ``text``. An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_len`` is zero or negative.
    """
    if chunk_len <= 0:
        # A negative step would make range() empty and silently drop the
        # whole text; a zero step is rejected by range() itself.
        raise ValueError(f'chunk_len must be positive, got {chunk_len}')
    return [text[start:start + chunk_len] for start in range(0, len(text), chunk_len)]


# Test chunking
assert len(chunk_text("x" * 2048, 2048)) == 1
assert len(chunk_text("x" * 2049, 2048)) == 2
assert chunk_text("x" * 2049, 2048)[1] == 'x'
assert len(chunk_text("x" * 100, 2048)) == 1
assert chunk_text("x" * 100, 2048)[0] == "x" * 100
assert chunk_text("", 2048) == []

In [None]:
# Pause (in seconds) after every request, because the API rate-limits clients.
PERSPECTIVE_API_SLEEP_SECONDS = 1
# Maximum number of characters sent in a single request.
# NOTE(review): the API's size limit is presumably expressed in bytes; text
# with non-ASCII characters chunked by character count could still exceed
# it -- confirm against the API documentation.
PERSPECTIVE_API_LEN_LIMIT = 20480

for text_file in tqdm(pending_files):
    full_text = text_file.read_text()
    # An empty text produces no chunks, hence no request and no response file.
    chunks = chunk_text(full_text, PERSPECTIVE_API_LEN_LIMIT)
    is_chunked = len(chunks) > 1

    for i, text in enumerate(chunks):
        # Only texts that needed splitting get the '.chunk-<i>' suffix.
        if is_chunked:
            response_file = response_file_for(text_file, chunk_num=i)
        else:
            response_file = response_file_for(text_file)

        try:
            response = request_toxicity(text)
            # Serialise completely before opening the file, so a
            # serialisation error cannot leave a truncated response behind.
            response_file.write_text(json.dumps(response))
        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt and
            # SystemExit must stop the run instead of being recorded as an
            # API failure. Any other error is logged and the loop goes on.
            with PERSPECTIVE_API_FAILURES.open('a') as f:
                print(text_file.name, file=f)

        # Pause after each request (successful or not) due to rate limiting by API
        time.sleep(PERSPECTIVE_API_SLEEP_SECONDS)