In [1]:
from googleapiclient import discovery
from pathlib import Path
import json
import time
from tqdm import tqdm

from constants import DATA_DIR, TEXTS_DIR, PERSPECTIVE_API_RESPONSE_DIR, PERSPECTIVE_API_KEY

In [2]:
# Perspective API attribute names requested for each analyzed text.
ATTRIBUTES = [
    'TOXICITY', 'SEVERE_TOXICITY',
    'IDENTITY_ATTACK', 'INSULT',
    'THREAT', 'PROFANITY',
    'SEXUALLY_EXPLICIT', 'FLIRTATION',
]

# Build the 'commentanalyzer' API client; the client object is generated
# dynamically from the service name and version.
service = discovery.build(
    'commentanalyzer',
    'v1alpha1',
    developerKey=PERSPECTIVE_API_KEY,
)

# All attributes can be found here: 
# https://github.com/conversationai/perspectiveapi/blob/master/api_reference.md#toxicity-models
def request_toxicity(text, response_path):
    """Analyze `text` with the comment analyzer service and return its response.

    Every name in ATTRIBUTES is requested with an empty configuration dict.
    `response_path` is accepted but not used by this function; nothing is
    written to disk here.
    """
    requested_attributes = {}
    for attribute in ATTRIBUTES:
        requested_attributes[attribute] = {}

    body = {
        'comment': {'text': text},
        'requestedAttributes': requested_attributes,
    }
    request = service.comments().analyze(body=body)
    return request.execute()

In [3]:
def response_file_for(text_file):
    """Return the response path for `text_file`.

    The path is the text file's name with '.json' appended, located inside
    PERSPECTIVE_API_RESPONSE_DIR.
    """
    return PERSPECTIVE_API_RESPONSE_DIR / (text_file.name + '.json')


# Map each text file that has no saved response yet to the path where its
# response should be stored.
text_response_pairs = (
    (text_file, response_file_for(text_file))
    for text_file in TEXTS_DIR.iterdir()
)
unrequested_files = {
    text_file: response_file
    for text_file, response_file in text_response_pairs
    if not response_file.exists()
}

In [8]:
PERSPECTIVE_API_FAILURES = DATA_DIR / 'perspective_api_failures.txt'

# The failures file lists the names of text files whose request failed,
# separated by whitespace. It is only created when a failure is appended, so a
# missing file simply means no failures have been recorded yet.
if PERSPECTIVE_API_FAILURES.exists():
    failure_names = PERSPECTIVE_API_FAILURES.read_text().split()
else:
    failure_names = []
failure_text_files = set(TEXTS_DIR / filename for filename in failure_names)

# Remove previously failed texts from the pending work so they are not retried.
# An explicit membership test replaces the former bare `except:`, which would
# also have swallowed unrelated errors and keyboard interrupts.
for failure_file in failure_text_files:
    if failure_file in unrequested_files:
        del unrequested_files[failure_file]
    else:
        print(failure_file, 'not in unrequested files')

data/texts/0730921-98339e0384e794bd136e80299edd35f6.txt not in unrequested files
data/texts/0587177-c26174192fb0db31d086ed377e82fde7.txt not in unrequested files
data/texts/0796570-74e57fde3604254896788e61896a5baf.txt not in unrequested files
data/texts/0850053-ca81a29877308292ac8f2eb3f62415da.txt not in unrequested files
data/texts/0709609-01857eb399a9cb5018a9d17e5db660ee.txt not in unrequested files
data/texts/0430176-d4cd0f49fc8be4a6d277aeb71519ee41.txt not in unrequested files
data/texts/0011458-c25a122bad5473f5394b4005ed9f9c1a.txt not in unrequested files
data/texts/0643981-8dcf235402e68aed68a9579741c4299f.txt not in unrequested files
data/texts/0807943-014f40da5661e197caad6eded9b313a6.txt not in unrequested files
data/texts/0120828-e3ed9a0a359dcc7ff96c68807a7f5f24.txt not in unrequested files
data/texts/0596284-37b5a424395508cab25fc5d3143504a0.txt not in unrequested files
data/texts/0297357-7e206fa9fc4f77285f8da6b43ae26538.txt not in unrequested files
data/texts/0069738-72fabadfd

In [None]:
# Pause between requests because the API rate-limits calls.
PERSPECTIVE_API_SLEEP_SECONDS = 1
# Texts longer than this are skipped without a request.
# NOTE(review): this is compared against len(text), i.e. a character count;
# confirm whether the API limit is expressed in bytes.
PERSPECTIVE_API_LEN_LIMIT = 20480

for text_file, response_file in tqdm(unrequested_files.items()):
    text = text_file.read_text()
    if len(text) > PERSPECTIVE_API_LEN_LIMIT:
        continue

    try:
        response = request_toxicity(text, response_file)
        # Serialize first, then write to `response_file` (the original code
        # referenced an undefined `response_path`, so nothing was ever saved).
        # Serializing before the file is created means a failed dump cannot
        # leave an empty response file that would mark this text as done.
        response_file.write_text(json.dumps(response))
    except Exception:
        # Record the failed text so a later run can exclude it. `Exception`
        # (rather than a bare except) lets KeyboardInterrupt stop the loop.
        with PERSPECTIVE_API_FAILURES.open('a') as f:
            print(text_file.name, file=f)

    # Throttle after every attempted request, successful or not.
    time.sleep(PERSPECTIVE_API_SLEEP_SECONDS)

  0%|          | 4278/7848977 [07:00<2295:34:39,  1.05s/it]