In [29]:
from googleapiclient import discovery
from pathlib import Path
import json
import time
from tqdm import tqdm

from constants import DATA_DIR, TEXTS_DIR, PERSPECTIVE_API_RESPONSE_DIR, PERSPECTIVE_API_KEY

In [46]:
# Names of the Perspective API attributes requested for every text,
# one per line so that adding or removing an attribute is a one-line diff.
ATTRIBUTES = [
    'TOXICITY',
    'SEVERE_TOXICITY',
    'IDENTITY_ATTACK',
    'INSULT',
    'THREAT',
    'PROFANITY',
    'SEXUALLY_EXPLICIT',
    'FLIRTATION',
]

# Module-level API client, built once from the service name and version
# below and authenticated with the developer key imported from `constants`.
service = discovery.build(
    'commentanalyzer',
    'v1alpha1',
    developerKey=PERSPECTIVE_API_KEY,
)

# All attributes can be found here: 
# https://github.com/conversationai/perspectiveapi/blob/master/api_reference.md#toxicity-models
def request_toxicity(text, response_path):
    """Send ``text`` to the comment analyzer and return the executed result.

    Every name in ``ATTRIBUTES`` is requested with an empty configuration
    dict. The call goes through the module-level ``service`` client.

    Args:
        text: The comment text placed in the request body.
        response_path: Accepted for the caller's convenience; this function
            does not read or write it.

    Returns:
        Whatever ``service.comments().analyze(...).execute()`` returns.
    """
    requested = dict((attr, {}) for attr in ATTRIBUTES)
    payload = {
        'comment': {'text': text},
        'requestedAttributes': requested,
    }

    analysis = service.comments().analyze(body=payload)
    return analysis.execute()

In [27]:
def response_file_for(text_file):
    """Return the response path that corresponds to ``text_file``.

    The result lives directly under ``PERSPECTIVE_API_RESPONSE_DIR`` and is
    named after ``text_file.name`` with ``'.json'`` appended (the original
    suffix, if any, is kept in front of it).
    """
    return PERSPECTIVE_API_RESPONSE_DIR / (text_file.name + '.json')


# Map every entry of TEXTS_DIR that has no saved response yet to the path
# where its response should be written. Entries whose response file already
# exists are left out, so re-running the notebook resumes where it stopped.
unrequested_files = {}
for text_file in TEXTS_DIR.iterdir():
    response_file = response_file_for(text_file)
    if not response_file.exists():
        unrequested_files[text_file] = response_file

In [None]:
# Pause after each request; the API rate-limits clients.
PERSPECTIVE_API_SLEEP_SECONDS = 1
# Texts with more than this many characters are skipped without a request.
# NOTE(review): the comparison below uses len(text), i.e. characters --
# confirm whether the API's size limit is counted in bytes instead.
PERSPECTIVE_API_LEN_LIMIT = 20480
# The name of every text file whose request or save failed is appended here,
# one name per line.
PERSPECTIVE_API_FAILURES = DATA_DIR / 'perspective_api_failures.txt'

for text_file, response_file in tqdm(unrequested_files.items()):
    text = text_file.read_text()
    if len(text) > PERSPECTIVE_API_LEN_LIMIT:
        # Over-long texts are skipped silently (no request, no sleep).
        continue

    try:
        response = request_toxicity(text, response_file)
        # Serialize first: if this raises, no empty/partial response file is
        # left behind to be mistaken for a completed request on the next run.
        serialized = json.dumps(response)
        # Write to `response_file` (the loop variable). The previous code
        # referenced an undefined `response_path`, so every iteration raised
        # NameError and was recorded as a failure without saving anything.
        with response_file.open('w') as f:
            f.write(serialized)
    except Exception:
        # Best-effort: record the failing file and keep going. `Exception`
        # (not a bare `except`) lets KeyboardInterrupt/SystemExit stop the loop.
        with PERSPECTIVE_API_FAILURES.open('a') as f:
            print(text_file.name, file=f)

    # Sleep between requests due to rate limiting by the API.
    time.sleep(PERSPECTIVE_API_SLEEP_SECONDS)



  0%|          | 0/7853819 [00:00<?, ?it/s][A[A