In [None]:
#set up packages
import os
import time

import pandas as pd
import requests
from dotenv import load_dotenv
from googleapiclient import discovery
from tqdm import tqdm

tqdm.pandas()

In [None]:
# Provide your own Perspective API key through the environment.
# load_dotenv() loads variables from a .env file into the process
# environment; the key is then read from the variable named "api_key".
load_dotenv()
api_key = os.environ.get("api_key")

# Fail fast: without a key every API request would be rejected, and the
# scoring cells below would silently produce nothing but missing scores.
if not api_key:
    raise RuntimeError(
        'No Perspective API key found: set "api_key" in the environment or in a .env file.'
    )

In [None]:
# Load the comment data (CSV file in the working directory) into a
# pandas DataFrame named `df`.
df = pd.read_csv(filepath_or_buffer='data_w_roberta_score.csv')

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
def perspective_score(text):
    """
    Get Perspective API toxicity and identity-attack scores for one text.

    The request is authenticated with the module-level ``api_key``.

    Parameters:
    - text (str): The text you want scored.

    Returns:
    - dict or None: ``{'TOXICITY': float, 'IDENTITY_ATTACK': float}`` holding the
      summary score of each attribute, or None if the text is not a non-empty
      string, the request fails, or the response lacks either score.
    """
    # Missing values (e.g. NaN read from a CSV) and blank strings cannot be
    # scored; skip them without spending an API call.
    if not isinstance(text, str) or not text.strip():
        return None

    api_url = "https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze"
    params = {'key': api_key}
    attributes = ('TOXICITY', 'IDENTITY_ATTACK')
    data = {
        'comment': {'text': text},
        'languages': ['en'],
        'requestedAttributes': {name: {} for name in attributes}
    }

    try:
        # The timeout keeps a stalled connection from hanging the notebook.
        response = requests.post(api_url, params=params, json=data, timeout=30)
        if response.status_code != 200:
            return None
        attribute_scores = response.json()['attributeScores']
        return {name: attribute_scores[name]['summaryScore']['value'] for name in attributes}
    except (requests.RequestException, KeyError, TypeError, ValueError):
        # Network failure, undecodable body, or a response missing a score.
        return None
    finally:
        # Pause after every request (successful or not) to avoid exceeding
        # rate limits when this function is applied to many rows.
        time.sleep(1)


In [None]:
def perspective_score_batch(chunk, text_column='text_string'):
    """
    Get Perspective API scores for a batch of texts, one request per text.

    The requests are authenticated with the module-level ``api_key``.

    Parameters:
    - chunk: A dataframe (or any mapping of column name to iterable) holding
      the texts to analyze.
    - text_column (str): Name of the column in ``chunk`` that contains the
      text. Defaults to 'text_string'.

    Returns:
    - list: One entry per text, in order. Each entry is a dictionary with
      'TOXICITY' and 'IDENTITY_ATTACK' scores, or None if the text is not a
      non-empty string, the request failed, or the response lacked a score.
    """
    api_url = "https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze"
    params = {'key': api_key}
    attributes = ('TOXICITY', 'IDENTITY_ATTACK')

    scores = []

    for text in tqdm(chunk[text_column]):
        # Missing values (e.g. NaN) and blank strings cannot be scored; record
        # None so the output stays aligned with the input, without an API call.
        if not isinstance(text, str) or not text.strip():
            scores.append(None)
            continue

        data = {
            'comment': {'text': text},
            'languages': ['en'],
            'requestedAttributes': {name: {} for name in attributes}
        }

        score = None
        try:
            # The timeout keeps one stalled connection from hanging the batch.
            response = requests.post(api_url, params=params, json=data, timeout=30)
            if response.status_code == 200:
                attribute_scores = response.json()['attributeScores']
                score = {name: attribute_scores[name]['summaryScore']['value'] for name in attributes}
        except (requests.RequestException, KeyError, TypeError, ValueError):
            # Network failure, undecodable body, or a response missing a
            # score: keep None for this text instead of losing the batch.
            score = None
        scores.append(score)

        # Introduce a pause to avoid exceeding rate limits
        time.sleep(1)

    return scores

In [None]:
#split the dataframe into chunks for batch requests
chunk_size = 5000
chunks = [df.iloc[i:i + chunk_size] for i in range(0, df.shape[0], chunk_size)]

#initialize a list to store the scores
all_scores = []

#collect score for each chunk
for chunk in chunks:
    scores = perspective_score_batch(chunk)
    all_scores.extend(scores)

#add the scores to the dataframe
# perspective_score_batch returns None for texts it could not score, and
# pd.DataFrame() cannot build a frame from a list mixing dicts and None.
# Build each column explicitly instead, using NaN for the missing scores.
# Assigning plain lists is positional, so row i always receives the score of
# text i regardless of the dataframe's index labels.
missing_score = float('nan')
df['toxicity_score'] = [
    score['TOXICITY'] if score else missing_score for score in all_scores
]
df['identity_attack_score'] = [
    score['IDENTITY_ATTACK'] if score else missing_score for score in all_scores
]

In [None]:
# Preview the first five rows, now including the score columns.
df.head(n=5)

In [None]:
#Export to csv
# to_csv does not create missing folders; make sure 'data/' exists first so
# the scored results are not lost to a failed write.
os.makedirs('data', exist_ok=True)
df.to_csv('data/final_data.csv', index=False)