## Inspect data

In [16]:
import json
from constants import TEXTS_DIR, PERSPECTIVE_API_RESPONSE_DIR

In [12]:
# Show one raw document so we can see what the text files look like.
sample_text_file = TEXTS_DIR / '0000108-c17b018a21dba0c9c84e5f6dca782cbd.txt'
# Decode explicitly instead of relying on the platform default encoding: the
# documents contain non-ASCII punctuation (curly quotes/apostrophes).
# NOTE(review): assumes the corpus is stored as UTF-8 -- confirm.
sample_text_file.read_text(encoding='utf-8')

'WASHINGTON (Reuters) - Telecommunications giant AT&T struck back on Wednesday at Sen. Herb Kohl, chair of a congressional antitrust panel, saying the lawmaker’s concerns about a lack of competition in the wireless industry were unfounded.\n\nKohl wrote to the Justice Department’s top antitrust regulator Christine Varney and Federal Communications Commission Chairman Julius Genachowski on Monday to reiterate concerns over texting prices, large carriers failing to cooperate with smaller carriers to resolve roaming disputes, disputes over spectrum and deals that give one or another carrier exclusive access to popular phones like the iPhone.\n\nAT&T argued that cell service had become progressively cheaper, with revenue per minute falling 89 percent since 1994.\n\n“U.S. wireless prices are much lower than in any other major industrialized country,” wrote James Cicconi, an AT&T senior executive vice president.\n\nCicconi argued that texting prices had fallen because of package deals, “drop

In [13]:
# Load the Perspective API response stored for the same sample document
# and display the parsed JSON.
sample_response_file = PERSPECTIVE_API_RESPONSE_DIR / '0000108-c17b018a21dba0c9c84e5f6dca782cbd.txt.json'

sample_response = json.loads(sample_response_file.read_text())

sample_response

{'attributeScores': {'TOXICITY': {'spanScores': [{'begin': 0,
     'end': 2504,
     'score': {'value': 0.09871901, 'type': 'PROBABILITY'}}],
   'summaryScore': {'value': 0.09871901, 'type': 'PROBABILITY'}}},
 'languages': ['en'],
 'detectedLanguages': ['en']}

## Load into pandas

In [3]:
import pandas as pd
from pathlib import Path
import os
from constants import TOXICITY_SCORES_PICKLE

In [7]:
# Build (or reload from cache) a DataFrame with one row per document:
# the text filename, its Perspective API summary toxicity score, and the text.

if not TOXICITY_SCORES_PICKLE.exists():
    rows = []

    # iterdir() yields entries in arbitrary order; sort so the row order (and
    # therefore the DataFrame index) is the same every time the cache is rebuilt.
    for toxicity_file in sorted(PERSPECTIVE_API_RESPONSE_DIR.iterdir()):
        if toxicity_file.suffix != '.json':
            continue

        # Context manager closes each handle immediately instead of leaving
        # one open file per response for the garbage collector to clean up.
        with toxicity_file.open(encoding='utf-8') as f:
            toxicity_json = json.load(f)
        toxicity_score = toxicity_json['attributeScores']['TOXICITY']['summaryScore']['value']

        # Response files are named '<text filename>.json', so dropping the
        # final suffix recovers the name of the text file that was scored.
        text_filename = toxicity_file.stem
        assert text_filename.endswith('.txt'), toxicity_file.name
        text_file = TEXTS_DIR / text_filename
        # NOTE(review): assumes the text files are stored as UTF-8 -- confirm.
        text = text_file.read_text(encoding='utf-8')

        rows.append([text_filename, toxicity_score, text])

    df = pd.DataFrame(rows, columns=['filename', 'toxicity_score', 'text'])

    # Save a pickle for later
    df.to_pickle(TOXICITY_SCORES_PICKLE)
else:
    # NOTE(review): unpickling can execute arbitrary code; only load a cache
    # file that this notebook wrote itself.
    df = pd.read_pickle(TOXICITY_SCORES_PICKLE)

# Most toxic documents first.
sorted_df = df.sort_values(by=['toxicity_score'], ascending=False)

In [8]:
# Show only the highest-scoring documents rather than rendering the whole frame.
sorted_df.head(10)

Unnamed: 0,filename,toxicity_score,text
123084,0490610-99f159c7a4b9868addbda9199cb17cdf.txt,0.990670,"FUCK /u/SPEZ , YOU FUCKING WORTHLESS CUCK FUCK..."
85098,0672031-2e34d061fc3c5e59d8730115bf3ef6b5.txt,0.947267,From fucking fbombingmom:\n\nI’d like to submi...
85265,0015608-1d7d2627e8be693ee345f6c5a4a79786.txt,0.946782,"'No, Really, Fuck Every One Of You'\n\nWASHING..."
54476,0278354-b3bdb61b43e24d62f7a1cbc88c41598c.txt,0.946288,Air America's Randi Rhodes said suspended for ...
29478,0005265-30f6c1242730bf4f9ef4ac0a65b785a7.txt,0.927032,"Jang Moonbok, the “fuckboy” from ‘God Of Music..."
18701,0764561-709f4ecb3166356dd57ecac7e266f858.txt,0.921134,You probably heard about the deranged sorority...
57452,0027620-b27d673c1fb056b708559632f7f0ca9b.txt,0.907830,Evan Brunell is a baseball writer and the pres...
44406,0092718-0b244361bde2506bb51478de5777712b.txt,0.899773,\n\nDoes your vagina have a brand?\n\nLet your...
4740,0159720-0dffc558696a5a22303ef62e0faa1f83.txt,0.899565,feminismisimportantlove:\n\ndanny-dice:\n\nfem...
95491,0955335-decce7189212838edb4fae8130e9977e.txt,0.899428,Login to vote this up!\n\nSo my brother just l...


## Analyze with NLTK

In [10]:
from collections import Counter
import nltk
from nltk import ngrams

In [11]:
# Pull the document texts out of the frame as a plain Python list.
all_text = df['text'].tolist()

In [None]:
# Count how often each n-gram occurs across all documents.
n = 2  # n-gram size (2 = bigrams)
counter = Counter()

for text in all_text:
    # update() adds this document's counts in place. The previous
    # `counter += Counter(...)` also rescanned the entire accumulated counter
    # on every iteration (to drop non-positive entries), which made the loop
    # quadratic over a large corpus.
    # split() with no argument splits on any run of whitespace, so paragraph
    # breaks ('\n\n') are no longer glued inside tokens and repeated spaces
    # no longer produce empty-string tokens.
    counter.update(ngrams(text.split(), n=n))

In [9]:
# Show only the most frequent n-grams; the full counter is far too large to display.
counter.most_common(20)

NameError: name 'counter' is not defined