In [3]:
import pandas as pd
from sqlalchemy import create_engine

from constants import DATA_DIR, TEXTS_DIR

In [4]:
# Fixed counts recorded for this notebook. The names suggest the total number
# of documents and of span-score rows -- TODO confirm against the database.
NUM_DOCS = 7_770_363
NUM_SPAN_SCORES = 76_221_963

In [5]:
# Location of the SQLite file holding the response tables, under DATA_DIR.
db_path = DATA_DIR / 'perspective-responses-v2.db'

# SQLAlchemy engine shared by every query in this notebook; statement echoing
# is kept off so cell outputs are not flooded with SQL logs.
engine = create_engine(
    f'sqlite:///{db_path}',
    echo=False,
)

In [10]:
# TODO: use some percent of total documents, or percentiles
# Number of rows pulled from each end of the toxicity ranking.
EXTREME_DOC_LIMIT = 10000


def _toxicity_ranked_query(direction: str, limit: int = EXTREME_DOC_LIMIT) -> str:
    """Build the SQL that selects `limit` rows of `responses` ranked by toxicity.

    Args:
        direction: 'ASC' for the lowest-toxicity rows, 'DESC' for the highest.
        limit: Maximum number of rows to return.

    Returns:
        The SQL text, ready to hand to `pd.read_sql`.

    Raises:
        ValueError: If `direction` is neither 'ASC' nor 'DESC'.
    """
    if direction not in ('ASC', 'DESC'):
        raise ValueError(f"direction must be 'ASC' or 'DESC', got {direction!r}")
    return f"""
    SELECT *
    FROM responses
    ORDER BY toxicity {direction}
    LIMIT {int(limit)}
    """


# NOTE(review): SQLite sorts NULLs before all other values under ASC, so any
# rows with a NULL toxicity (if such rows exist) would land in low_docs -- confirm.
low_doc_query = _toxicity_ranked_query('ASC')
low_docs = pd.read_sql(low_doc_query, con=engine)

high_doc_query = _toxicity_ranked_query('DESC')
high_docs = pd.read_sql(high_doc_query, con=engine)

In [29]:
def create_df(doc_predicate: str, sent_predicate: str, con=None) -> pd.DataFrame:
    """Pair document-level toxicity with the toxicity of each document's first span.

    Rows of ``responses`` whose ``toxicity`` satisfies ``doc_predicate`` are
    inner-joined to ``span_scores`` on ``filename``. Only spans with
    ``begin = 0`` whose ``toxicity`` satisfies ``sent_predicate`` are kept.

    Args:
        doc_predicate: SQL fragment placed directly after ``WHERE toxicity``
            in the document sub-query, e.g. ``'< 0.25'``.
        sent_predicate: SQL fragment placed directly after
            ``span_scores.toxicity``, e.g. ``'> 0.75'``.
        con: Connection or engine handed to ``pd.read_sql``. When ``None``
            (the default) the notebook-level ``engine`` is used, which is the
            behaviour callers had before this parameter existed.

    Returns:
        A DataFrame with the columns ``filename``, ``doc_toxicity``,
        ``prompt_toxicity`` and ``prompt_end``.

    Warning:
        Both predicates are interpolated into the SQL text verbatim, so pass
        only trusted, hand-written fragments -- never user-supplied input.
    """
    # Sub-query: the documents whose overall toxicity matches the predicate.
    doc_query = f"""
        SELECT *
        FROM responses
        WHERE toxicity {doc_predicate}
        """

    # Join those documents to their span starting at offset 0 and filter that
    # span's score with the second predicate.
    sent_query = f"""
        WITH
            docs AS ({doc_query})
        SELECT
            docs.filename AS filename,
            docs.toxicity AS doc_toxicity,
            span_scores.toxicity AS prompt_toxicity,
            span_scores.end AS prompt_end
        FROM
            span_scores
                INNER JOIN
            docs
                ON span_scores.filename = docs.filename
        WHERE
            span_scores.begin = 0
            AND span_scores.toxicity {sent_predicate}
        """

    # Resolve the global lazily so an explicit `con` works even when the
    # notebook-level engine has not been created.
    connection = engine if con is None else con
    return pd.read_sql(sent_query, con=connection)

In [37]:
# Documents scoring below 0.25 whose span starting at offset 0 also scores below 0.25.
low_doc_low_sent = create_df(
    doc_predicate='< 0.25',
    sent_predicate='< 0.25',
)
low_doc_low_sent

Unnamed: 0,filename,doc_toxicity,prompt_toxicity,prompt_end
0,0922809-2567130f37a7e2ba55581aa177b6e3c9.txt,0.152441,0.062290,271
1,0776249-6b69e163629d0603a2e57c0af9b77128.txt,0.148363,0.071782,16
2,0534310-f4ebb69ec91d1cef1ce46e57431b29cc.txt,0.154141,0.034387,184
3,0588122-39c89eab36c8e7ffbb186065859a61ea.txt,0.148574,0.117720,44
4,0681169-81bd1fdea6a7519882a8312e003528fe.txt,0.222260,0.060871,35
...,...,...,...,...
5901758,0770461-5779d7193866f1db1eeaccac0b11024c.txt,0.095454,0.014405,91
5901759,0358637-5c1732a14c7f147353e520611de4e049.txt,0.148836,0.031205,108
5901760,0737961-27ebdbbd8c8797e05f44e3b4f2580fb1.txt,0.196003,0.193898,185
5901761,0032554-1be5849a1bae64dec2e4e041a1f8b343.txt,0.120145,0.029190,205


In [38]:
# Documents scoring below 0.25 whose span starting at offset 0 scores above 0.75.
low_doc_high_sent = create_df(
    doc_predicate='< 0.25',
    sent_predicate='> 0.75',
)
low_doc_high_sent

Unnamed: 0,filename,doc_toxicity,prompt_toxicity,prompt_end
0,0284960-0f7e0ce97c71a655e09240ae657ca8a6.txt,0.106356,0.762357,105
1,0444801-84ccb6202ee0367fbbe80a225bf9299f.txt,0.209305,0.904547,63
2,0832153-708f34aae2c78e215ea6ffa4cda54f07.txt,0.248958,0.827726,61
3,0247661-681dd838e62afaa6ca053ae77bd137a2.txt,0.124193,0.784712,65
4,0487353-322719d66806573da75e8d4faf07e107.txt,0.208196,0.760951,52
...,...,...,...,...
1531,0273398-808f8b397edb1a2cb846deaa150ef55d.txt,0.211549,0.760554,90
1532,0966639-4d16b071d31567406772d83604b2b6f4.txt,0.036928,0.763508,70
1533,0416149-70f818d9a5ce5ff38703cbe9903b66ab.txt,0.231821,0.752768,122
1534,0587027-6a9720e7f584d2d508479d122e8983bb.txt,0.247755,0.802834,26


In [39]:
# Documents scoring above 0.75 whose span starting at offset 0 scores below 0.25.
high_doc_low_sent = create_df(
    doc_predicate='> 0.75',
    sent_predicate='< 0.25',
)
high_doc_low_sent

Unnamed: 0,filename,doc_toxicity,prompt_toxicity,prompt_end
0,0429481-8189b21b0b7109dde0b13043513e2c40.txt,0.766460,0.086464,72
1,0481414-d752bb13b5dde11f32b807ee7f7c7952.txt,0.760561,0.246351,127
2,0436823-8ef3da0a4302916ab85f7827b1a3c0d7.txt,0.753220,0.201156,123
3,0544771-f113947d1f3546ba45c71c0e1ba356ba.txt,0.802834,0.145544,141
4,0475845-dce5ba1b539a93d9947f5b7c03dafcc7.txt,0.778365,0.053839,32
...,...,...,...,...
6524,0398227-88bf6adb383ca5de7d2d8e393bcb92cc.txt,0.793155,0.209243,88
6525,0756726-0a5448e212a5e22c27fc4ed5faa43fc6.txt,0.801125,0.226898,181
6526,0629454-62f05069cdf97ea1cef8faf988f2bd48.txt,0.781846,0.129234,166
6527,0550120-8ecb793b66dabb368967f61f618c4dee.txt,0.760600,0.013617,11


In [40]:
# Documents scoring above 0.75 whose span starting at offset 0 also scores above 0.75.
high_doc_high_sent = create_df(
    doc_predicate='> 0.75',
    sent_predicate='> 0.75',
)
high_doc_high_sent

Unnamed: 0,filename,doc_toxicity,prompt_toxicity,prompt_end
0,0708959-08f364932a934893e2db24b09305727e.txt,0.766442,0.903097,45
1,0717999-e411153278dc31e36c5e25acd0e051cc.txt,0.837749,0.956581,74
2,0636272-0cf2fa5fa5be28dab44e7c761a83b9c6.txt,0.837786,0.871854,207
3,0803235-5487606b1e5e330a223a52c3bb1e860b.txt,0.761019,0.920361,98
4,0533308-e782406013b83e7fad5989e988222cdd.txt,0.775598,0.929886,107
...,...,...,...,...
1971,0637433-ffa8e06fb892df7c5e76319219458e4f.txt,0.811586,0.871854,67
1972,0380512-4ee8582c6c34793b3379a83bcec87bc3.txt,0.766478,0.906591,153
1973,0957074-32a46d2c2530fbebeb3c96842e6f3e12.txt,0.775071,0.961841,103
1974,0844784-d4a89edf7ccbfc1f8dfb14af70582c69.txt,0.763445,0.766209,206
