In [2]:
import pandas as pd

# Read only the columns this analysis uses from the bills CSV.
bill_columns = ["state", "bill_number", "title", "bill_text"]
df = pd.read_csv("crt_states.csv", usecols=bill_columns)
df.head()

Unnamed: 0,state,bill_number,title,bill_text
0,UT,HR0901,House Resolution on Critical Race Theory in Pu...,Enrolled Copy H.R. 901\n\n1 HOUSE RESOLUTION O...
1,UT,SR0901,Senate Resolution on Critical Race Theory in P...,Enrolled Copy S.R. 901\n\n1 SENATE RESOLUTION ...
2,IL,HR0365,CRITICAL RACE THEORY GRANTS,Illinois-2021-HR0365-Introduced\n\n\n\n\t \n\n...
3,IL,HB4066,CRITICAL RACE THEORY ACADEMY,Illinois-2021-HB4066-Introduced\n\n\n\n\t \n\n...
4,OK,HR1038,Resolution; discouraging schools from mandatin...,Resolution \n\nENROLLED HOUSE \n\nRESOLUTION N...


In [5]:
# Build a "<state>-<bill_number>" id for each bill (e.g. "UT-HR0901") and
# make it the index so bills can be looked up by id.
df = (
    df
    .assign(bill_id=df["state"] + "-" + df["bill_number"])
    .set_index("bill_id")
)
df.head()

Unnamed: 0_level_0,state,bill_number,title,bill_text
bill_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
UT-HR0901,UT,HR0901,House Resolution on Critical Race Theory in Pu...,Enrolled Copy H.R. 901\n\n1 HOUSE RESOLUTION O...
UT-SR0901,UT,SR0901,Senate Resolution on Critical Race Theory in P...,Enrolled Copy S.R. 901\n\n1 SENATE RESOLUTION ...
IL-HR0365,IL,HR0365,CRITICAL RACE THEORY GRANTS,Illinois-2021-HR0365-Introduced\n\n\n\n\t \n\n...
IL-HB4066,IL,HB4066,CRITICAL RACE THEORY ACADEMY,Illinois-2021-HB4066-Introduced\n\n\n\n\t \n\n...
OK-HR1038,OK,HR1038,Resolution; discouraging schools from mandatin...,Resolution \n\nENROLLED HOUSE \n\nRESOLUTION N...


In [6]:
# Size check: (number of bills, number of columns).
df.shape

(32, 4)

### STEP ONE: Vectorize!

* We used Tfidf because it makes common phrases less important, and bills are full of common stock phrases like "I present to the speaker"
* We used an `ngram_range` of (3, 6) - phrases of three to six words - because it seemed like a reasonable starting point. We can always change it later.

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Score every 3- to 6-word phrase in each bill with TF-IDF.
vectorizer = TfidfVectorizer(ngram_range=(3, 6))

# `matrix` is sparse: one row per bill, one column per phrase.
matrix = vectorizer.fit_transform(df.bill_text)

# Dense, labelled copy for inspection only; the sparse `matrix` is kept
# for later cells.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
words_df = pd.DataFrame(matrix.toarray(),
                        columns=vectorizer.get_feature_names_out(),
                        index=df.index)

words_df.head()

Unnamed: 0_level_0,000 000 from,000 000 from the,000 000 from the general,000 000 from the general fund,000 000 in,000 000 in fiscal,000 000 in fiscal year,000 000 in fiscal year 2021,000 000 is,000 000 is for,...,zoning commission or zoning board,zoning commission or zoning board of,zoning commission planning,zoning commission planning and,zoning commission planning and zoning,zoning commission planning and zoning commission,zoning districts when,zoning districts when an,zoning districts when an application,zoning districts when an application petition
bill_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
UT-HR0901,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UT-SR0901,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IL-HR0365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IL-HB4066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OK-HR1038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Let's narrow our pool of possible text reusers

* We'll use a similarity matrix!

In [51]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of the TF-IDF matrix
# (one row per bill), so entry [i, j] compares bill i with bill j.
similarities = cosine_similarity(matrix)

# Label both axes with the bill ids so scores can be looked up by bill.
similarity = pd.DataFrame(
    data=similarities,
    index=df.index,
    columns=df.index,
)

# axis=None shades against the whole table rather than per row or column.
similarity.style.background_gradient(axis=None)

bill_id,UT-HR0901,UT-SR0901,IL-HR0365,IL-HB4066,OK-HR1038,LA-HSR3,WV-SB618,SC-H4325,AL-HB11,MO-HB952,MI-SB0460,AL-HB8,MS-HR87,MS-HC62,MS-SR56,HI-SCR66,MA-H689,HI-SR48,ID-H0375,ID-H0377,WA-SB5194,DC-PR24-0018,CA-AB1322,HI-HR142,HI-HCR168,CO-HB1250,WA-HB1477,CA-AB473,MN-SF165,MN-SF2015,MN-HF6,CT-HB06666
bill_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
UT-HR0901,1.0,0.839337,0.011486,0.002986,0.011413,0.021928,0.020014,0.005316,0.003721,0.004916,0.00906,0.007892,0.023081,0.018142,0.014411,0.014205,0.001357,0.013598,0.003683,0.012157,0.001025,0.0,0.00153,0.011829,0.011779,0.00138,0.001679,0.000471,0.000502,0.000502,0.00064,0.00046
UT-SR0901,0.839337,1.0,0.008121,0.003006,0.004966,0.017115,0.02015,0.005352,0.003746,0.00495,0.009121,0.007946,0.010171,0.010113,0.021969,0.016955,0.000857,0.017034,0.002967,0.01151,0.001249,0.0,0.00154,0.007561,0.007529,0.00041,0.000692,0.000474,0.000483,0.000483,0.000236,0.000264
IL-HR0365,0.011486,0.008121,1.0,0.004469,0.004553,0.012002,0.002958,0.007223,0.000738,0.003488,0.001523,0.000972,0.0117,0.01128,0.008588,0.007858,0.001232,0.008404,0.001187,0.001314,0.001422,0.000229,0.000163,0.009121,0.008251,0.002035,0.000651,0.000705,0.001362,0.001362,0.00022,0.000215
IL-HB4066,0.002986,0.003006,0.004469,1.0,0.001107,0.003534,0.002705,0.003972,0.000994,0.002835,0.005397,0.002649,0.001544,0.001535,0.001369,0.000601,0.010241,0.000604,0.001414,0.001529,0.004038,0.000142,0.001464,0.000642,0.00064,0.011549,0.004079,0.003974,0.003456,0.003456,0.001591,0.008151
OK-HR1038,0.011413,0.004966,0.004553,0.001107,1.0,0.009895,0.05219,0.006388,0.00527,0.003582,0.007628,0.039869,0.053477,0.052227,0.04107,0.003575,0.001101,0.003028,0.0071,0.007662,0.003055,0.000951,0.000154,0.006771,0.006742,0.002874,0.001752,0.001999,0.000983,0.000983,0.000551,0.004891
LA-HSR3,0.021928,0.017115,0.012002,0.003534,0.009895,1.0,0.007097,0.008211,0.001714,0.005561,0.002275,0.001451,0.008293,0.006873,0.003993,0.008713,0.00346,0.007933,0.002749,0.003219,0.002062,0.0,0.002226,0.004445,0.004426,0.006024,0.002127,0.002091,0.001253,0.001253,0.001036,0.000693
WV-SB618,0.020014,0.02015,0.002958,0.002705,0.05219,0.007097,1.0,0.017056,0.017843,0.005264,0.040673,0.299935,0.075814,0.075379,0.073232,0.000765,0.003023,0.000768,0.008024,0.012794,0.003311,0.0,0.000424,0.000889,0.000886,0.008034,0.003352,0.004877,0.001389,0.001389,0.004085,0.009258
SC-H4325,0.005316,0.005352,0.007223,0.003972,0.006388,0.008211,0.017056,1.0,0.143832,0.010521,0.173697,0.020303,0.001029,0.001023,0.000976,0.000291,0.000452,0.000293,0.084678,0.140778,0.000639,0.0,0.00051,0.000339,0.000338,0.003778,0.000622,0.002742,0.001356,0.001356,0.001013,0.00103
AL-HB11,0.003721,0.003746,0.000738,0.000994,0.00527,0.001714,0.017843,0.143832,1.0,0.009483,0.077,0.150192,0.005191,0.005161,0.006106,0.0,0.000745,0.0,0.075092,0.108327,0.009795,0.0,0.001339,0.0,0.0,0.001431,0.002269,0.004687,0.001246,0.001246,0.001033,0.002191
MO-HB952,0.004916,0.00495,0.003488,0.002835,0.003582,0.005561,0.005264,0.010521,0.009483,1.0,0.006276,0.01505,0.000575,0.000572,0.000614,0.002835,0.00556,0.002848,0.010337,0.014784,0.009106,0.0,0.00601,0.003195,0.003182,0.007826,0.003549,0.006906,0.003312,0.003312,0.003116,0.00701


### ...and figure out what we want to use as our baseline

We'll use `AL-HB8` because it seems to have some lines in common with other bills

In [66]:
# Rank bills by similarity to AL-HB8, highest first, and show the top 20.
# AL-HB8 itself leads the list with a score of 1.0.
(
    similarity["AL-HB8"]
    .sort_values(ascending=False)
    .head(20)
)

bill_id
AL-HB8        1.000000
WV-SB618      0.299935
AL-HB11       0.150192
MS-HR87       0.057761
MS-HC62       0.057430
MS-SR56       0.055200
OK-HR1038     0.039869
ID-H0377      0.031492
MI-SB0460     0.028747
ID-H0375      0.020405
SC-H4325      0.020303
MO-HB952      0.015050
WA-SB5194     0.011008
UT-SR0901     0.007946
UT-HR0901     0.007892
WA-HB1477     0.004778
CA-AB473      0.004770
CO-HB1250     0.003440
CT-HB06666    0.003030
IL-HB4066     0.002649
Name: AL-HB8, dtype: float64

In [67]:
# Take the 15 bills most similar to AL-HB8, sorted by similarity,
# keep only their bill ids (they're the index),
# and turn those ids into a plain list.
most_similar = (
    similarity["AL-HB8"]
    .sort_values(ascending=False)
    .head(15)
    .index
    .tolist()
)
most_similar[:5]

['AL-HB8', 'WV-SB618', 'AL-HB11', 'MS-HR87', 'MS-HC62']

### Learn the phrases that are inside of AL-HB8

These are the phrases we want to highlight. The n-grams used for highlighting don't have to match the ones used for similarity: the similarity scores above used 3-6 word phrases, while down here we pull out 2-6 word phrases.

In [68]:
# Full text of the baseline bill; preview the first 100 characters.
bill_text = df.loc["AL-HB8", "bill_text"]
bill_text[:100]

'1 HB8\n\n2 214376-1\n\n3 By Representatives Pringle and Mooney\n\n4 RFD: Education Policy \n\n5 First Read: '

In [73]:
from sklearn.feature_extraction.text import CountVectorizer

# Find every run of 2 to 6 consecutive words in the baseline bill.
# The custom token_pattern keeps one-character tokens such as "a" or "I",
# which CountVectorizer's default pattern (two or more characters) drops.
vectorizer = CountVectorizer(ngram_range=(2,6), token_pattern='(?u)\\b\\w+\\b')

# Learn the phrases from this single bill
vectorizer.fit([bill_text])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Its replacement returns an array, so convert it back to a plain list of
# strings to keep `phrases` the same type as before.
phrases = list(vectorizer.get_feature_names_out())
phrases[:10]

['0 1',
 '0 1 214376',
 '0 1 214376 1',
 '0 1 214376 1 n',
 '0 1 214376 1 n 06',
 '01 2021',
 '01 2021 ahp',
 '01 2021 ahp cmg',
 '01 2021 ahp cmg lsa2021',
 '01 2021 ahp cmg lsa2021 1417']

### Use the highlighter

In [74]:
# You can always cut and paste this code as long as you have 'phrases'

import re

from rich.console import Console
from rich.highlighter import RegexHighlighter
from rich.theme import Theme

# You can run this code if you have a variable called
# "phrases" that is... the phrases you want to match.
class PhraseHighlighter(RegexHighlighter):
    """Highlight every whole-word occurrence of the strings in ``phrases``.

    One regex is built per phrase when the class is defined, so ``phrases``
    must already exist and later changes to it are not picked up.
    """
    base_style = "highlighter."
    # re.escape() makes each phrase match literally even if it contains regex
    # metacharacters. The \b anchors stop a phrase from matching inside longer
    # words (e.g. "is a" inside "this agency").
    highlights = [rf"(?P<phrase>\b{re.escape(phrase)}\b)" for phrase in phrases]

# The "phrase" group name plus base_style selects the "highlighter.phrase" style.
theme = Theme({"highlighter.phrase": "on pale_turquoise1"})

# NOTE: despite its name, `highlighter` is a rich Console that applies the
# PhraseHighlighter to everything it prints.
highlighter = Console(highlighter=PhraseHighlighter(), theme=theme)

In [75]:
# Punctuation deleted from each bill before highlighting:
# . , ; " “ ” ) (
strip_punctuation = str.maketrans("", "", '.,;"“”)(')

# Print each of the most similar bills through the highlighting console.
for doc in most_similar:
    # Lowercase first, then drop the punctuation in a single pass.
    content = df.loc[doc].bill_text.lower().translate(strip_punctuation)

    # Bill id as a banner, then the cleaned text, then a separator.
    highlighter.print(f"[bold white on purple]{doc}[/bold white on purple]")
    highlighter.print(content)
    print("\n\n\n\n--------\n\n\n\n")





--------










--------










--------










--------










--------










--------










--------










--------










--------










--------










--------










--------










--------










--------










--------






In [79]:
# Flag each bill with 1 if its text contains "is inherently superior"
# (case-insensitive, via lowercasing) and 0 otherwise.
superior_mask = df["bill_text"].str.lower().str.contains("is inherently superior")
df["contains_superior"] = superior_mask.astype(int)
df.head()

Unnamed: 0_level_0,state,bill_number,title,bill_text,contains_superior
bill_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
UT-HR0901,UT,HR0901,House Resolution on Critical Race Theory in Pu...,Enrolled Copy H.R. 901\n\n1 HOUSE RESOLUTION O...,1
UT-SR0901,UT,SR0901,Senate Resolution on Critical Race Theory in P...,Enrolled Copy S.R. 901\n\n1 SENATE RESOLUTION ...,1
IL-HR0365,IL,HR0365,CRITICAL RACE THEORY GRANTS,Illinois-2021-HR0365-Introduced\n\n\n\n\t \n\n...,0
IL-HB4066,IL,HB4066,CRITICAL RACE THEORY ACADEMY,Illinois-2021-HB4066-Introduced\n\n\n\n\t \n\n...,0
OK-HR1038,OK,HR1038,Resolution; discouraging schools from mandatin...,Resolution \n\nENROLLED HOUSE \n\nRESOLUTION N...,1


In [80]:
# Count bills per state, split by whether they contain the flagged phrase.
pd.crosstab(index=df["contains_superior"], columns=df["state"])

state,AL,CA,CO,CT,DC,HI,ID,IL,LA,MA,MI,MN,MO,MS,OK,SC,UT,WA,WV
contains_superior,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,0,2,1,1,1,4,0,2,1,1,0,3,1,0,0,0,0,2,1
1,2,0,0,0,0,0,2,0,0,0,1,0,0,3,1,1,2,0,0
