In [6]:
import pandas as pd

# Load the bill corpus from the CSV export sitting next to this notebook
df = pd.read_csv(filepath_or_buffer="bills.csv")

# Preview the first five rows to confirm the file parsed as expected
df.head(n=5)

Unnamed: 0,bill_name,bill_text
0,Louisiana-HB167,ENROLLED\n\nACT No. 2512020 Regular Session\nH...
1,izona-SB1003,8/2/2021 Arizona-2021-SB1003-Chaptered\n\nhttp...
2,kansas-HB1715,HB1715 as engrossed on 03-29-2021 11:52:30\n\n...
3,Wyoming-HB0075,21LSO-0192\n\n\nORIGINAL HOUSE ENGROSSED\nBILL...
4,Kansas hb2183,Senate Substitute for HOUSE BILL No. 2183\n\nA...


In [7]:
# Use the bill name as the row label so bills can be looked up by name.
# Guarded so that re-running this cell is a no-op instead of a KeyError:
# after the first run `bill_name` is the index and no longer a column.
if df.index.name != 'bill_name':
    df = df.set_index('bill_name')

# Trim stray leading/trailing whitespace from the labels (at least one name in
# this data set ends with a space), so lookups by the visible name succeed.
df.index = df.index.str.strip()

df.head()

Unnamed: 0_level_0,bill_text
bill_name,Unnamed: 1_level_1
Louisiana-HB167,ENROLLED\n\nACT No. 2512020 Regular Session\nH...
izona-SB1003,8/2/2021 Arizona-2021-SB1003-Chaptered\n\nhttp...
kansas-HB1715,HB1715 as engrossed on 03-29-2021 11:52:30\n\n...
Wyoming-HB0075,21LSO-0192\n\n\nORIGINAL HOUSE ENGROSSED\nBILL...
Kansas hb2183,Senate Substitute for HOUSE BILL No. 2183\n\nA...


In [8]:
# Sanity check: (number of rows, number of columns) of the frame
df.shape

(27, 1)

### STEP ONE: Vectorize!

* We used Tfidf because it makes common phrases less important, and bills are full of common stock phrases like "I present to the speaker"
* We used ngram_range of (2, 5) because... it seemed right? We can always change it later.

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over 2- to 5-word phrases
vectorizer = TfidfVectorizer(ngram_range=(2, 5))

# Fit on the bill texts and build the document-term matrix (one row per bill)
matrix = vectorizer.fit_transform(df.bill_text)

# `get_feature_names()` was removed in scikit-learn 1.2. Prefer its replacement
# and only fall back to the old name on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense, labelled copy of the matrix purely for eyeballing; the similarity
# computation further down works on `matrix` directly.
words_df = pd.DataFrame(matrix.toarray(),
                        columns=feature_names,
                        index=df.index)

words_df.head()

Unnamed: 0_level_0,00 1241,00 1241 election,00 1241 election day,00 1241 election day the,00 23,00 23 for,00 23 for those,00 23 for those persons,00 and,00 and before,...,zone during polling hours,zone during polling hours 1238,zone regardless,zone regardless 21,zone regardless 21 of,zone regardless 21 of the,zone such,zone such as,zone such as 1231,zone such as 1231 but
bill_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Louisiana-HB167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
izona-SB1003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
kansas-HB1715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Wyoming-HB0075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Kansas hb2183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Let's narrow our pool of possible text reusers

* We'll use a similarity matrix!

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of the TF-IDF matrix
similarities = cosine_similarity(matrix)

# Wrap the raw array in a DataFrame labelled by bill name on both axes
bill_names = df.index
similarity = pd.DataFrame(data=similarities, index=bill_names, columns=bill_names)

# Colour the whole table on one shared scale (axis=None) rather than per row/column
styled_similarity = similarity.style.background_gradient(axis=None)
styled_similarity

bill_name,Louisiana-HB167,izona-SB1003,kansas-HB1715,Wyoming-HB0075,Kansas hb2183,izona-SB1485,vada-SB84,Indiana-SB0398,Georgia-SB202,kansas-HB1244,bama-HB538,Florida S.B. 90,Montana-HB176,Iowa S.F. 413,izona-SB1819,bama-HB285,Oklahoma HB2663,Montana HB 530,Idaho HB 290,Utah-HB0012,Kentucky-HB574,Montana-SB169,Kansas hb2332,Texas-HB3920,w Hampshire-HB523,kansas-SB643,Iowa-SF568
bill_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
Louisiana-HB167,1.0,0.006751,0.011236,0.014557,0.017098,0.018081,0.009466,0.027481,0.02967,0.007427,0.017766,0.021328,0.012674,0.011049,0.041883,0.015587,0.022554,0.015183,0.014878,0.010724,0.036748,0.010123,0.022713,0.008543,0.004133,0.014297,0.019755
izona-SB1003,0.006751,1.0,0.016144,0.008682,0.024267,0.278811,0.005798,0.04294,0.038516,0.005236,0.030675,0.022188,0.026966,0.025934,0.049443,0.010036,0.022991,0.011321,0.013442,0.013029,0.046015,0.023123,0.024927,0.010172,0.009698,0.02367,0.025609
kansas-HB1715,0.011236,0.016144,1.0,0.011015,0.035341,0.034711,0.030578,0.109634,0.082407,0.041484,0.064731,0.022567,0.026869,0.059293,0.034813,0.020138,0.040725,0.007948,0.045651,0.033954,0.117248,0.019649,0.041302,0.020198,0.010225,0.097327,0.058255
Wyoming-HB0075,0.014557,0.008682,0.011015,1.0,0.018271,0.019305,0.007748,0.028372,0.036556,0.035698,0.022207,0.01945,0.015843,0.01264,0.039689,0.010792,0.030417,0.013751,0.017937,0.01222,0.028005,0.023761,0.023618,0.017586,0.0062,0.017611,0.015565
Kansas hb2183,0.017098,0.024267,0.035341,0.018271,1.0,0.062225,0.021767,0.109267,0.080906,0.012969,0.043978,0.031062,0.043216,0.036201,0.057704,0.022135,0.065819,0.040688,0.030243,0.027696,0.077216,0.023169,0.322189,0.026858,0.011474,0.04346,0.035856
izona-SB1485,0.018081,0.278811,0.034711,0.019305,0.062225,1.0,0.024411,0.085769,0.089968,0.015581,0.049364,0.041235,0.047726,0.049376,0.111562,0.028594,0.04602,0.038651,0.031137,0.036673,0.097971,0.033152,0.068002,0.023334,0.015183,0.040209,0.049494
vada-SB84,0.009466,0.005798,0.030578,0.007748,0.021767,0.024411,1.0,0.026039,0.023428,0.005187,0.011099,0.011905,0.009332,0.01514,0.027973,0.021089,0.014917,0.004761,0.025551,0.020535,0.043739,0.007825,0.021163,0.005561,0.004251,0.031952,0.016904
Indiana-SB0398,0.027481,0.04294,0.109634,0.028372,0.109267,0.085769,0.026039,1.0,0.152222,0.025226,0.099491,0.056743,0.064339,0.078655,0.110247,0.034485,0.142067,0.022207,0.048126,0.047161,0.152128,0.045523,0.104121,0.031965,0.02103,0.074634,0.082561
Georgia-SB202,0.02967,0.038516,0.082407,0.036556,0.080906,0.089968,0.023428,0.152222,1.0,0.033735,0.110206,0.069287,0.080025,0.066013,0.128632,0.040107,0.082939,0.035751,0.051148,0.043167,0.15683,0.066245,0.086314,0.020383,0.030003,0.054694,0.065614
kansas-HB1244,0.007427,0.005236,0.041484,0.035698,0.012969,0.015581,0.005187,0.025226,0.033735,1.0,0.024773,0.018759,0.016858,0.020527,0.030007,0.007762,0.015573,0.003865,0.024939,0.01191,0.035757,0.017849,0.02014,0.009458,0.020852,0.046533,0.025054


In [17]:
# Total similarity per bill (column sums), largest first.
# A higher total may hint at more text shared with the other bills.
column_totals = similarity.sum(axis=0)
column_totals.sort_values(ascending=False)

bill_name
Indiana-SB0398       2.822433
Kentucky-HB574       2.809308
Georgia-SB202        2.754768
izona-SB1819         2.489560
izona-SB1485         2.436543
Kansas hb2332        2.387975
Kansas hb2183        2.321074
Iowa-SF568           2.237295
Iowa S.F. 413        2.174926
bama-HB538           2.115077
kansas-HB1715        2.093445
Oklahoma HB2663      2.079828
Montana-HB176        2.047043
kansas-SB643         2.033838
Montana-SB169        1.828839
izona-SB1003         1.816385
Florida S.B. 90      1.807610
Idaho HB 290         1.756620
Utah-HB0012          1.657634
Montana HB 530       1.591583
bama-HB285           1.534832
kansas-HB1244        1.533161
Wyoming-HB0075       1.507497
vada-SB84            1.448117
Louisiana-HB167      1.441705
Texas-HB3920         1.434681
w Hampshire-HB523    1.340421
dtype: float64

### ...and figure out what we want to use as our baseline

We'll use `Indiana-SB0398` because it seems to have some lines in common with other bills

In [18]:
# The 20 highest similarity scores against the Indiana-SB0398 baseline
baseline_scores = similarity['Indiana-SB0398']
baseline_scores.sort_values(ascending=False).head(20)

bill_name
Indiana-SB0398      1.000000
Georgia-SB202       0.152222
Kentucky-HB574      0.152128
Oklahoma HB2663     0.142067
izona-SB1819        0.110247
kansas-HB1715       0.109634
Kansas hb2183       0.109267
Kansas hb2332       0.104121
bama-HB538          0.099491
izona-SB1485        0.085769
Iowa-SF568          0.082561
Iowa S.F. 413       0.078655
kansas-SB643        0.074634
Montana-HB176       0.064339
Florida S.B. 90     0.056743
Idaho HB 290        0.048126
Utah-HB0012         0.047161
Montana-SB169       0.045523
izona-SB1003        0.042940
bama-HB285          0.034485
Name: Indiana-SB0398, dtype: float64

In [19]:
# Rank every bill by its similarity to the baseline (most similar first),
# take the bill names (they're the index) and convert them into a list.
# We deliberately keep ALL of them, so there is no arbitrary row cap here.
# The baseline's own score is 1.0, so it normally sorts to the front.
most_similar = (
    similarity['Indiana-SB0398']
    .sort_values(ascending=False)
    .index
    .tolist()
)
most_similar[:5]

['Indiana-SB0398',
 'Georgia-SB202',
 'Kentucky-HB574',
 'Oklahoma HB2663 ',
 'izona-SB1819']

### Learn the phrases that are inside of Indiana-SB0398

These are the phrases we want to highlight. The similarity step above used 2–5 word n-grams; down here we use 2–6 word n-grams for highlighting.

In [20]:
# Full text of the baseline bill whose phrases we will look for in the others
bill_text = df.loc['Indiana-SB0398', 'bill_text']

# Show only the first 100 characters as a quick check
bill_text[:100]

'First Regular Session of the 122nd General Assembly (2021)\n\nPRINTING CODE. Amendments: Whenever an e'

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# Find everything between 2-6 words
# and token_pattern means include things like "a" or "I" or "at"
vectorizer = CountVectorizer(ngram_range=(2,6), token_pattern='(?u)\\b\\w+\\b')

# Learn the phrases from the baseline bill only
vectorizer.fit([bill_text])

# `get_feature_names()` was removed in scikit-learn 1.2. Prefer its replacement
# and only fall back to the old name on releases that predate it.
# Wrapped in list() so `phrases` stays a plain list either way.
if hasattr(vectorizer, "get_feature_names_out"):
    phrases = list(vectorizer.get_feature_names_out())
else:
    phrases = list(vectorizer.get_feature_names())
phrases[:10]

['00 a',
 '00 a m',
 '00 a m on',
 '00 a m on election',
 '00 a m on election day',
 '000 the',
 '000 the absentee',
 '000 the absentee voter',
 '000 the absentee voter board',
 '000 the absentee voter board in']

### Use the highlighter

In [22]:
# You can always cut and paste this code as long as you have 'phrases'

import re

from rich.console import Console
from rich.highlighter import RegexHighlighter
from rich.theme import Theme


def _build_phrase_pattern(phrases):
    """Combine all phrases into ONE regex string, or return None if there are none.

    Why a single pattern: RegexHighlighter applies every entry of `highlights`
    to every printed text, so one entry per phrase means thousands of separate
    regex passes per bill.

    Details:
    * Each phrase is escaped, so it is matched literally.
    * Longer phrases come first, so the longest candidate at a position wins.
    * `(?<!\\w)` / `(?!\\w)` keep a phrase from matching inside a longer word
      (e.g. "00 a" inside "100 apples").
    * The outer lookahead makes each match zero-width, so the scan advances one
      character at a time and overlapping phrases are all still highlighted;
      the named group carries the span that gets styled.
    """
    unique_phrases = sorted({p for p in phrases if p}, key=lambda p: (-len(p), p))
    if not unique_phrases:
        return None
    alternation = "|".join(re.escape(p) for p in unique_phrases)
    return rf"(?=(?P<phrase>(?<!\w)(?:{alternation})(?!\w)))"


_phrase_pattern = _build_phrase_pattern(phrases)


# You can run this code if you have a variable called
# "phrases" that is... the phrases you want to match.
class PhraseHighlighter(RegexHighlighter):
    """Highlights every occurrence of any phrase in `phrases`."""

    # Prefix joined with the regex group name -> style "highlighter.phrase"
    base_style = "highlighter."
    # A single combined pattern (or nothing at all when `phrases` is empty)
    highlights = [_phrase_pattern] if _phrase_pattern else []


theme = Theme({"highlighter.phrase": "on pale_turquoise1"})
# NOTE: despite its name this is a rich Console (with the highlighter attached);
# the name is kept because the printing cell calls `highlighter.print(...)`.
highlighter = Console(highlighter=PhraseHighlighter(), theme=theme)

In [24]:
# Print each bill (in similarity order) with the baseline's phrases highlighted
for doc in most_similar:
    # Lower-case the text, then drop the punctuation marks listed below
    content = df.loc[doc].bill_text.lower()
    for mark in (".", ",", ";", '"', '“', '”', ")", "("):
        content = content.replace(mark, "")

    # Bill name as a banner, then the highlighted body, then a visual separator
    banner = f"[bold white on purple]{doc}[/bold white on purple]"
    highlighter.print(banner)
    highlighter.print(content)
    print("\n\n\n\n--------\n\n\n\n")





--------










--------










--------






KeyboardInterrupt: 