In [2]:
!pip install nltk



In [None]:
import nltk
nltk.download()

In [108]:
import json
from string import punctuation
import os
import re
import datetime
import math

In [5]:
def penn2morphy(penntag):
    """ Converts Penn Treebank tags to WordNet. """
    morphy_tag = {'NN': 'n', 'JJ': 'a',
                  'VB': 'v', 'RB': 'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n'


def lemmatize_sent(text):
    return [wnl.lemmatize(word.lower(), pos=penn2morphy(tag))
            for word, tag in nltk.pos_tag(text)]



In [8]:
from google.colab import drive
drive.mount('/content/drive')
checkpoints = '/content/drive/MyDrive/colab_files/'
if not os.path.exists(checkpoints):
    os.makedirs(checkpoints)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
def nv_preprocess(nv_json_path, trim=None):    
    """
    Loads JSON into environment as dictionary
    Preprocesses the raw PDF export from previously generated json    
    Optional: Trims transcript to exclude list of those present and signature page/list of exhibits
    
    Parameters
    ----------
    nv_json_path : STRING
        Local path of nv_json generated by nv_pdftotext.
    trim: TRUE/Default(NONE)
        Provides option to trim transcript to spoken section and transcriber notes
        
    Returns
    -------
    Cleaned dictionary that excludes PDF formatting and (optional) front and back end 

    """
    
    file_path = open(nv_json_path,'rb')
    data = json.load(file_path)
    
    if trim:
        for key in data:
            if isinstance(data[key], str):
                ##Removes list of attendees on front end
                start_location = re.search(r"(CHAIR.*[A-z]\:|Chair.*[A-z]\:)", data[key]).start() #Chair speaks first
                data[key] = data[key][start_location:] #Starts transcript from when Chair first speaks
                ##Removes signature page after submission (RESPECTFULLY SUBMITTED)
                end_location = re.search(r"(Respectfully\sSUBMITTED\:|RESPECTFULLY\sSUBMITTED\:)", data[key]).start() #Signature page starts with
                data[key] = data[key][:end_location] #End transcript just before respectfully submitted            
                ##PDF formatting
                data[key] = re.sub(r"Page\s[0-9]{1,}", "", data[key]) #Removes page number
                data[key] = re.sub(r"\n", "", data[key])
                data[key] = data[key].strip()
                data[key]=" ".join(data[key].split())
            elif isinstance(data[key], list):
                for i in range(len(data[key])):
                    start_location = re.search(r"(CHAIR.*[A-z]\:|Chair.*[A-z]\:)", data[key][i]).start() #Chair speaks first
                    data[key][i] = data[key][i][start_location:] #Starts transcript from when Chair first speaks
                    end_location = re.search(r"(Respectfully\sSUBMITTED\:|RESPECTFULLY\sSUBMITTED\:|RESPECTFULLY\sSUBMITTED)",
                                             data[key][i]).start()  # Signature page starts with
                    ##Removes signature page after submission (RESPECTFULLY SUBMITTED)
                    # try:
                    #     end_location = re.search(r"(Respectfully\sSUBMITTED\:|RESPECTFULLY\sSUBMITTED\:)", data[key][i]).start() #Signature page starts with
                    # except:
                    #     end_location = -1
                    data[key][i] = data[key][i][:end_location] #End transcript just before respectfully submitted
                    ##PDF formatting
                    data[key][i] = re.sub(r"Page\s[0-9]{1,}", "", data[key][i]) #Removes page number
                    data[key][i] = re.sub(r"\n", "", data[key][i])
                    data[key][i] = data[key][i].strip()
                    data[key][i]=" ".join(data[key][i].split())
            else:
                print("Incompatible File")

        return(data)
            
    else:
        for key in data:
            if isinstance(data[key], str):          
                ##PDF formatting
                data[key] = re.sub(r"Page\s[0-9]{1,}", "", data[key]) #Removes page number
                data[key] = re.sub(r"\n", "", data[key])
                data[key] = data[key].strip()
                data[key]=" ".join(data[key].split())
            elif isinstance(data[key], list):
                for i in range(len(data[key])):      
                    ##PDF formatting
                    data[key][i] = re.sub(r"Page\s[0-9]{1,}", "", data[key][i]) #Removes page number
                    data[key][i] = re.sub(r"\n", "", data[key][i])
                    data[key][i] = data[key][i].strip()
                    data[key][i]=" ".join(data[key][i].split())
            else:
                print("Incompatible File")

        return(data)

In [49]:
file_name = "nv_hhs_m_2021.json"
data = nv_preprocess(checkpoints + file_name, trim=True)

In [84]:
def json_split_by_date(json_file):
    """
    
    Parameters
    ----------
    Local path of nv_json generated by nv_pdftotext.
        Local path of cleaned nv_json file. 
    Returns
    -------
    A new json file with month as the keys. We can call new_json_file[month] if we want the transcripts of meetings for this month.
    Eg: call new_json_file[4], we would get the transcripts for April.

    """
    json_date = {}
    month = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

    for key in json_file.keys():
      for doc in json_file[key]:
        rx = r'{0}[ ]([1-9]|[12][0-9]|3[01])[,][ ](2020|2021)'.format(month[int(key)-1])
        match = re.search(rx, doc)
        date = datetime.datetime.strptime(match.group(), '%B %d, %Y').date()
        json_date[date] = doc
    return json_date

In [85]:
data = json_split_by_date(data)
# match = re.search(r'(January|February|March|April|May|June|July|August|September|October|November|December)[ ]([1-9]|[12][0-9]|3[01])[,][ ](2020|2021)', doc)


In [99]:
raw = {}
for i in data.keys():
    raw[i] = json.dumps(data[i])

# Break up the string into words and punctuation, and create a list of words and punctuation.
text = {}
for i in raw.keys():
    text[i] = [word.lower() for word in nltk.word_tokenize(raw[i])]

# Stopwords are non-content words that primarily has only grammatical function
stopwords_en = set(nltk.corpus.stopwords.words('english'))
text_no_stopwords = {}
for i in text.keys():
    text_no_stopwords[i] = [word for word in text[i] if word not in stopwords_en]

# Remove the punctuations
text_no_stopwords_punc = {}
for i in text_no_stopwords.keys():
    text_no_stopwords_punc[i] = [word for word in text_no_stopwords[i] if word not in punctuation]

# Lemmatization
wnl = nltk.stem.WordNetLemmatizer()
for i in text_no_stopwords.keys():
    text_no_stopwords_punc[i] = lemmatize_sent(text_no_stopwords_punc[i])

# Remove the line breaks
text_no_stopwords_punc_lb={}
for i in text_no_stopwords_punc.keys():
    text_no_stopwords_punc_lb[i] = [word for word in text_no_stopwords_punc[i] if not word.startswith('\\n')] + \
                                   [word[2:] for word in text_no_stopwords_punc[i] if word.startswith('\\n')]
# Why
text_no_stopwords_punc_lb_lemma={}
for i in text_no_stopwords_punc_lb.keys():
    text_no_stopwords_punc_lb_lemma[i]=lemmatize_sent(text_no_stopwords_punc_lb[i])

text_no_stopwords_punc_lb_lemma_md={}
for i in text_no_stopwords_punc_lb_lemma.keys():
    text_no_stopwords_punc_lb_lemma_md[i]=[word for word in text_no_stopwords_punc_lb_lemma[i] if nltk.pos_tag([word])[0][1] != 'MD' ]

In [100]:
for i in text_no_stopwords_punc_lb_lemma_md.keys():
  print(text_no_stopwords_punc_lb_lemma_md[i])

['``', 'chair', 'nguyen', 'roll', 'take', 'chair', 'remind', 'committee', 'member', 'witness', 'member', 'audience', 'committee', 'rule', 'protocol', 'procedure', 'in-person', 'virtual', 'meeting', 'move', 'first', 'agenda', 'item', 'open', 'bill', 'hear', 'senate', 'bill', '21', '1st', 'reprint', 'assembly', 'committee', 'health', 'human', 'service', 'april', '28', '2021', 'senate', 'bill', '21', '1st', 'reprint', 'revise', 'requirement', 'relate', 'background', 'investigation', 'conduct', 'certain', 'institution', 'agency', 'facility', 'serve', 'child', 'bdr', '5-303', 'ross', 'e.', 'armstrong', 'administrator', 'division', 'child', 'family', 'service', 'department', 'health', 'human', 'service', 'senate', 'bill', '21', '1st', 'reprint', 'start', 'bore', 'kind', 'nerdy', 'government', 'administrative', 'bill', 'standardize', 'crime', 'check', 'child-serving', 'agency', 'term', 'background', 'check', 'hire', 'employee', 'work', 'vulnerable', 'population', 'division', 'child', 'family'

In [101]:
from nltk.probability import FreqDist
textdist={}
for i in text_no_stopwords_punc_lb_lemma_md.keys():
    textdist[i] = FreqDist(text_no_stopwords_punc_lb_lemma_md[i])

In [127]:
for i in textdist.keys():
  for sent, f_table in textdist[i].items():
    print(sent, f_table)

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
qualify 0.0014820932689305357
disability-related 0.0018905066654930957
expense 0.004985813215733687
education 0.0014195154742905757
housing 0.014840344081973428
transportation 0.0008763887978517861
aim 0.001099141142360727
ease 0.0012447994548204647
financial 0.010501767665847891
burden 0.0008880250790622683
face 0.0006238108651555516
underemployed 0.0018905066654930957
demographic 0.0007109415112942443
group 0.0003954717308823674
2019 0.002381814681628629
legislative 0.00036754893936904683
session 0.00023016026908746212
office 0.0009491321541176818
work 0.0002758790581667393
assemblywoman 0.00016900582412627654
lesley 0.0012447994548204647
e. 0.0009778704830573546
cohen 0.0007884813636025942
district 0.0009325247996249299
29 0.0007109415112942443
division 0.00046032053817492423
age 0.005552795516644666
service -0.0007629442045776539
department 0.0001519405430986023
health -0.0006398886877102903
human -0.00044299986072250875
dhhs 0.0009964898171

In [106]:
termdist = {}
for i in textdist.keys():
  count_words = len(textdist[i].keys())
  termdist[i] = textdist[i]
  for word, count in textdist[i].items():
    termdist[i][word] = count / count_words

In [None]:
for i in termdist.keys():
  for sent, f_table in termdist[i].items():
    print(sent, f_table)

In [112]:
idfdist = {}

for i in termdist.keys():
  for word, count in termdist[i].items():
    if word in idfdist:
      idfdist[word] += 1
    else:
      idfdist[word] = 1
doc_count = len(termdist.keys())
for word, count in idfdist.items():
  idfdist[word] = math.log(doc_count/(count+1))

In [113]:
for sent, f_table in idfdist.items():
  print(sent, f_table)

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
guardrails 3.2771447329921766
shortfall 2.8716796248840124
gregory 2.8716796248840124
desperate 2.178532444324067
107,000 3.2771447329921766
remote 2.3608540011180215
conserve 3.2771447329921766
scarce 3.2771447329921766
apparent 2.583997552432231
arrange 2.3608540011180215
kristyn 3.2771447329921766
leonard 2.8716796248840124
marcus 3.2771447329921766
conklin 3.2771447329921766
hailey 3.2771447329921766
lindsley 3.2771447329921766
isolated 2.583997552432231
vitally 2.3608540011180215
efficiently 2.583997552432231
demonstration 1.890850371872286
nami 2.8716796248840124
twinge 3.2771447329921766
irony 3.2771447329921766
telephonic 2.3608540011180215
necessitate 2.3608540011180215
hopeful 2.0243817644968085
helpline 2.3608540011180215
inquire 2.8716796248840124
warmline 3.2771447329921766
impacted 1.890850371872286
securely 3.2771447329921766
conveniently 3.2771447329921766
value-based 2.3608540011180215
contradictory 2.8716796248840124
cost-effec

In [114]:
tfidfdist = {}
for i in termdist.keys():
  tfidfdist[i] = termdist[i]
  for word, count in termdist[i].items():
    tfidfdist[i][word] = count * idfdist[word]

In [115]:
for i in tfidfdist.keys():
  for sent, f_table in tfidfdist[i].items():
    print(sent, f_table)

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
qualify 0.0014820932689305357
disability-related 0.0018905066654930957
expense 0.004985813215733687
education 0.0014195154742905757
housing 0.014840344081973428
transportation 0.0008763887978517861
aim 0.001099141142360727
ease 0.0012447994548204647
financial 0.010501767665847891
burden 0.0008880250790622683
face 0.0006238108651555516
underemployed 0.0018905066654930957
demographic 0.0007109415112942443
group 0.0003954717308823674
2019 0.002381814681628629
legislative 0.00036754893936904683
session 0.00023016026908746212
office 0.0009491321541176818
work 0.0002758790581667393
assemblywoman 0.00016900582412627654
lesley 0.0012447994548204647
e. 0.0009778704830573546
cohen 0.0007884813636025942
district 0.0009325247996249299
29 0.0007109415112942443
division 0.00046032053817492423
age 0.005552795516644666
service -0.0007629442045776539
department 0.0001519405430986023
health -0.0006398886877102903
human -0.00044299986072250875
dhhs 0.0009964898171

In [124]:
sort_dict = {}
for i in tfidfdist.keys():
  sort_dict[i] = dict(sorted(tfidfdist[i].items(), key=lambda item: item[1], reverse=True))

In [125]:
for i in sort_dict.keys():
  for sent, f_table in sort_dict[i].items():
    print(sent, f_table)

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
nonprofit 0.0073135179130641124
deposit 0.007170942871376126
asset 0.00700355761507269
inspection 0.00700355761507269
donor 0.00680447018415334
parallel 0.00680447018415334
manufacturer 0.00680447018415334
congenital 0.00680447018415334
poverty 0.006647750954311582
gift 0.006476232977375968
idas 0.006472306911768618
redundancy 0.006472306911768618
viral 0.006472306911768618
sti 0.006472306911768618
ratti 0.006429552703483298
s.b 0.00631110217530975
money 0.006282889135906707
regulation 0.0062532154926722315
gas 0.006216863729079714
senate 0.005918990986121167
blood 0.005816782085022634
raise 0.005774217748507688
banker 0.005671519996479288
debt 0.005671519996479288
stds 0.005671519996479288
kieckhefer 0.005626296580887214
age 0.005552795516644666
bank 0.00555105683775083
oversight 0.005495705711803635
expose 0.00548948920526029
eligibility 0.005485138434798084
financially 0.005258332787110717
hardy 0.005160598476862029
code 0.005157438532479945


In [126]:
for i in sort_dict.keys():
  print(i)

2021-04-28
2021-04-02
2021-04-05
2021-04-09
2021-04-21
2021-04-23
2021-04-27
2021-04-29
2021-04-01
2021-04-06
2021-04-08
2021-04-13
2021-04-22
2021-05-03
2021-05-05
2021-05-07
2021-05-24
2021-05-27
2021-05-04
2021-05-06
2021-05-11
2021-05-13
2021-05-18
2021-05-25
2021-02-03
2021-02-08
2021-02-10
2021-02-15
2021-02-17
2021-02-22
2021-02-24
2021-02-16
2021-02-18
2021-02-04
2021-02-09
2021-02-11
2021-03-03
2021-03-08
2021-03-10
2021-03-15
2021-03-19
2021-03-22
2021-03-24
2021-03-26
2021-03-29
2021-03-31
2021-03-02
2021-03-09
2021-03-11
2021-03-18
2021-03-23
2021-03-25
2021-03-30


In [83]:
month = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
for key in data.keys():
      for doc in data[key]:
        rx = r'{0}[ ]([1-9]|[12][0-9]|3[01])[,][ ](2020|2021)'.format(month[int(key)-1])
        match = re.search(rx, doc, re.IGNORECASE)
        print(match.group())

April 28, 2021
April 2, 2021
April 5, 2021
April 5, 2021
April 9, 2021
April 21, 2021
April 23, 2021
April 27, 2021
April 29, 2021
April 1, 2021
April 6, 2021
April 8, 2021
April 13, 2021
April 22, 2021
May 3, 2021
May 5, 2021
May 7, 2021
May 3, 2021
May 5, 2021
May 24, 2021
May 27, 2021
May 27, 2021
May 4, 2021
May 6, 2021
May 11, 2021
May 13, 2021
May 18, 2021
May 25, 2021
May 27, 2021
February 3, 2021
February 8, 2021
February 10, 2021
February 15, 2021
February 17, 2021
February 22, 2021
February 24, 2021
February 16, 2021
February 18, 2021
February 4, 2021
February 9, 2021
February 11, 2021
March 3, 2021
March 8, 2021
March 10, 2021
March 15, 2021
March 10, 2021
March 19, 2021
March 22, 2021
March 24, 2021
March 26, 2021
March 29, 2021
March 31, 2021
March 26, 2021
March 2, 2021
March 9, 2021
March 11, 2021
March 18, 2021
March 23, 2021
March 25, 2021
March 30, 2021
March 26, 2021
March 26, 2021
