In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [8]:
import re

In [11]:
# Load the press-release table used throughout this notebook.
# The path is relative, so the CSV must sit in the kernel's working directory.
CaucusFiltered = pd.read_csv('CaucusFilter.csv')

In [26]:
# Row count per party label.
CaucusFiltered['Party'].value_counts() # no independents included

D    123
R     61
Name: Party, dtype: int64

In [9]:
# Fetch the NLTK resources used by preprocess_text: tokenizer model, stop-word list, WordNet.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

# Compiled once when the cell runs; the pattern text is unchanged.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

# Filled on first real use so the stop-word list is read and the lemmatizer
# is built once, not once per row when the function is used with .apply().
_PREPROCESS_RESOURCES = {}


def preprocess_text(text):
    """Normalize one press release into a space-separated string of lemmas.

    Steps, in order: remove http/https URLs, tokenize with NLTK's
    ``word_tokenize``, lowercase, keep purely alphabetic tokens, drop English
    stop words, and lemmatize with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or the float NaN pandas
        uses for a missing cell) is treated as empty instead of raising.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; ``''`` when the input
        is missing, empty, or whitespace-only.
    """
    # Missing cells reach .apply() as NaN/None; re.sub would raise on them.
    if not isinstance(text, str) or not text.strip():
        return ''

    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    stop_words = _PREPROCESS_RESOURCES['stop_words']
    lemmatize = _PREPROCESS_RESOURCES['lemmatizer'].lemmatize

    without_urls = _URL_PATTERN.sub('', text)

    # Single pass with the original per-token order:
    # lowercase -> alphabetic check -> stop-word check -> lemmatize.
    lemmas = [
        lemmatize(word)
        for word in (token.lower() for token in word_tokenize(without_urls))
        if word.isalpha() and word not in stop_words
    ]

    return ' '.join(lemmas)

# Add the cleaned-text column to the full data set.
# NOTE(review): `AllData` is not defined in any cell visible in this notebook, so on a
# fresh kernel (Restart & Run All) this line would raise NameError — confirm where it is loaded.
AllData['Proccessed Press Release'] = AllData['Press Release'].apply(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aaryakagalwala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aaryakagalwala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aaryakagalwala/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# Add the cleaned-text column: preprocess_text is applied to every raw press release.
CaucusFiltered['Proccessed Press Release'] = CaucusFiltered['Press Release'].apply(preprocess_text)

In [14]:
# Write both cleaned frames to CSV in the working directory, omitting the row index.
# NOTE(review): `AllData` is not defined in any cell visible in this notebook — confirm
# where it is loaded, otherwise the first line fails on a fresh kernel.
AllData.to_csv('CleanedData.csv',index=False)
CaucusFiltered.to_csv('CleanedCaucus.csv',index=False)

In [21]:
# Row count per caucus label.
CaucusFiltered['Caucus'].value_counts()

Congress Progressive      76
New Democrat Coalition    47
Freedom Party             34
Main Street               27
Name: Caucus, dtype: int64

In [36]:
# Display the frame as the cell output.
# NOTE(review): the saved output includes a `sentences` column that is only created in a
# later cell, so this cell was last run out of file order — re-run top to bottom to confirm.
CaucusFiltered

Unnamed: 0,Date,Member,Party,State / District,Title,Press Release,Caucus,Proccessed Press Release,sentences
0,May 8,Morgan Griffith,R,Va.-9,Griffith Statement on President Biden Pause on...,"Washington, DC – Today, Democratic Leader Hake...",Freedom Party,washington dc today democratic leader hakeem j...,[washington dc today democratic leader hakeem ...
1,May 8,Ayanna Pressley,D,Mass.-7,"Pressley, Haiti Caucus, Colleagues, Advocates ...","Washington, DC – Today, Congresswoman Julia Br...",Congress Progressive,washington dc today congresswoman julia brownl...,[washington dc today congresswoman julia brown...
2,May 8,Pramila Jayapal,D,Wash.-7,"Jayapal, Padilla, Congressional Leaders, Advoc...","April 23, 2024 Text of Letter (PDF) WASHINGTON...",Congress Progressive,april text letter pdf washington haiti caucus ...,[april text letter pdf washington haiti caucus...
3,May 3,Andy Biggs,R,Ariz.-5,Opposing the Uniparty's Latest Attack on the C...,In advance of chairing a congressi...,Freedom Party,advance chairing congressional hearing tuesday...,[advance chairing congressional hearing tuesda...
4,May 6,Julia Brownley,D,Calif.-26,Brownley Introduces Legislation to Expand Acce...,Welcome to the on-line office for Congressman ...,New Democrat Coalition,welcome office congressman gregorio kilili cam...,[welcome office congressman gregorio kilili ca...
...,...,...,...,...,...,...,...,...,...
179,April 11,Mike Levin,D,Calif.-49,SEEC Clean Energy Deployment Task Force Co-Cha...,\nReference ID: 18.d83a2217.1715273339.7623682...,Congress Progressive,reference id proceed problem persists please c...,[reference id proceed problem persists please ...
180,April 12,Mike Levin,D,Calif.-49,Rep. Levin and House Natural Resources Committ...,\nReference ID: 18.d83a2217.1715273339.7623c7a...,Congress Progressive,reference id proceed problem persists please c...,[reference id proceed problem persists please ...
181,April 12,Mike Levin,D,Calif.-49,Rep. Mike Levin and San Diego Congressional De...,\nReference ID: 18.d83a2217.1715273339.762408d...,Congress Progressive,reference id proceed problem persists please c...,[reference id proceed problem persists please ...
182,April 11,Jared Huffman,D,Calif.-2,Rep. Huffman Statement on 2024 Salmon Season C...,\nReference ID: 18.d83a2217.1715273340.7625986...,Congress Progressive,reference id proceed problem persists please c...,[reference id proceed problem persists please ...


In [16]:
import spacy
# Load spaCy's `en_core_web_sm` pipeline once; tokenize_by_sentence uses it as a module-level global.
# The model package must already be installed in the kernel's environment.
nlp = spacy.load('en_core_web_sm')

In [28]:
def tokenize_by_sentence(text):
    """Split ``text`` into sentences using the module-level spaCy pipeline ``nlp``.

    Returns a list containing the text of each sentence span spaCy finds,
    in document order.
    """
    return [span.text for span in nlp(text).sents]

# Segment each processed release into sentences, then reshape to one row per sentence.
# NOTE(review): the processed text has already been lowercased and had non-alphabetic
# tokens (including punctuation) removed by preprocess_text, so spaCy sees no sentence
# punctuation here — confirm this is intended rather than segmenting 'Press Release'.
CaucusFiltered['sentences'] = CaucusFiltered['Proccessed Press Release'].apply(tokenize_by_sentence)
Caucus_sentences = (
    CaucusFiltered
    .explode('sentences')
    # Rows exploded from one release share an index label; number them 0, 1, 2, ... per release.
    .assign(sentence_index=lambda frame: frame.groupby(level=0).cumcount())
    .reset_index(drop=True)
)


In [35]:
# Preview the sentence at position 217 as plain text (index label followed by the value).
text_output = Caucus_sentences['sentences'].iloc[217:218].to_string()
print(text_output)

217    reference id proceed problem persists please c...


In [50]:
# Drop releases that were not scraped or could not be scraped: keep only rows whose
# processed text does NOT contain the marker word below (matched case-insensitively).
substring = 'webmaster'
FinalCaucusFiltered = CaucusFiltered.loc[
    ~CaucusFiltered['Proccessed Press Release'].str.contains(substring, case=False)
]

FinalCaucusFiltered

Unnamed: 0,Date,Member,Party,State / District,Title,Press Release,Caucus,Proccessed Press Release,sentences
0,May 8,Morgan Griffith,R,Va.-9,Griffith Statement on President Biden Pause on...,"Washington, DC – Today, Democratic Leader Hake...",Freedom Party,washington dc today democratic leader hakeem j...,[washington dc today democratic leader hakeem ...
1,May 8,Ayanna Pressley,D,Mass.-7,"Pressley, Haiti Caucus, Colleagues, Advocates ...","Washington, DC – Today, Congresswoman Julia Br...",Congress Progressive,washington dc today congresswoman julia brownl...,[washington dc today congresswoman julia brown...
2,May 8,Pramila Jayapal,D,Wash.-7,"Jayapal, Padilla, Congressional Leaders, Advoc...","April 23, 2024 Text of Letter (PDF) WASHINGTON...",Congress Progressive,april text letter pdf washington haiti caucus ...,[april text letter pdf washington haiti caucus...
3,May 3,Andy Biggs,R,Ariz.-5,Opposing the Uniparty's Latest Attack on the C...,In advance of chairing a congressi...,Freedom Party,advance chairing congressional hearing tuesday...,[advance chairing congressional hearing tuesda...
4,May 6,Julia Brownley,D,Calif.-26,Brownley Introduces Legislation to Expand Acce...,Welcome to the on-line office for Congressman ...,New Democrat Coalition,welcome office congressman gregorio kilili cam...,[welcome office congressman gregorio kilili ca...
...,...,...,...,...,...,...,...,...,...
165,April 10,Jamaal Bowman,D,N.Y.-16,Rep. Jamaal Bowman Introduces the ACCESS In Me...,"Washington, D.C. – On Tuesday, the U.S. House ...",Congress Progressive,washington tuesday house representative unanim...,[washington tuesday house representative unani...
166,April 12,Jamaal Bowman,D,N.Y.-16,ADVISORY: REP. BOWMAN HOSTS FIRST HIP HOP TASK...,"Washington, D.C. – U.S. Representatives Darin ...",Congress Progressive,washington representative darin lahood jimmy p...,[washington representative darin lahood jimmy ...
171,April 11,Brian Fitzpatrick,R,Pa.-1,"Fitzpatrick, Castro Launch Bipartisan Push to ...","For far too long, Washington Democrats and the...",Main Street,far long washington democrat liberal medium wo...,[far long washington democrat liberal medium w...
174,April 12,Cori Bush,D,Mo.-1,Congresswoman Bush Announces Applications Open...,"Washington, D.C. – Today, U.S. Representative ...",Congress Progressive,washington today representative jared huffman ...,[washington today representative jared huffman...
