source: https://github.com/dallascard/us-immigration-speeches/blob/main/parsing/parse_hein_bound.py

In [1]:
import os
import json
from optparse import OptionParser
import pandas as pd
import spacy
from tqdm import tqdm


In [2]:
indir = './Congress/hein-bound/'
outdir = './Congress/hein-bound_parsed/'
first = 110
last = 111
encoding = 'Windows-1252'

In [4]:
!python -m spacy download en

2022-11-06 18:43:08.091585: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-06 18:43:08.091614: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 850 kB/s eta 0:00:01
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# parse_hein_bound.py

In [9]:
usage = "%prog"
parser = OptionParser(usage=usage)
# parser.add_option('--indir', type=str, default='./Congress/hein-bound/',
#                   help='Hein bound dir: default=%default')
# parser.add_option('--outdir', type=str, default='./Congress/hein-bound_parsed/',
#                   help='Hein bound dir: default=%default')
# parser.add_option('--first', type=int, default=43,
#                   help='First congress: default=%default')
# parser.add_option('--last', type=int, default=111,
#                   help='Last congress: default=%default')
# parser.add_option('--encoding', type=str, default='Windows-1252',
#                   help='Infile encoding: default=%default')
#parser.add_option('--by-issue', action="store_true", default=False,
#                  help='Divide data by issue: default=%default')

# (options, args) = parser.parse_args()

# indir = options.indir
# outdir = options.outdir
# first = options.first
# last = options.last
# encoding = options.encoding

indir = './Congress/hein-bound/'
outdir = './Congress/hein-bound_parsed/'
first = 110
last = 111
encoding = 'Windows-1252'
if not os.path.exists(outdir):
    os.makedirs(outdir)

print("Loading spacy")
nlp = spacy.load("en_core_web_sm")

for congress in range(first, last+1):
    outfile = os.path.join(outdir, 'speeches_' + str(congress).zfill(3) + '.txt')
    if os.path.isfile(outfile):
        print(str(congress)+" existed")
    else:
        
        infile = os.path.join(indir, 'speeches_' + str(congress).zfill(3) + '.txt')
        descr_file = os.path.join(indir, 'descr_' + str(congress).zfill(3) + '.txt')
        basename = os.path.splitext(os.path.basename(infile))[0]

        # first read the description file to get speech dates
        speech_dates = {}
        with open(descr_file, encoding=encoding) as f:
            lines = f.readlines()
        for line in lines:
            parts = line.split('|')
            speech_id = parts[0]
            date = parts[2]
            speech_dates[speech_id] = date

        with open(infile, encoding='Windows-1252') as f:
            lines = f.readlines()

        outlines = []
        for line in tqdm(lines):
            line = line.strip()
            parts = line.split('|')
            line_id = parts[0]
            # drop the header
            if line_id != 'speech_id':
                date = speech_dates[line_id]
                # skip one day that has is corrupted by data from 1994
                if date != '18940614':
                    text = ' '.join(parts[1:])

                    # parse the text
                    parsed = nlp(text)

                    # reattach possessive 's's
                    possessives = [token.i for token in parsed if token.tag_ == 'POS' and token.text == "'s" and token.sent.start != token.i]
                    with parsed.retokenize() as retokenizer:
                        for pos in possessives:
                            retokenizer.merge(parsed[pos-1:pos+1])

                    # collect features to be saved
                    sents = []
                    tokens = []
                    lemmas = []
                    tags = []
                    deps = []
                    heads = []
                    for sent in parsed.sents:
                        sents.append(sent.text)
                        tokens.append([token.text for token in sent])
                        lemmas.append([token.lemma_ for token in sent])
                        tags.append([token.tag_ for token in sent])
                        deps.append([token.dep_ for token in sent])
                        heads.append([token.head.i - sent.start for token in sent])

                    outlines.append({'id': line_id, 'tokens': tokens, 'lemmas': lemmas, 'tags': tags, 'deps': deps, 'heads': heads})


        print("Saving {:d} lines to {:s}".format(len(outlines), outfile))
        with open(outfile, 'w') as fo:
            for line in outlines:
                fo.write(json.dumps(line) + '\n')

Loading spacy


100%|█████████████████████████████████████████████████████████████████████| 212387/212387 [1:08:20<00:00, 51.80it/s]


Saving 212386 lines to ./Congress/hein-bound_parsed/speeches_110.txt


100%|█████████████████████████████████████████████████████████████████████| 179269/179269 [1:00:58<00:00, 49.00it/s]


Saving 179268 lines to ./Congress/hein-bound_parsed/speeches_111.txt


# parse by party

In [6]:
party_full = './Congress/party_full.txt'


# Create a DataFrame from the sample data
dfparty = pd.read_csv(party_full, sep='|')
dfparty
# # Function to categorize party based on 'party_full' content
# def categorize_party(party_full):
#     if 'republican' in party_full.lower().split(' ')[-1]:
#         return 'Republican'
#     elif 'democrat' in party_full.lower().split(' ')[-1]:
#         return 'Democrat'
#     else:
#         return 'Other'

# # Apply the categorization function to the 'party_full' column
# dfparty['party_category'] = dfparty['party_full'].apply(categorize_party)


Unnamed: 0,session,party,party_full
0,1,A,anti-administration
1,1,P,pro-administration
2,1,F,federalist
3,1,R,republican
4,1,D,democratic republican
...,...,...,...
542,113,D,democrat
543,113,I,independent
544,114,R,republican
545,114,D,democrat


In [8]:
dfparty['party'].value_counts()

party
D    109
R    107
I     82
A     50
P     37
F     35
U     34
W     20
J     17
C     17
N     17
S     16
L      6
Name: count, dtype: int64

In [16]:
# list the party_full when category is Other
df[df['party_category'] == 'Other']['party_full'].value_counts()

party_full
independent                   40
unionist                      23
whig                          20
federalist                    19
farmer-labor                  11
adams                          9
jackson                        9
progressive                    9
socialist                      8
american labor                 7
american                       7
conservative                   6
populist                       6
anti masonic                   6
coalitionist                   6
jacksonian                     6
new progressive                6
nullifier                      5
national greenbacker           5
anti-jacksonian                5
anti jacksonian                5
anti-administration            4
free soil                      4
unconditional unionist         3
pro-administration             3
unknown                        3
silver                         3
anti mason                     3
prohibitionist                 3
union                          2

In [21]:
from tqdm import tqdm

# File paths
indir = './Congress/hein-bound/'
outdir = './Congress/hein-bound_parsed_party/'
party_full = './Congress/party_full.txt'
first = 43
last = 137
encoding = 'Windows-1252'
# Read and categorize party_full.txt
dfparty = pd.read_csv(party_full, sep='|')
dfparty['party_code'] = dfparty['party'].astype(str)
# Function to categorize party based on 'party_full' content
def categorize_party(party_full):
    if 'republican' in party_full.lower().split(' ')[-1]:
        return 'Republican'
    elif 'democrat' in party_full.lower().split(' ')[-1]:
        return 'Democrat'
    else:
        return 'Other'

# Apply the categorization function to the 'party_full' column
dfparty['party_category'] = dfparty['party_full'].apply(categorize_party)

# Initialize an empty list to collect speech metadata
speech_metadata = []

for congress in tqdm(range(first, last+1)):
    speaker_map_file = os.path.join(indir, str(congress).zfill(3) + '_SpeakerMap.txt')
    party_map = {}
    speech_file = f'speeches_{str(congress).zfill(3)}.txt'  # Name of the speech file being processed

    # Parse SpeakerMap to get party affiliations
    with open(speaker_map_file, encoding=encoding) as f:
        for line in f.readlines()[1:]:  # Skip header line
            parts = line.strip().split('|')
            speech_id, party_code = parts[1], parts[7]
            party_full_name = dfparty[dfparty['party'] == party_code]['party_full'].values[0] if len(dfparty[dfparty['party'] == party_code]) > 0 else 'Other'
            categorized_party = categorize_party(party_full_name)
            party_map[speech_id] = categorized_party

            # Collect metadata for each speech
            speech_metadata.append({
                'speech_id': speech_id,
                'party_code': party_code,
                'party_full_name': party_full_name,  # New column 'party_full_name
                'categorized_party': categorized_party,
                'speech_file': speech_file
            })

    # The rest of the processing remains the same

# After processing, create a DataFrame from the collected metadata
df_speech_metadata = pd.DataFrame(speech_metadata)

# Display the DataFrame to verify the content
print(df_speech_metadata.head())


  0%|          | 0/95 [00:18<?, ?it/s]


KeyboardInterrupt: 

In [18]:
p = pd.read_csv(speaker_map_file,sep = '|')
p

Unnamed: 0,speakerid,speech_id,lastname,firstname,chamber,state,gender,party,district,nonvoting
0,46045281,460000006,WALLACE,WILLIAM,S,PA,M,D,,voting
1,46046491,460000008,ROLLINS,EDWARD,S,NH,M,R,,voting
2,46045281,460000009,WALLACE,WILLIAM,S,PA,M,D,,voting
3,46045281,460000010,WALLACE,WILLIAM,S,PA,M,D,,voting
4,46047111,460000012,CONKLING,ROSCOE,S,NY,M,R,,voting
...,...,...,...,...,...,...,...,...,...,...
139459,46045781,460120688,DAVIS,DAVID,S,IL,M,I,,voting
139460,46049820,460151464,DAVIS,GEORGE,H,IL,M,R,2.0,voting
139461,46045781,460156619,DAVIS,DAVID,S,IL,M,I,,voting
139462,46045781,460168207,DAVIS,DAVID,S,IL,M,I,,voting


## parallel processing

In [11]:
import os
import pandas as pd
from tqdm.auto import tqdm
from concurrent.futures import ProcessPoolExecutor
# File paths
indir = './Congress/hein-bound/'
outdir = './Congress/hein-bound_parsed_party/'
party_full = './Congress/party_full.txt'
first = 43
last = 111
encoding = 'Windows-1252'
# Read and categorize party_full.txt
dfparty = pd.read_csv(party_full, sep='|')
dfparty['party_code'] = dfparty['party'].astype(str)

def process_congress(congress):
    speaker_map_file = os.path.join(indir, f'{congress:03}_SpeakerMap.txt')
    speech_file = f'speeches_{congress:03}.txt'

    # Read SpeakerMap file into DataFrame
    df_speaker_map = pd.read_csv(speaker_map_file, sep='|', encoding=encoding, skiprows=1, header=None,
                                 names=['speakerid', 'speech_id', 'lastname', 'firstname', 'chamber', 'state', 'gender', 'party_code', 'district', 'nonvoting'])

    # Merge with party categorization
    df_speaker_map = df_speaker_map.merge(dfparty, how='left', on='party_code')
    # df_speaker_map['categorized_party'] = df_speaker_map['party_full'].apply(categorize_party)
    
    # Add 'speech_file' column
    df_speaker_map['speech_file'] = speech_file

    return df_speaker_map[['speech_id', 'party_code', 'party_full', 'speech_file']]

# Initialize DataFrame to store results
df_speech_metadata = pd.DataFrame(columns=['speech_id', 'party_code', 'party_full', 'speech_file'])

with ProcessPoolExecutor() as executor:
    futures = [executor.submit(process_congress, congress) for congress in range(first, last+1)]
    for future in tqdm(futures, total=last-first+1, desc='Processing Congress Sessions'):
        df_speech_metadata = pd.concat([df_speech_metadata, future.result()], ignore_index=True)

print("Processing completed.")


Processing Congress Sessions:   0%|          | 0/69 [00:00<?, ?it/s]

Processing completed.


In [13]:
# save the metadata to a file
filename= 'congress_party.csv'
df_speech_metadata.to_csv(os.path.join('./Congress/',filename), index=False)
print(f"Saved metadata to {filename}")

Saved metadata to congress_party.csv


## merge party and year for parsing

In [11]:
# df_party = pd.read_csv('./Congress/congress_party.csv')
df_year = pd.read_csv('./Congress/congress_year.csv')
df_year

Unnamed: 0,speech_id,year,filenum
0,430000001,1873,43
1,430000002,1873,43
2,430000003,1873,43
3,430000004,1873,43
4,430000005,1873,43
...,...,...,...
17395879,1110179264,2010,111
17395880,1110179265,2010,111
17395881,1110179266,2010,111
17395882,1110179267,2010,111


In [13]:
grouped = df_year.groupby('filenum')['year'].unique()
for filenum, years in grouped.items():
    if len(years) > 1:
        print(f"Filenum {filenum} is associated with multiple years: {years}")
        

Filenum 43 is associated with multiple years: [1873 1874 1875]
Filenum 44 is associated with multiple years: [1875 1876 1877]
Filenum 45 is associated with multiple years: [1877 1878 1879]
Filenum 46 is associated with multiple years: [1879 1880 1881]
Filenum 47 is associated with multiple years: [1881 1882 1883]
Filenum 48 is associated with multiple years: [1883 1884 1885]
Filenum 49 is associated with multiple years: [1885 1886 1887]
Filenum 50 is associated with multiple years: [1887 1888 1889]
Filenum 51 is associated with multiple years: [1889 1890 1891]
Filenum 52 is associated with multiple years: [1891 1892 1893]
Filenum 53 is associated with multiple years: [1893 1894 1895]
Filenum 54 is associated with multiple years: [1895 1896 1897]
Filenum 55 is associated with multiple years: [1897 1898 1899]
Filenum 56 is associated with multiple years: [1899 1900 1901]
Filenum 57 is associated with multiple years: [1901 1902 1903]
Filenum 58 is associated with multiple years: [1903 190

In [14]:
# sort by speech_id
df_party = df_speech_metadata
df_party = df_party.sort_values(by='speech_id')
df_party

Unnamed: 0,speech_id,party_code,party_full,speech_file
0,430000002,R,republican,speeches_043.txt
78,430000002,R,republican,speeches_043.txt
77,430000002,R,republican,speeches_043.txt
76,430000002,R,republican,speeches_043.txt
75,430000002,R,republican,speeches_043.txt
...,...,...,...,...
1428339766,1110179211,D,democrat,speeches_111.txt
1428339767,1110179211,D,democrat,speeches_111.txt
1428339768,1110179211,D,democrat,speeches_111.txt
1428339757,1110179211,D,democrat,speeches_111.txt


In [9]:
df_party['file_num'] = df_party['speech_file'].apply(lambda x: int(x.split('_')[1].split('.')[0]))
# merge the two dataframes based on speech_id and file_num
df = pd.merge(df_party, df_year, left_on='speech_id', right_on='speech_id')
df

Unnamed: 0,speech_id,party_code,party_full_name,categorized_party,speech_file,file_num,year,filenum
0,430000002,R,republican,Republican,speeches_043.txt,43,1873,43
1,430000007,R,republican,Republican,speeches_043.txt,43,1873,43
2,430000008,R,republican,Republican,speeches_043.txt,43,1873,43
3,430000010,R,republican,Republican,speeches_043.txt,43,1873,43
4,430000011,R,republican,Republican,speeches_043.txt,43,1873,43
...,...,...,...,...,...,...,...,...
297629,450129156,D,democratic republican,Republican,speeches_045.txt,45,1879,45
297630,450129976,D,democratic republican,Republican,speeches_045.txt,45,1879,45
297631,450131027,D,democratic republican,Republican,speeches_045.txt,45,1879,45
297632,450133776,R,republican,Republican,speeches_045.txt,45,1879,45


In [2]:
import os
import json
# use conda environement "spacy" if the following command does not work
import spacy
import pandas as pd
from tqdm import tqdm

# Initialize spacy
nlp = spacy.load("en_core_web_sm")

# File paths
indir = './Congress/hein-bound/'
outdir = './Congress/hein-bound_parsed_party/'
party_full = './Congress/party_full.txt'
first = 43
last = 45
encoding = 'Windows-1252'
# Read and categorize party_full.txt
dfparty = pd.read_csv(party_full, sep='|')

# Function to categorize party based on 'party_full' content
def categorize_party(party_full):
    if 'republican' in party_full.lower().split(' ')[-1]:
        return 'Republican'
    elif 'democrat' in party_full.lower().split(' ')[-1]:
        return 'Democrat'
    else:
        return 'Other'

# Apply the categorization function to the 'party_full' column
dfparty['party_category'] = dfparty['party_full'].apply(categorize_party)

# Ensure directories exist
if not os.path.exists(outdir):
    os.makedirs(outdir)

for congress in range(first, last+1):
    speaker_map_file = os.path.join(indir, str(congress).zfill(3) + '_SpeakerMap.txt')
    party_map = {}

    # Parse SpeakerMap to get party affiliations
    with open(speaker_map_file, encoding=encoding) as f:
        for line in f.readlines()[1:]:  # Skip header line
            parts = line.strip().split('|')
            speech_id, party_code = parts[1], parts[7]
            party_full_name = dfparty[dfparty['party'] == party_code]['party_full'].values[0] if len(dfparty[dfparty['party'] == party_code]) > 0 else 'Other'
            party_map[speech_id] = categorize_party(party_full_name)

    infile = os.path.join(indir, 'speeches_' + str(congress).zfill(3) + '.txt')
    with open(infile, encoding=encoding) as f:
        lines = f.readlines()

    # Initialize party-specific outlines
outlines_by_party = {'D': [], 'R': []}  # Initialize only for Democrat and Republican


    for line in tqdm(lines):
        line = line.strip()
        parts = line.split('|')
        line_id = parts[0]
        
        if line_id != 'speech_id':  # Skip header
            party = party_map.get(line_id, 'Other')  # Default to 'Other' if not found
            text = ' '.join(parts[1:])
            parsed = nlp(text)

            tokens = []
            for sent in parsed.sents:
                tokens.append([token.text for token in sent])

            # Process text with spacy and collect features...
            # Append to the correct party list
            outlines_by_party[party].append({'id': line_id, 'tokens':tokens})

    # Save outlines by party
    for party, outlines in outlines_by_party.items():
        outfile = os.path.join(outdir, f'speeches_{str(congress).zfill(3)}_{party[0]}.txt')
        print(f"Saving {len(outlines)} lines for party {party} to {outfile}")
        with open(outfile, 'w') as fo:
            for line in outlines:
                fo.write(json.dumps(line) + '\n')
print("Done")

In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a test sentence. Please let me speak, or I will be forced to speak.")

sents = []
tokens = []
for sent in doc.sents:
    sents.append(sent.text)
    tokens.append([token.text for token in sent])


In [7]:
tokens

[['This', 'is', 'a', 'test', 'sentence', '.'],
 ['Please',
  'let',
  'me',
  'speak',
  ',',
  'or',
  'I',
  'will',
  'be',
  'forced',
  'to',
  'speak',
  '.']]

In [29]:
!python -m spacy download en_core_web_sm


[0mCollecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[0mInstalling collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 3.3.0
    Uninstalling en-core-web-sm-3.3.0:
      Successfully uninstalled en-core-web-sm-3.3.0
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [25]:
!pip uninstall spacy -y
!pip install spacy -y
!python -m spacy download en_core_web_sm



Found existing installation: spacy 3.7.4
Uninstalling spacy-3.7.4:
  Successfully uninstalled spacy-3.7.4

Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: -y
/home/local/PSYCH-ADS/xuqian_chen/anaconda3/envs/jupyter_env/envs/ngram/bin/python: No module named spacy


In [None]:
import os
import json
import spacy
from tqdm import tqdm

# Assuming spacy is already loaded and nlp model is available
nlp = spacy.load("en_core_web_sm")

indir = './Congress/hein-bound/'
outdir = './Congress/hein-bound_parsed_party/'

first = 110
last = 111
encoding = 'Windows-1252'

if not os.path.exists(outdir):
    os.makedirs(outdir)

for congress in range(first, last+1):
    speaker_map_file = os.path.join(indir, str(congress).zfill(3) + '_SpeakerMap.txt')
    party_map = {}

    # Parse SpeakerMap to get party affiliations
    with open(speaker_map_file, encoding=encoding) as f:
        for line in f.readlines()[1:]:  # Skip header line
            parts = line.strip().split('|')
            speech_id, party = parts[1], parts[7]
            party_map[speech_id] = party

    infile = os.path.join(indir, 'speeches_' + str(congress).zfill(3) + '.txt')
    with open(infile, encoding=encoding) as f:
        lines = f.readlines()

    # Initialize party-specific outlines
    outlines_by_party = {'R': [], 'D': [], 'O': []}  # R: Republican, D: Democratic, O: Other

    for line in tqdm(lines):
        line = line.strip()
        parts = line.split('|')
        line_id = parts[0]
        
        if line_id != 'speech_id':  # Skip header
            party = party_map.get(line_id, 'O')  # Default to 'Other' if not found
            text = ' '.join(parts[1:])
            parsed = nlp(text)

            # Process text with spacy and collect features...

            # Append to the correct party list
            outlines_by_party[party[0] if party in ['R', 'D'] else 'O'].append({'id': line_id, ...})

    # Save outlines by party
    for party, outlines in outlines_by_party.items():
        outfile = os.path.join(outdir, f'speeches_{str(congress).zfill(3)}_{party}.txt')
        print(f"Saving {len(outlines)} lines for party {party} to {outfile}")
        with open(outfile, 'w') as fo:
            for line in outlines:
                fo.write(json.dumps(line) + '\n')


import sys

# Print the number of arguments passed to the script
print(f'Number of arguments: {len(sys.argv)}')

# Print the arguments passed to the script
print('Arguments:')
for i, arg in enumerate(sys.argv):
    print(f'{i}: {arg}')

    
sys.argv[1]

## in the bash: notes the i should be converted to int(i) in the script.py

#!/bin/bash

# Iterate over the numbers from 40 to 100
for i in {40..100}
do
    # Run the 'nohup' command with the current number as an argument in the background
    nohup python script.py $i &
done


# congress_parse_speeches_year_party

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import spacy
from concurrent.futures import ProcessPoolExecutor, as_completed

# File paths and settings
mainpath = '/home/local/PSYCH-ADS/xuqian_chen/YES_lab/Amber/nlp'
indir = mainpath+'/Congress/hein-bound/'
# outdir = mainpath+'/Congress/hein-bound_parsed_party_year/'
outdir = './'
party_full = mainpath+'/Congress/party_full.txt'
first = 43
last = 44
encoding = 'Windows-1252'

# Ensure output directory exists
if not os.path.exists(outdir):
    os.makedirs(outdir)

# Initialize spacy
nlp = spacy.load("en_core_web_sm")

df = pd.read_csv(mainpath+'/Congress/congress_party_year.csv')

# Only keep rows with party_code 'D' or 'R'
df = df[df['party_code'].isin(['D', 'R'])]

# Function to process each speech in a given congress session
def process_speeches(group):
    outlines_by_party = {'D': [], 'R': []}  # Initialize only for Democrat and Republican
    for _, row in group.iterrows():
        speech_file_path = os.path.join(indir, row['speech_file'])
        with open(speech_file_path, encoding=encoding) as f:
            lines = f.readlines()

        party_map = {row['speech_id']: row['party_code'] for _, row in group.iterrows()}
        
        for line in lines:
            line = line.strip()
            parts = line.split('|')
            line_id = parts[0]
            
            if line_id != 'speech_id':  # Skip header
                party = party_map.get(line_id)
                if party in outlines_by_party:  # Check if party is either 'D' or 'R'
                    text = ' '.join(parts[1:])
                    parsed = nlp(text)
                    tokens = [token.text for sent in parsed.sents for token in sent]
                    outlines_by_party[party].append({'id': line_id, 'tokens': tokens})
    
    return outlines_by_party

# Get the number of available CPU cores and leave one unoccupied
import multiprocessing
num_cores = max(1, multiprocessing.cpu_count() - 1)

# Use ProcessPoolExecutor to parallelize the processing
with ProcessPoolExecutor(max_workers=num_cores) as executor:
    future_to_congress = {executor.submit(process_speeches, group): congress for congress, group in df.groupby('filenum')}
    
    for future in as_completed(future_to_congress):
        congress = future_to_congress[future]
        outlines_by_party = future.result()
        
        for party, outlines in outlines_by_party.items():
            # save just the tokens, no speech id
            outfile = os.path.join(outdir, f'{congress}_{party}.txt')
            with open(outfile, 'w') as f:
                for outline in outlines:
                    f.write(' '.join(outline['tokens']) + '\n')

            
            print(f"Saved {len(outlines)} lines for party {party} to {outfile}")

print("Processing completed.")

## debug

In [1]:
import os
import pandas as pd
from tqdm import tqdm
import spacy
from concurrent.futures import ProcessPoolExecutor, as_completed

# File paths and settings
mainpath = '/home/local/PSYCH-ADS/xuqian_chen/YES_lab/Amber/nlp'
indir = mainpath+'/Congress/hein-bound/'
# outdir = mainpath+'/Congress/hein-bound_parsed_party_year/'
outdir = './'
party_full = mainpath+'/Congress/party_full.txt'
first = 43
last = 44
encoding = 'Windows-1252'

# Ensure output directory exists
if not os.path.exists(outdir):
    os.makedirs(outdir)

# Initialize spacy
nlp = spacy.load("en_core_web_sm")

df = pd.read_csv(mainpath+'/Congress/congress_party_year.csv')

# Only keep rows with party_code 'D' or 'R'
df = df[df['party_code'].isin(['D', 'R'])]

# Function to process each speech in a given congress session
def process_speeches(group):
    outlines_by_party = {'D': [], 'R': []}  # Initialize only for Democrat and Republican
    for _, row in group.iterrows():
        speech_file_path = os.path.join(indir, row['speech_file'])
        with open(speech_file_path, encoding=encoding) as f:
            lines = f.readlines()

        party_map = {row['speech_id']: row['party_code'] for _, row in group.iterrows()}
        
        for line in lines:
            line = line.strip()
            parts = line.split('|')
            line_id = parts[0]
            
            if line_id != 'speech_id':  # Skip header
                party = party_map.get(line_id)
                if party in outlines_by_party:  # Check if party is either 'D' or 'R'
                    text = ' '.join(parts[1:])
                    parsed = nlp(text)
                    tokens = [token.text for sent in parsed.sents for token in sent]
                    outlines_by_party[party].append({'id': line_id, 'tokens': tokens})
    
    return outlines_by_party

In [13]:
# Sort by 'filenum' and get the first two unique file numbers
unique_filenum = df['filenum'].unique()
if len(unique_filenum) > 2:
    unique_filenum = unique_filenum[:2]  # Select only the first two for testing
df = df[df['filenum'].isin(unique_filenum)]

# Sequential processing
for congress, group in df.groupby('filenum'):
    outlines_by_party = {'D': [], 'R': []}  # Initialize only for Democrat and Republican
    for _, row in group.iterrows():
        speech_file_path = os.path.join(indir, row['speech_file'])
        with open(speech_file_path, encoding=encoding) as f:
            lines = f.readlines()

        party_map = {row['speech_id']: row['party_code'] for _, row in group.iterrows()}
        
        for line in lines:
            line = line.strip()
            parts = line.split('|')
            line_id = parts[0]
            
            if line_id != 'speech_id':  # Skip header
                party = party_map.get(line_id)
                if party in outlines_by_party:  # Check if party is either 'D' or 'R'
                    text = ' '.join(parts[1:])
                    parsed = nlp(text)
                    tokens = [token.text for sent in parsed.sents for token in sent]
                    outlines_by_party[party].append({'id': line_id, 'tokens': tokens})
    
    for party, outlines in outlines_by_party.items():
        outfile = os.path.join(outdir, f'{congress}_{party}.txt')
        with open(outfile, 'w') as f:
            for outline in outlines:
                f.write(' '.join(outline['tokens']) + '\n')
        print(f"Saved {len(outlines)} lines for party {party} to {outfile}")

print("Processing completed.")


KeyboardInterrupt: 

In [21]:
party_map.get(430000046)

'D'

# tokenize_hein_bound.py

In [3]:
# import os
from glob import glob
from optparse import OptionParser

import json
import tqdm

import spacy

# usage = "%prog hein-bound-dir output-dir"
# parser = OptionParser(usage=usage)
# parser.add_option('--first', type=int, default=43,
#                   help='First congress: default=%default')
# parser.add_option('--last', type=int, default=111,
#                   help='Last congress: default=%default')

# (options, args) = parser.parse_args()

# indir = args[0]
# outdir = args[1]
indir = './Congress/hein-bound/'
outdir = './Congress/hein-bound_parsed/'


if not os.path.exists(outdir):
    os.makedirs(outdir)

first = 43
last = 44

print("Loading spacy")
nlp = spacy.load("en_core_web_sm")

for congress in range(first, last+1):
    infile = os.path.join(indir, 'speeches_' + str(congress).zfill(3) + '.txt')
    descr_file = os.path.join(indir, 'descr_' + str(congress).zfill(3) + '.txt')
    print(infile)
    basename = os.path.splitext(os.path.basename(infile))[0]
    outlines = []
    speech_dates = {}
    with open(descr_file, encoding='Windows-1252') as f:
        lines = f.readlines()
    for line in lines:
        parts = line.split('|')
        speech_id = parts[0]
        date = parts[2]
        speech_dates[speech_id] = date

    with open(infile, encoding='Windows-1252') as f:
        lines = f.readlines()
    for line in tqdm.tqdm(lines):
        line = line.strip()
        parts = line.split('|')
        line_id = parts[0]
        # drop the header
        if line_id != 'speech_id':
            date = speech_dates[line_id]
            # skip one day that has is corrupted by data from 1994
            if date != '18940614':
                text = ' '.join(parts[1:])
                parsed = nlp(text)
                sents = []
                tokens = []
                for sent in parsed.sents:
                    sents.append(sent.text)
                    tokens.append([token.text for token in sent])

                assert len(sents) == len(tokens)

                rejoined_sents = []
                rejoined_tokens = []
                if len(sents) > 0:
                    current_sent = sents[0]
                    current_tokens = tokens[0]
                    if len(sents) > 1:
                        for sent_i in range(1, len(sents)):
                            # look to see if this might be a false sentence break
                            if sents[sent_i-1][-1] == '.' and (sents[sent_i][0].islower() or sents[sent_i][0].isdigit() or sents[sent_i][0] == '$' or sents[sent_i][0] == '%'):
                                # if so, extend the previous sentence / tokens
                                current_sent += ' ' + sents[sent_i]
                                current_tokens.extend(tokens[sent_i])
                            else:
                                # otherwise, add the previous to the list, and start a new one
                                rejoined_sents.append(current_sent)
                                rejoined_tokens.append(current_tokens)
                                current_sent = sents[sent_i]
                                current_tokens = tokens[sent_i]
                    # add the current to the list
                    rejoined_sents.append(current_sent)
                    rejoined_tokens.append(current_tokens)

                outlines.append({'infile': basename, 'id': line_id, 'sents': rejoined_sents, 'tokens': rejoined_tokens})

    outfile = os.path.join(outdir, basename + '.jsonlist')
    print("Saving {:d} lines to {:s}".format(len(outlines), outfile))
    with open(outfile, 'w') as fo:
        for line in outlines:
            fo.write(json.dumps(line) + '\n')


Loading spacy
./Congress/hein-bound/speeches_043.txt


100%|████████████████████████████████████████████████████████████████████████████████████████████| 119303/119303 [19:17<00:00, 103.11it/s]


Saving 119302 lines to ./Congress/hein-bound_parsed/speeches_043.jsonlist
./Congress/hein-bound/speeches_044.txt


  7%|██████▉                                                                                        | 8425/114781 [01:43<21:46, 81.40it/s]


KeyboardInterrupt: 

# rejoin_into_pieces_by_congress.py

In [5]:
import os
import json
from optparse import OptionParser

import numpy as np

# Same as rejoin_into_pieces, except output one file per congress

def main():
    usage = "%prog outdir"
    parser = OptionParser(usage=usage)
    parser.add_option('--hein-bound-dir', type=str, default='data/speeches/Congress/hein-bound-tokenized-rejoined',
                      help='Issue: default=%default')
    parser.add_option('--hein-daily-dir', type=str, default='data/speeches/Congress/hein-daily-tokenized',
                      help='Issue: default=%default')
    parser.add_option('--first', type=int, default=43,
                      help='First congress: default=%default')
    parser.add_option('--last', type=int, default=114,
                      help='Last congress: default=%default')
    parser.add_option('--max', type=int, default=375,
                      help='Max tokens per block: default=%default')
    parser.add_option('--keep-boundaries', action="store_true", default=False,
                      help='Output each sentence on a separate line: default=%default')
    parser.add_option('--replace-periods', action="store_true", default=False,
                      help='Change periods that look wrong to commas: default=%default')
    parser.add_option('--use-sents', action="store_true", default=False,
                      help='Use sentences rather than tokens (avoid excess spaces): default=%default')

    (options, args) = parser.parse_args()

    outdir = args[0]

    hein_bound_dir = options.hein_bound_dir
    hein_daily_dir = options.hein_daily_dir
    first = options.first
    last = options.last
    max_length = options.max
    keep_boundaries = options.keep_boundaries
    replace_periods = options.replace_periods
    use_sents = options.use_sents

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    for congress in range(first, last+1):
        if congress < 100:
            infile = os.path.join(hein_bound_dir, 'speeches_0' + str(congress) + '.jsonlist')
        elif congress > 111:
            infile = os.path.join(hein_daily_dir, 'speeches_' + str(congress) + '.jsonlist')
        else:
            infile = os.path.join(hein_bound_dir, 'speeches_' + str(congress) + '.jsonlist')

        print(infile)
        with open(infile) as f:
            lines = f.readlines()

        lengths = []
        outlines = []
        # read each speech, one by one
        for line in lines:
            line = json.loads(line)
            speech_id = line['id']  # speehch id
            if use_sents:
                sents = line['sents']  # list of strings
            else:
                sents = line['tokens']  # list of (list of strings)

            joined_sents = []
            n_tokens = 0
            n_chars = 0
            # first try to rejoin bad splits
            for tokens in sents:
                if use_sents:
                    # put the string in a list to conform to below
                    n_chars += len(tokens)
                    n_tokens += len(tokens.split())
                    tokens = [tokens]
                else:
                    n_chars += sum([len(t) for t in tokens]) + len(tokens) - 1
                    n_tokens += len(tokens)
                # if this is the first sentence, start a new string
                if len(joined_sents) == 0:
                    joined_sents.append(' '.join(tokens))
                # if it looks like this is not the start of a sentence:
                elif tokens[0][0].islower() or tokens[0][0].isdigit() or tokens[0][0] == '$' or tokens[0][0] == '%':
                    # change a period to a comma at the end of the last group, if it is there
                    if replace_periods and joined_sents[-1][-1] == '.':
                        joined_sents[-1] = joined_sents[-1][:-1] + ', ' + ' '.join(tokens)
                    else:
                        joined_sents[-1] += ' ' + ' '.join(tokens)
                    n_chars += 1
                # otherwise, start a new string
                else:
                    joined_sents.append(' '.join(tokens))

            if keep_boundaries:
                check = 0
                for s_i, sent in enumerate(joined_sents):
                    lengths.append(len(sent))
                    check += len(sent)
                    outlines.append({'id': speech_id + '_s' + f'{s_i:03}', 'text': sent})

                assert check == n_chars

            else:
                # then connect these sentence into blocks of up to max_length tokens or keep as sentences:
                cur_length = 0
                output_blocks = []
                for sent in joined_sents:
                    sent_len = len(sent.split())
                    if len(output_blocks) == 0:
                        output_blocks.append(sent)
                        cur_length += sent_len
                    elif cur_length + sent_len > max_length:
                        # append this as a new block and reset count
                        output_blocks.append(sent)
                        cur_length = sent_len
                    else:
                        output_blocks[-1] += ' ' + sent
                        cur_length += sent_len

                check = 0
                for block_i, block in enumerate(output_blocks):
                    block_length = len(block.split())
                    check += block_length
                    lengths.append(block_length)
                    outlines.append({'id': speech_id + '_b' + f'{block_i:03}', 'text': block})

                try:
                    assert check == n_tokens
                except AssertionError as e:
                    print(n_tokens, check, len(outlines))

        print(np.mean(lengths), np.median(lengths), np.max(lengths), sum([1 for length in lengths if length > max_length]) / len(lengths))
        longest = int(np.argmax(lengths))
        print(outlines[longest])

        outfile = os.path.join(outdir, 'segments-' + str(congress).zfill(3) + '.jsonlist')
        with open(outfile, 'w') as f:
            for line in outlines:
                f.write(json.dumps(line) + '\n')


if __name__ == '__main__':
    main()

Usage: ipykernel_launcher.py outdir

ipykernel_launcher.py: error: no such option: -f


AssertionError: 

In [10]:
infile = './Congress/hein-bound_parsed/speeches_043.jsonlist'
with open(infile) as f:
    lines = f.readlines()

lines[:2]


['{"infile": "speeches_043", "id": "430000001", "sents": ["The Secretary will read the names of the newlyelected Senators.", "The list was read as follows: lion.", "Bainbridge Wadleigh. of New Hampshire.", "Hon.", "Justin S. Morrill. of Vermont.", "Hon.", "Orris S. Ferry. of Connecticut.", "Hon.", "Roscoe Coukling. of New York.", "Hon.", "Simon Cameron. of Pennsylvania. lan.", "George h. Dennis. of Maryland.", "Hon.", "Augustus S. Merrimon. of North Carolina.", "Hon.", "John J. Patterson. of South Carolina.", "Hon.", "Simon B. Conover. of Florida.", "Ion.", "George E. Spencer. of Alabama.", "H10a.", "Stephen WV.", "Dorsey. of Arkansas. lion.", "John B. Gordon. of Georgia.", "Hon.", "Lewis V. Bogy. of Missouri.", "Hon.", "Thomas C. MeCreery. of Kentucky.", "Hon.", "John Sherman. of Ohio.", "Hon. Oliver P1.", "Morton. of Indiana.", "Hon.", "Richard 3.", "Oglesby. of Illinois.", "Hon.", "Timothy 0.", "Howe. of Wisconsin.", "Hon.", "William B. Allison. of Iowa.", "Hon.", "John J. Ingalls. 

In [11]:
with open('./Congress/hein-bound_parsed/speeches_043.txt', 'r') as f:
    # Read the first line of the file
    lines = f.readlines()
print(lines[:2])

['{"id": "430000001", "tokens": [["The", "Secretary", "will", "read", "the", "names", "of", "the", "newlyelected", "Senators", "."], ["The", "list", "was", "read", "as", "follows", ":", "lion", "."], ["Bainbridge", "Wadleigh", ".", "of", "New", "Hampshire", "."], ["Hon", "."], ["Justin", "S.", "Morrill", ".", "of", "Vermont", "."], ["Hon", "."], ["Orris", "S.", "Ferry", ".", "of", "Connecticut", "."], ["Hon", "."], ["Roscoe", "Coukling", ".", "of", "New", "York", "."], ["Hon", "."], ["Simon", "Cameron", ".", "of", "Pennsylvania", "."], ["lan", "."], ["George", "h.", "Dennis", "."], ["of", "Maryland", "."], ["Hon", "."], ["Augustus", "S.", "Merrimon", ".", "of", "North", "Carolina", "."], ["Hon", "."], ["John", "J.", "Patterson", ".", "of", "South", "Carolina", "."], ["Hon", "."], ["Simon", "B.", "Conover", ".", "of", "Florida", "."], ["Ion", "."], ["George", "E.", "Spencer", ".", "of", "Alabama", "."], ["H10a", "."], ["Stephen", "WV", "."], ["Dorsey", ".", "of", "Arkansas", "."], ["lio

In [13]:
lines[0]

'{"id": "430000001", "tokens": [["The", "Secretary", "will", "read", "the", "names", "of", "the", "newlyelected", "Senators", "."], ["The", "list", "was", "read", "as", "follows", ":", "lion", "."], ["Bainbridge", "Wadleigh", ".", "of", "New", "Hampshire", "."], ["Hon", "."], ["Justin", "S.", "Morrill", ".", "of", "Vermont", "."], ["Hon", "."], ["Orris", "S.", "Ferry", ".", "of", "Connecticut", "."], ["Hon", "."], ["Roscoe", "Coukling", ".", "of", "New", "York", "."], ["Hon", "."], ["Simon", "Cameron", ".", "of", "Pennsylvania", "."], ["lan", "."], ["George", "h.", "Dennis", "."], ["of", "Maryland", "."], ["Hon", "."], ["Augustus", "S.", "Merrimon", ".", "of", "North", "Carolina", "."], ["Hon", "."], ["John", "J.", "Patterson", ".", "of", "South", "Carolina", "."], ["Hon", "."], ["Simon", "B.", "Conover", ".", "of", "Florida", "."], ["Ion", "."], ["George", "E.", "Spencer", ".", "of", "Alabama", "."], ["H10a", "."], ["Stephen", "WV", "."], ["Dorsey", ".", "of", "Arkansas", "."], ["lion

In [None]:
nohup python script.py --hein-bound-dir /data/hein-bound-tokenized --hein-daily-dir /data/hein-daily-tokenized --first 43 --last 114 --max 375 --keep-boundaries --replace-periods


In [8]:
import pandas as pd
import os
import pandas as pd
data_dir = "./Congress/hein-bound/"

# Create an empty dataframe to store the results
df_results = pd.DataFrame()
for i in range(43,112):
    if i < 100:
        n = "0"+str(i)
    else:
        n = str(i)
    descr = pd.read_csv(os.path.join(data_dir, f'descr_{n}.txt'), 
                          encoding="ISO-8859-1", 
                          sep="|")
    dates = pd.to_datetime(descr['date'], format="%Y%m%d")
    descr['year'] = dates.dt.year
    descr = descr.assign(filenum=n)
    df_selected = descr.loc[:, ["speech_id", "year","filenum"]]
    df_results = pd.concat([df_results, df_selected])


# Print the resulting dataframe
print(df_results)
df_results.to_csv(os.path.join('./Congress/',"congressyear.csv"), index=False)

         speech_id  year filenum
0        430000001  1873     043
1        430000002  1873     043
2        430000003  1873     043
3        430000004  1873     043
4        430000005  1873     043
...            ...   ...     ...
179263  1110179264  2010     111
179264  1110179265  2010     111
179265  1110179266  2010     111
179266  1110179267  2010     111
179267  1110179268  2010     111

[17395884 rows x 3 columns]


# start of training

In [4]:
import pandas as pd
import os
import sys

pro_dir = './Congress'
data_dir = "./Congress/hein-bound/"
text_dir = "./Congress/hein-bound_parsed/"
model_dir = "./Congress/model/"

dfresults = pd.read_csv(os.path.join(pro_dir,'congressyear.csv'), dtype={'filenum': str} )
# dfresults['year'].hist(bins = 200)

In [5]:
dfresults

Unnamed: 0,speech_id,year,filenum
0,430000001,1873,043
1,430000002,1873,043
2,430000003,1873,043
3,430000004,1873,043
4,430000005,1873,043
...,...,...,...
17395879,1110179264,2010,111
17395880,1110179265,2010,111
17395881,1110179266,2010,111
17395882,1110179267,2010,111


In [47]:
for year in range(1873,2011):
    df_new = dfresults.loc[dfresults['year']==year]
    if len(list(set(df_new['filenum'])))>1:
        print(year, list(set(df_new['filenum'])))

1875 ['043', '044']
1877 ['045', '044']
1879 ['045', '046']
1881 ['047', '046']
1883 ['048', '047']
1885 ['048', '049']
1887 ['050', '049']
1889 ['051', '050']
1891 ['052', '051']
1893 ['052', '053']
1895 ['053', '054']
1897 ['055', '054']
1899 ['055', '056']
1901 ['056', '057']
1903 ['057', '058']
1905 ['059', '058']
1907 ['059', '060']
1909 ['061', '060']
1911 ['061', '062']
1913 ['063', '062']
1915 ['063', '064']
1917 ['064', '065']
1919 ['066', '065']
1921 ['067', '066']
1923 ['067', '068']
1925 ['068', '069']
1927 ['070', '069']
1929 ['070', '071']
1931 ['072', '071']
1933 ['073', '072']
1941 ['077', '076']
1951 ['081', '082']
1971 ['092', '091']
2009 ['111', '110']


In [113]:
infile = f'./Congress/hein-bound_parsed/speeches_053.jsonlist'
with open(infile, 'r') as f:
    # Read the lines from the file
    lines = f.readlines()
lines[:3]

['{"infile": "speeches_053", "id": "530000001", "sents": ["Senators. deeply impressed with a sense of its responsibilities and of its dignities.", "I now enter upon the discharge of the duties of the high office to which I have been called.", "I am not unmindful of the fact that among the occupants of this chair during the one hundred and four years of our constitutional history have been statesmen eminent alike for their talents and for their tireless devotion to public duty.", "Adams.", "Jefferson. and Calhoun honored its incumbency during the early days of the Republic. while Arthur.", "Hendricks. and Morton have at a later period of our history shed luster upon the office of President of the most august deliberative assembly known to men.", "I assume the duties of the great trust confided to me with no feeling of selfconfidence. but rather with that of grave distrust of my ability satisfactorily to meet its requirements.", "I may be pardoned for saying that it shall be my earnest e

In [127]:
df_new = dfresults.loc[dfresults['year']==year]
allline = []
infile = f'./Congress/hein-bound_parsed/speeches_053.jsonlist'
with open(infile, 'r') as f:
    # Read the lines from the file
    lines = f.readlines()
    for line in lines[:100]:
        jline = json.loads(line)
        if int(jline['id']) in df_new['speech_id'].to_list():
            print(int(jline['id']))
            # print(jline['tokens'])
            allline.extend(jline['tokens'])

In [128]:
int(jline['id'])

530000100

In [129]:
import gensim
from gensim.models import Word2Vec
import json
for year in range(1895,1896):
    df_new = dfresults.loc[dfresults['year']==year]
    allline = []
    for filenum in list(set(df_new['filenum'])):
        print(filenum)
        infile = f'./Congress/hein-bound_parsed/speeches_{filenum}.jsonlist'
        with open(infile, 'r') as f:
            # Read the lines from the file
            lines = f.readlines()
            
            for line in lines:
                jline = json.loads(line)
                if int(jline['id']) in df_new['speech_id'].to_list():
                    # print(int(jline['id']))
                    # print(jline['tokens'])
                    allline.extend(jline['tokens'])
         
            
len(alldata)
# model_1 = Word2Vec(,workers=cores-1)
# model_1.save(model_file)

053
054


0

In [147]:
len(alldata)
from gensim.models import Word2Vec

# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 3    # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

model_dir = "./Congress/model/"

model_2 = Word2Vec(allline,
            sg=1,
            seed=1,
            workers=num_workers,
            
            min_count=min_word_count,
            window=context,
            sample=downsampling)
model_file = os.path.join(model_dir,'1895.model')
model_1.save(model_file)

'./Congress/model/1999.model'

In [132]:
len(alldata)
from gensim.models import Word2Vec

# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 3    # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

model_dir = "./Congress/model/"

model_1 = Word2Vec(allline,
            sg=1,
            seed=1,
            workers=num_workers,
            
            min_count=min_word_count,
            window=context,
            sample=downsampling)
model_file = os.path.join(model_dir,'1895.model')
model_1.save(model_file)

In [153]:
pairs = [
    ('president', 'king'),   # a minivan is a kind of car
    ('president', 'law'),   # still a wheeled vehicle
    ('president', 'air'),  # ok, no wheels, but still a vehicle
    ('president', 'car'),    # ... and so on
    ('president', 'marriage'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, model_1.wv.similarity(w1, w2)))

'president'	'king'	0.59
'president'	'law'	0.30
'president'	'air'	0.41
'president'	'car'	0.59
'president'	'marriage'	0.54


In [154]:
print(model_1.wv.doesnt_match(['fire', 'water', 'land', 'sea', 'air', 'car']))

land


In [155]:
import pandas as pd
import os
import sys
import gensim
from gensim.models import Word2Vec
import json

pro_dir = './Congress'
data_dir = "./Congress/hein-bound/"
text_dir = "./Congress/hein-bound_parsed/"
model_dir = "./Congress/model/"

dfresults = pd.read_csv(os.path.join(pro_dir,'congressyear.csv'), dtype={'filenum': str} )

year = 1874

model_file = os.path.join(model_dir,str(year)+'.model')
if not os.path.exists(model_file):
    df_new = dfresults.loc[dfresults['year']==year]
    allline = []
    for filenum in list(set(df_new['filenum'])):
        print(filenum)
        infile = f'./Congress/hein-bound_parsed/speeches_{filenum}.jsonlist'
        with open(infile, 'r') as f:
            # Read the lines from the file
            lines = f.readlines()
            
            for line in lines:
                jline = json.loads(line)
                if int(jline['id']) in df_new['speech_id'].to_list():
                    # print(int(jline['id']))
                    # print(jline['tokens'])
                    allline.extend(jline['tokens'])
    # Set values for various parameters
    num_features = 100    # Word vector dimensionality
    min_word_count = 1    # Minimum word count
    num_workers = 4       # Number of threads to run in parallel
    context = 10          # Context window size
    downsampling = 1e-3   # Downsample setting for frequent words


    model_1 = Word2Vec(allline,
                sg=1,
                seed=1,
                workers=num_workers,

                min_count=min_word_count,
                window=context,
                sample=downsampling)
    model_file = os.path.join(model_dir,str(year)+'.model')
    model_1.save(model_file)
else:
	print(str(model_file), " existed")


043


In [162]:
# List the files in the folder
files = os.listdir(text_dir)

# Loop through the files in the folder
for file in files:
    # Get the full file path
    file_path = os.path.join(text_dir, file)

    # Get the file size
    file_size = os.path.getsize(file_path)

    # Convert the file size from bytes to MB
    file_size_mb = file_size / 1048576

    # Print the file name and size in MB
    print(f'{file}: {file_size_mb:.2f} MB')

speeches_043.txt: 440.00 MB
speeches_044.txt: 414.86 MB
speeches_045.txt: 410.46 MB
speeches_046.txt: 503.45 MB
speeches_047.txt: 604.26 MB
speeches_048.txt: 414.42 MB
speeches_049.txt: 498.79 MB
speeches_070.txt: 449.59 MB
speeches_060.txt: 381.74 MB
speeches_050.txt: 532.83 MB
speeches_080.txt: 752.65 MB
speeches_061.txt: 689.08 MB
speeches_100.txt: 1636.66 MB
speeches_071.txt: 766.32 MB
speeches_051.txt: 682.34 MB
speeches_090.txt: 1629.30 MB
speeches_081.txt: 1066.31 MB
speeches_052.txt: 409.06 MB
speeches_072.txt: 648.54 MB
speeches_062.txt: 821.20 MB
speeches_053.txt: 666.70 MB
speeches_082.txt: 782.93 MB
speeches_073.txt: 559.05 MB
speeches_101.txt: 1584.55 MB
speeches_054.txt: 418.15 MB
speeches_091.txt: 1800.18 MB
speeches_074.txt: 706.30 MB
speeches_063.txt: 1166.57 MB
speeches_055.txt: 548.31 MB
speeches_083.txt: 1037.30 MB
speeches_056.txt: 443.93 MB
speeches_075.txt: 648.56 MB
speeches_102.txt: 1644.02 MB
speeches_064.txt: 756.07 MB
speeches_057.txt: 404.49 MB
speeches_092

In [2]:
jline['id']

NameError: name 'jline' is not defined