In [None]:
import os
import pandas as pd
import numpy as np
import json

In [None]:
import os
from google.colab import drive
# Mount Google Drive inside the Colab VM so the data folder can be read as a local path.
drive.mount('/content/drive')

# Setup: in the Google Drive web UI, right-click the shared congress data folder and add a
# shortcut to it under "My Drive/Colab Notebooks" so that the path below resolves.
# Listing the folder is a quick sanity check that the mount and the shortcut both work.
!ls "/content/drive/My Drive/Colab Notebooks/DSBA-6156-Congress-Data"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
100.json  103.json  106.json  109.json	112.json  115.json  93.zip   96.zip   99.zip
100.zip   103.zip   106.zip   109.zip	112.zip   115.zip   94.json  97.json  all_bios.csv
101.json  104.json  107.json  110.json	113.json  116.json  94.zip   97.zip   all_bios.gsheet
101.zip   104.zip   107.zip   110.zip	113.zip   116.zip   95.json  98.json  all_bios.json
102.json  105.json  108.json  111.json	114.json  117.zip   95.zip   98.zip   bio_map.json
102.zip   105.zip   108.zip   111.zip	114.zip   93.json   96.json  99.json


In [None]:
# Congress sessions to load, one JSON file per session.
# NOTE: range() excludes its upper bound, so this reads 93.json .. 115.json;
# raise the bound if later sessions (e.g. 116.json) should be included.
groups = range(93, 116)  # Adjust the range to match your files
data_by_year = {}

for group in groups:
    filename = f'/content/drive/My Drive/Colab Notebooks/DSBA-6156-Congress-Data/{group}.json'
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        data_by_year[group] = json.load(f)

# Bill fields copied unchanged into the table, in output column order.
# Every bill record is expected to carry all of them (a missing key raises KeyError).
bill_fields = [
    'congress',
    'number',
    'bill_type',
    'intro_date',
    'title',
    'summary',
    'policy_area',
    'subjects',
    'sponsor',
    'cosponsors',
    'actions',
]

# One DataFrame per loaded file, keyed by the session number used to load it.
dataframes = {}
for year, data in data_by_year.items():
    df_rows = [{field: bill[field] for field in bill_fields} for bill in data]
    dataframes[year] = pd.DataFrame(df_rows)

# Stack the per-session frames into a single table. ignore_index=True already
# produces a fresh 0..n-1 index, so no separate index reset is needed.
df = pd.concat(dataframes.values(), ignore_index=True)

# Optional: record which file (session key) each row was loaded from. The keys
# are repeated in the same order the frames were concatenated above.
df['Congress'] = [year for year, frame in dataframes.items() for _ in range(len(frame))]

# Function to expand dictionary columns

def expand_column(df, column_name):
    """Expand a column of dicts, or of lists of dicts, into its own DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the nested column.
    column_name : str
        Name of the column to expand.

    Returns
    -------
    pandas.DataFrame
        * Dict column: one row per row of ``df`` (carrying ``df``'s index) and
          one column per flattened dict key. Rows whose value is not a dict
          (e.g. None) come out as all-NaN rows.
        * List column: one row per dict across all lists, in row order, with a
          fresh 0..n-1 index. Rows whose value is not a list contribute
          nothing. The link back to the originating row of ``df`` is not kept.
        * Column with no rows: an empty DataFrame.

    Raises
    ------
    ValueError
        If the column has rows but none of them holds a dict or a list.
    """
    values = df[column_name].tolist()
    if not values:
        # Nothing to inspect; avoid the IndexError an .iloc[0] probe would raise.
        return pd.DataFrame()

    # Decide the column's shape from the first dict/list value rather than
    # from row 0, so a leading null does not misclassify the whole column.
    sample = next((v for v in values if isinstance(v, (dict, list))), None)

    if isinstance(sample, dict):
        # Substitute {} for non-dict entries so every input row still yields
        # exactly one output row.
        records = [v if isinstance(v, dict) else {} for v in values]
        expanded_df = pd.json_normalize(records)
        # Keep the caller's index so results can be assigned back row-for-row.
        expanded_df.index = df.index
    elif isinstance(sample, list):
        # Flatten all per-row lists into one long list of dicts.
        concatenated_dicts = []
        for dicts in values:
            if isinstance(dicts, list):
                concatenated_dicts.extend(dicts)
        expanded_df = pd.DataFrame(concatenated_dicts)
    else:
        raise ValueError("Column does not contain dictionaries or lists of dictionaries")

    return expanded_df

# Flatten the nested 'sponsor' dicts into their own frame
sponsor_df = expand_column(df, 'sponsor')

# Replace each sponsor dict with the sponsor's 'name', append the sponsor's
# 'state' as a new column, then discard the nested columns not kept in the
# flat table.
df = (
    df
    .assign(sponsor=sponsor_df['name'], state=sponsor_df['state'])
    .drop(columns=['cosponsors', 'actions'])
)
df.head(5)

Unnamed: 0,congress,number,bill_type,intro_date,title,summary,policy_area,subjects,sponsor,Congress,state
0,93,2009,s,1973-06-18,"A bill to amend the Antidumping Act of 1921, a...",Provides that whenever the Secretary of the Tr...,Imports,"[Countervailing duties, Imports]","Fannin, Paul J.",93,AZ
1,93,4114,s,1974-10-09,A bill to authorize the President to reduce Fe...,Emergency Budget Control Act - Authorizes the ...,Federal budgets,"[Economics and public finance, Federal budgets...","Roth Jr., William V.",93,DE
2,93,225,s,1973-01-04,A bill to authorize the project for the Days C...,"Authorizes the project for the Days Creek Dam,...",Dams,"[Dams, Flood control, Oregon, Rivers]","Hatfield, Mark O.",93,OR
3,93,3969,s,1974-09-06,A bill to provide for a study of the feasibili...,Requires a one-year study of the feasibility o...,"Old age, survivors and disability insurance","[Old age, survivors and disability insurance, ...","Pell, Claiborne",93,RI
4,93,3344,s,1974-04-10,A bill to authorize appropriations for activit...,(LATEST SUMMARY) National Science Foundation A...,"Science, technology, communications","[Appropriations, National Science Foundation, ...","Kennedy, Edward M.",93,MA


In [None]:
# Number of rows in the combined table
df.shape[0]

248860

In [None]:
# Turn each row's list of subjects into indicator columns: glue the list into
# one '|'-delimited string, then split on that same delimiter into 0/1 columns.
subjects_onehot = (
    df['subjects']
    .str.join('|')
    .str.get_dummies(sep='|')
)

# Swap the raw 'subjects' column for its indicator columns
df_encoded = pd.concat(
    [df.drop(columns='subjects'), subjects_onehot],
    axis=1,
)
df_encoded

In [None]:
# Expand 'actions' column
# `df` no longer carries 'actions' (it is dropped when the flat bill table is
# built), so take the per-bill action lists straight from the loaded JSON
# records, in the same order the bills were loaded.
actions_source = pd.Series(
    [bill['actions'] for bills in data_by_year.values() for bill in bills],
    name='actions',
).to_frame()

# One row per individual action across all bills
actions_df = expand_column(actions_source, 'actions')
actions_df

Unnamed: 0,action_date,action_text,action_type,action_status,action_code
0,1973-06-18,Referred to Senate Committee on Finance.,referral,REFERRED,
1,1974-10-09,Ordered held at desk.,action,,
2,1973-01-04,Referred to Senate Committee on Public Works.,referral,REFERRED,
3,1974-09-06,Referred to Senate Committee on Finance.,referral,REFERRED,
4,1974-04-10,Referred to Senate Committee on Labor and Publ...,referral,REFERRED,
...,...,...,...,...,...
960189,2017-10-26,Referred to the House Committee on Veterans' A...,referral,REFERRED,H11100
960190,2017-10-26,Referred to the Subcommittee on Economic Oppor...,referral,,
960191,2018-02-07,Introduced in House,action,,Intro-H
960192,2018-02-07,Referred to the House Committee on Energy and ...,referral,REFERRED,H11100
