In [2]:
import numpy as np
import pandas as pd
import glob
import os
import gzip
import json

## Who Leads Who Follows Paper

### Reading in txt files and converting to dataframe

In [3]:
def read_folder(folder_name, default_user_type=None):
    """Load every ``*.txt`` file in a folder into a one-row-per-line DataFrame.

    The part of each file name before the first ``.`` is split on its first
    underscore: the piece before it becomes ``date`` and the remainder becomes
    ``user_type`` (e.g. ``2014-03-04_media.txt`` -> ``2014-03-04`` / ``media``).

    Parameters
    ----------
    folder_name : str
        Directory that is searched (non-recursively) for ``*.txt`` files.
    default_user_type : str, optional
        When given, this value is used as ``user_type`` for every row instead
        of the value derived from the file name.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_type``, ``date`` and ``post_text``; one row per line of
        every file, with surrounding whitespace stripped from the text. The
        frame is empty (but has these columns) when no file matches.
    """
    print(folder_name)
    # glob yields paths in arbitrary, filesystem-dependent order; sorting makes
    # the row order of the resulting frame reproducible across runs/machines.
    all_files = sorted(glob.glob(f"{folder_name}/*.txt"))
    print("All files:", all_files)
    list_rows = []

    for filename in all_files:
        base_name = os.path.basename(filename).split('.')[0]
        # Everything before the first underscore is the date, the rest is the
        # user type (empty string when the name contains no underscore).
        date, _, name_suffix = base_name.partition('_')
        user_type = name_suffix if default_user_type is None else default_user_type

        # Explicit encoding so the result does not depend on the platform's
        # default locale; the file is streamed instead of loaded via readlines().
        with open(filename, 'r', encoding='utf-8') as f:
            list_rows.extend([user_type, date, line.strip()] for line in f)

    return pd.DataFrame(list_rows, columns=['user_type', 'date', 'post_text'])

# Derive every input directory from one shared root; os.path.join keeps the
# separators correct on whichever OS the notebook runs on.
base_folder = os.path.join('..', 'data', 'who_leads_who_follows')
media_folder, random_folder, text_folder, users_folder = (
    os.path.join(base_folder, subdir)
    for subdir in ('media_text', 'random_tweets', 'text', 'users')
)

In [4]:
# Load the four sources. The media and random folders get a fixed user type;
# for the text and users folders the user type is taken from each file name.
df_media = read_folder(media_folder, default_user_type='media')
df_random = read_folder(random_folder, default_user_type='random')
df_text = read_folder(text_folder)
df_users = read_folder(users_folder)

# Stack the four frames into one table with a fresh 0..n-1 index.
who_leads_df = pd.concat(
    (df_media, df_random, df_text, df_users),
    ignore_index=True,
)

../data/who_leads_who_follows/media_text
All files: ['../data/who_leads_who_follows/media_text/2014-03-04_media.txt', '../data/who_leads_who_follows/media_text/2014-01-01_media.txt', '../data/who_leads_who_follows/media_text/2013-07-18_media.txt', '../data/who_leads_who_follows/media_text/2013-09-05_media.txt', '../data/who_leads_who_follows/media_text/2013-10-28_media.txt', '../data/who_leads_who_follows/media_text/2013-08-03_media.txt', '../data/who_leads_who_follows/media_text/2014-02-02_media.txt', '../data/who_leads_who_follows/media_text/2014-06-01_media.txt', '../data/who_leads_who_follows/media_text/2014-04-04_media.txt', '../data/who_leads_who_follows/media_text/2014-07-07_media.txt', '../data/who_leads_who_follows/media_text/2014-05-02_media.txt', '../data/who_leads_who_follows/media_text/2014-10-23_media.txt', '../data/who_leads_who_follows/media_text/2014-05-16_media.txt', '../data/who_leads_who_follows/media_text/2014-07-13_media.txt', '../data/who_leads_who_follows/media_

In [4]:
# Persist the combined frame next to the raw input folders.
pickle_path = os.path.join(base_folder, 'cleaned_who_leads_df.pkl')
pd.to_pickle(who_leads_df, pickle_path)

## Tracing the Flow of Policy Ideas in Legislatures Paper

In [5]:
file_path = '../data/text_as_policy/dataverse_files(3)/state_bills.json.gz'

# The archive is read in text mode and treated as JSON Lines: every line is
# parsed as its own JSON document and collected in `data`.
data = []
with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    data.extend(map(json.loads, f))

In [6]:
# Build a single DataFrame from `data`, the list of JSON documents parsed from
# the gzipped file (presumably one row per document and one column per JSON
# key -- confirm against the data).
tracing_df = pd.DataFrame(data)

In [7]:
# Flatten both bill-text columns onto one line by replacing every newline
# character ("\n") with a single space.
for text_column in ('bill_document_first', 'bill_document_last'):
    tracing_df[text_column] = tracing_df[text_column].str.replace('\n', ' ')
tracing_df

Unnamed: 0,date_signed,date_introduced,bill_document_first,date_updated,short_title,bill_type,actions,summary,chamber,state,session,action_dates,unique_id,bill_document_last,date_created,bill_title,sponsers,bill_id,sunlight_id
0,,2014-01-23 00:00:00,SB2064 S...,2014-03-13 03:17:27,,[bill],"[{'date': '2014-01-23 00:00:00', 'action': 'Fi...",,upper,tn,108,"{'passed_upper': None, 'passed_lower': None, '...",tn_108_SB2064,SB2064 S...,2014-01-24 03:17:17,"Gas, Petroleum Products, Volatile Oils - As in...","[{'leg_id': 'TNL000009', 'type': 'primary', 'n...",SB 2064,TNB00016282
1,2013-05-14 00:00:00,2013-01-31 00:00:00,SB0745 S...,2014-08-26 04:09:00,,[bill],"[{'date': '2013-01-31 00:00:00', 'action': 'Fi...",,upper,tn,108,"{'passed_upper': '2013-04-04 00:00:00', 'passe...",tn_108_SB745,SB0745 S...,2013-02-01 01:28:36,"Physicians and Surgeons - As enacted, enacts ""...","[{'leg_id': 'TNL000021', 'type': 'primary', 'n...",SB 745,TNB00008271
2,2013-04-23 00:00:00,2013-01-31 00:00:00,SB0776 S...,2014-03-05 04:08:24,,[bill],"[{'date': '2013-01-31 00:00:00', 'action': 'Fi...",,upper,tn,108,"{'passed_upper': '2013-03-18 00:00:00', 'passe...",tn_108_SB776,SB0776 S...,2013-02-01 01:28:36,"Telecommunications - As enacted, repeals the T...","[{'leg_id': 'TNL000021', 'type': 'primary', 'n...",SB 776,TNB00008302
3,,2014-01-22 00:00:00,,2014-03-13 03:15:31,,[bill],"[{'date': '2014-01-22 00:00:00', 'action': 'Fi...",,upper,tn,108,"{'passed_upper': None, 'passed_lower': None, '...",tn_108_SB1937,,2014-01-23 03:11:18,"Cosmetology - As introduced, requires cosmetol...","[{'leg_id': 'TNL000016', 'type': 'primary', 'n...",SB 1937,TNB00015757
4,2014-05-16 00:00:00,2014-02-13 00:00:00,SB2596 010...,2014-08-26 04:09:09,,[bill],"[{'date': '2014-02-13 00:00:00', 'action': 'Fi...",,upper,tn,108,"{'passed_upper': '2014-04-10 00:00:00', 'passe...",tn_108_SB2596,SB2596 010...,2014-02-20 04:10:25,"Appropriations - As enacted, makes appropriati...","[{'leg_id': 'TNL000023', 'type': 'primary', 'n...",SB 2596,TNB00017284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
571549,,2012-03-29 00:00:00,SECOND REGULAR SESSION HOUSE BILL ...,2012-08-22 02:09:31,,[bill],"[{'date': '2012-03-29 00:00:00', 'action': 'In...",,lower,mo,2012,"{'passed_upper': None, 'passed_lower': None, '...",mo_2012_HB2094,SECOND REGULAR SESSION HOUSE BILL ...,2012-03-30 02:02:39,Changes the laws regarding physician assistants,"[{'leg_id': 'MOL000080', 'type': 'primary', 'n...",HB 2094,MOB00001708
571550,,2012-02-21 00:00:00,SECOND REGULAR SESSION HOUSE BILL ...,2012-08-22 02:09:26,,[bill],"[{'date': '2012-02-21 00:00:00', 'action': 'In...",,lower,mo,2012,"{'passed_upper': None, 'passed_lower': None, '...",mo_2012_HB1787,SECOND REGULAR SESSION HOUSE BILL ...,2012-03-07 01:45:15,Requires every merchant to develop and maintai...,"[{'leg_id': 'MOL000154', 'type': 'primary', 'n...",HB 1787,MOB00001282
571551,,2012-01-19 00:00:00,SECOND REGULAR SESSION HOUSE BILL ...,2012-10-29 02:16:34,,[bill],"[{'date': '2012-01-19 00:00:00', 'action': 'In...",,lower,mo,2012,"{'passed_upper': None, 'passed_lower': None, '...",mo_2012_HB1373,SECOND REGULAR SESSION [PERFECTED]...,2012-03-07 01:45:08,Authorizes a county to amend its budget twice ...,"[{'leg_id': 'MOL000037', 'type': 'primary', 'n...",HB 1373,MOB00000868
571552,,2012-01-04 00:00:00,SECOND REGULAR SESSION HOUSE BILL ...,2012-09-15 02:15:44,,[bill],"[{'date': '2012-01-04 00:00:00', 'action': 'In...",,lower,mo,2012,"{'passed_upper': None, 'passed_lower': None, '...",mo_2012_HB1142,SECOND REGULAR SESSION HOUSE BILL ...,2012-03-07 01:45:01,Changes the requirements for the use of privat...,"[{'leg_id': 'MOL000083', 'type': 'primary', 'n...",HB 1142,MOB00000637


In [9]:
# Reshape to long format: two rows per bill, one carrying the text of the
# first version and one carrying the text of the last version. Each copy keeps
# all original columns and gains a version label plus a shared `bill_text` column.
first_df = tracing_df.assign(
    last_or_first_bill_version='first',
    bill_text=tracing_df['bill_document_first'],
)
last_df = tracing_df.assign(
    last_or_first_bill_version='last',
    bill_text=tracing_df['bill_document_last'],
)

# Both halves share the original index labels, so a *stable* sort on the index
# interleaves them: each 'first' row is immediately followed by its 'last' row
# (first_df comes first in the concat, and stability preserves that order).
# 'mergesort' is the documented name of the stable algorithm; the previous
# value 'merge' is not among the documented options for `kind`.
final_tracing_df = (
    pd.concat([first_df, last_df])
    .sort_index(kind='mergesort')
    .reset_index(drop=True)
)

                 date_signed      date_introduced  \
0                       None  2014-01-23 00:00:00   
1                       None  2014-01-23 00:00:00   
2        2013-05-14 00:00:00  2013-01-31 00:00:00   
3        2013-05-14 00:00:00  2013-01-31 00:00:00   
4        2013-04-23 00:00:00  2013-01-31 00:00:00   
...                      ...                  ...   
1143103                 None  2012-01-19 00:00:00   
1143104                 None  2012-01-04 00:00:00   
1143105                 None  2012-01-04 00:00:00   
1143106                 None  2012-03-29 00:00:00   
1143107                 None  2012-03-29 00:00:00   

                                       bill_document_first  \
0                                          SB2064     S...   
1                                          SB2064     S...   
2                                          SB0745     S...   
3                                          SB0745     S...   
4                                          SB0776    

In [8]:
# Persist the long-format (first/last) bill table alongside the source data.
pickle_path = os.path.join('../data/text_as_policy/', 'cleaned_tracing_df.pkl')
pd.to_pickle(final_tracing_df, pickle_path)