Created by: Nicolas. Purpose: compute the political cosine similarity between states.
Modified by: Revekka. Purpose: also compute the political cosine similarity between federal and state data.

In [7]:
import pandas as pd 
import os 
from sklearn.metrics.pairwise import cosine_similarity
# os.chdir("/Users/nicolaslonguetmarx/Dropbox/StateLaws/")
# os.chdir("/Users/revekkagershovich/Dropbox (MIT)/StateLaws")

In [8]:
# Project root. NOTE(review): this is a machine-specific absolute path; other users must edit it.
parent_dir = os.path.abspath("/Users/revekkagershovich/Dropbox (MIT)/StateLaws")
# Validate BEFORE changing directory: os.chdir raises FileNotFoundError on a
# missing directory, so an assert placed after it could never fire.
assert os.path.exists(parent_dir), "parent_dir does not exist"
os.chdir(parent_dir)
# Relative to parent_dir, so this path is only meaningful after the chdir above.
data_dir = "./2_data/2_intermediate/political_data"
assert os.path.exists(data_dir), "Data directory does not exist"

In [9]:
# Read political_composition.csv from data_dir into the house_comp DataFrame.
composition_csv = os.path.join(data_dir, "political_composition.csv")
house_comp = pd.read_csv(composition_csv)

In [10]:
# Latest value of yr_rd2 present in the loaded data.
latest_year = house_comp["yr_rd2"].max()
print(latest_year)

2026


In [11]:
# Jupyter only renders a bare expression when it is the LAST statement of a cell.
# The sample used to come before print(), so its result was silently discarded.
# Print the distinct state codes first, then leave the sample as the final expression.
print(house_comp["state_abbrev"].unique())
house_comp.sample(5)

['AL' 'CT' 'DE' 'Fed' 'GA' 'IL' 'IN' 'KY' 'LA' 'MA' 'MD' 'ME' 'MO' 'MS'
 'NC' 'NH' 'NJ' 'NY' 'OH' 'PA' 'RI' 'SC' 'TN' 'VA' 'VT' 'AR' 'MI' 'FL'
 'IA' 'TX' 'WI' 'CA' 'MN' 'OR' 'KS' 'NV' 'WV' 'NE' 'CO' 'ID' 'MT' 'ND'
 'SD' 'WA' 'WY' 'UT' 'OK' 'AZ' 'NM' 'HI' 'AK']


In [12]:
# For every second year, build the pairwise cosine-similarity matrix between the
# rows of house_comp for that year and write it to one CSV per year.
# Each output has the columns state_abbrev, yr_rd2, then one similarity column
# per row (named by state_abbrev) that survived the NaN filtering.

all_sim_data = []  # NOTE(review): never filled in this cell; kept in case later cells rely on the name.

# Identifier columns: carried into the output but excluded from the similarity vectors.
id_cols = ['state_abbrev', 'yr_rd2']

# Create the output folder up front so to_csv cannot fail on a missing directory.
out_dir = "2_data/2_intermediate/political_data/cosinetemp"
os.makedirs(out_dir, exist_ok=True)

# NOTE(review): range() excludes its end value, so the last year processed is 2018.
# Confirm whether 2020 was meant to be included given the comment below.
for yr in range(1834, 2020, 2): # We don't have state data for after 2020
    year_data = house_comp[house_comp['yr_rd2'] == yr]  # Rows for the current year only
    # Count is taken BEFORE any NaN filtering, so it can exceed the rows actually compared.
    print(f"Number of states for year {yr} is {len(year_data)}")

    # First drop columns that are entirely missing this year, so that the row-wise
    # dropna() below does not discard every row because of them.
    year_data = year_data.dropna(axis=1, how='all')
    year_data = year_data.dropna()

    # Take the feature columns from the FILTERED frame. Taking them from
    # house_comp.columns raises KeyError as soon as a column was dropped above.
    list_col = [col for col in year_data.columns if col not in id_cols]

    # cosine_similarity rejects an empty matrix; skip such years instead of aborting the run.
    if year_data.empty or not list_col:
        print(f"Skipping year {yr}: no complete rows to compare")
        continue

    house_comp_val_only = year_data[list_col]  # Numeric feature matrix, one row per state
    similarity = cosine_similarity(house_comp_val_only, house_comp_val_only)  # Square pairwise matrix

    # Label the matrix columns by state, then attach the identifier columns on the
    # left by positional index (hence the reset_index on the identifiers).
    sim_data = pd.DataFrame(similarity, columns=list(year_data['state_abbrev']))
    sim_data = year_data[id_cols].reset_index(drop=True).merge(sim_data, left_index=True, right_index=True)
    sim_data.to_csv(os.path.join(out_dir, "political_composition_" + str(yr) + ".csv"), index=False)

Number of states for year 1834 is 25
Number of states for year 1836 is 26
Number of states for year 1838 is 26
Number of states for year 1840 is 25
Number of states for year 1842 is 25
Number of states for year 1844 is 27
Number of states for year 1846 is 30
Number of states for year 1848 is 31
Number of states for year 1850 is 31
Number of states for year 1852 is 32
Number of states for year 1854 is 32
Number of states for year 1856 is 32
Number of states for year 1858 is 33
Number of states for year 1860 is 33
Number of states for year 1862 is 34
Number of states for year 1864 is 37
Number of states for year 1866 is 36
Number of states for year 1868 is 38
Number of states for year 1870 is 38
Number of states for year 1872 is 38
Number of states for year 1874 is 38
Number of states for year 1876 is 39
Number of states for year 1878 is 39
Number of states for year 1880 is 39
Number of states for year 1882 is 39
Number of states for year 1884 is 39
Number of states for year 1886 is 39
N

In [13]:
# sim_data is reassigned on every loop iteration, so this shows the columns of the
# last year that was processed.
sim_data.columns

Index(['state_abbrev', 'yr_rd2', 'CA', 'CO', 'CT', 'FL', 'Fed', 'GA', 'ID',
       'IL', 'KS', 'KY', 'ME', 'MI', 'MN', 'MO', 'MS', 'NJ', 'NM', 'NV', 'OH',
       'OK', 'SD', 'TN', 'VA', 'WI', 'WY'],
      dtype='object')

In [14]:
# Show ten randomly chosen rows of the most recently built similarity table.
sim_data.sample(n=10)

Unnamed: 0,state_abbrev,yr_rd2,CA,CO,CT,FL,Fed,GA,ID,IL,...,NJ,NM,NV,OH,OK,SD,TN,VA,WI,WY
21,TN,2018,0.790724,0.889602,0.896377,0.986481,0.966491,0.990271,0.99823,0.919281,...,0.909465,0.924999,0.922307,0.995162,0.999901,0.997677,1.0,0.927301,0.98272,0.997031
24,WY,2018,0.750049,0.862991,0.872163,0.973127,0.945894,0.977861,0.999022,0.888853,...,0.876128,0.8958,0.890592,0.985206,0.996818,0.999077,0.997031,0.91659,0.967948,1.0
12,MN,2018,0.943667,0.98955,0.995094,0.941279,0.942939,0.938937,0.909904,0.930573,...,0.923777,0.933096,0.927306,0.933818,0.914764,0.908406,0.916367,0.996012,0.942662,0.899142
16,NM,2018,0.928409,0.93899,0.94082,0.973997,0.991508,0.968819,0.912129,0.999849,...,0.997536,1.0,0.996698,0.957803,0.923138,0.909539,0.924999,0.918042,0.978588,0.8958
9,KY,2018,0.847166,0.920405,0.925139,0.998287,0.989888,0.999596,0.987038,0.959332,...,0.952337,0.963297,0.961396,0.999772,0.992454,0.985757,0.992934,0.938288,0.997039,0.981357
0,CA,2018,1.0,0.972298,0.968536,0.862212,0.896422,0.854669,0.770493,0.931528,...,0.936505,0.928409,0.930689,0.838116,0.789136,0.766757,0.790724,0.912851,0.870146,0.750049
7,IL,2018,0.931528,0.938794,0.939976,0.970308,0.989413,0.964978,0.905544,1.0,...,0.998462,0.999849,0.997286,0.953487,0.917481,0.902819,0.919281,0.914017,0.975194,0.888853
19,OK,2018,0.789136,0.889181,0.895361,0.98529,0.965145,0.989441,0.997582,0.917481,...,0.908269,0.923138,0.921536,0.994675,1.0,0.996964,0.999901,0.925222,0.981409,0.996818
17,NV,2018,0.930689,0.942594,0.940473,0.968918,0.98802,0.965343,0.905315,0.997286,...,0.999151,0.996698,1.0,0.955336,0.921536,0.902184,0.922307,0.907591,0.973351,0.890592
20,SD,2018,0.766757,0.872044,0.88206,0.980015,0.955499,0.98347,0.999962,0.902819,...,0.889387,0.909539,0.902184,0.98911,0.996964,1.0,0.997677,0.925069,0.975606,0.999077
