# Manifold Learning / Embedding

Here we apply selected dimensionality reduction techniques to the one-hot encoded data.

In [1]:
from sklearn.manifold import Isomap, LocallyLinearEmbedding, MDS, SpectralEmbedding, TSNE
from sklearn.decomposition import PCA

import pandas as pd
import numpy as np

# Embed data

First, define the embedding algorithms to use as a list of dictionaries, each containing a short name, the estimator class to use, and the keyword arguments passed to its constructor.

In [2]:
# Shared settings: seed for the estimators that accept one, and the
# number of output dimensions requested from every embedding.
random_state = 1776
n_components = 3

# Constructor keyword arguments, one dict per estimator.
# Isomap is the only entry configured without a random_state.
pca_args = {'random_state': random_state, 'n_components': n_components}
isomap_args = {'n_components': n_components}
mds_args = {'random_state': random_state, 'n_components': n_components}
se_args = {'n_neighbors': 10, 'random_state': random_state, 'n_components': n_components}
tsne_args = {'init': 'pca', 'random_state': random_state, 'n_components': n_components}

# Each entry: 'name' is the column prefix used for the output,
# 'func' is the estimator class, 'args' are its constructor kwargs.
embeddings = [
    {'name': 'PCA', 'func': PCA, 'args': pca_args},
    {'name': 'ISOMAP', 'func': Isomap, 'args': isomap_args},
    {'name': 'MDS', 'func': MDS, 'args': mds_args},
    {'name': 'SE', 'func': SpectralEmbedding, 'args': se_args},
    {'name': 'TSNE', 'func': TSNE, 'args': tsne_args},
]

The algorithms are then applied to each congressional session individually. The results for all sessions are aggregated to a single dataframe.

In [3]:
pairs = [ # (Congress, Session)
    (115, 2017),
    (114, 2016),
    (114, 2015),
    (113, 2014),
    (113, 2013),
    (112, 2012),
    (112, 2011),
    (111, 2010),
    (111, 2009)
]

# One combined frame (congress/session columns + all embeddings) per session.
df_list = []
for congress, session in pairs:
    print('Processing {}-{}'.format(congress, session))
    # Per-session input table; its first column becomes the row index.
    # Kept under its own name so it is not confused with the final `df`.
    df_onehot = pd.read_table('data/onehot/congress{}_session{}.tsv'.format(congress, session), index_col=0)

    ## Create congress/session columns (scalars are broadcast over the index)
    first_frame = pd.DataFrame(
        {
            'congress': congress,
            'session': session
        },
        index = df_onehot.index
    )

    ## Calculate each embedding and add to a list of df's
    df_list_embeddings = [first_frame]
    for e in embeddings:

        name = e['name']
        func = e['func']
        args = e['args']

        algorithm = func(**args)
        embedding = algorithm.fit_transform(df_onehot)

        # Name the columns <name>1 .. <name>k from the width of the returned
        # array rather than assuming exactly three components, so changing
        # n_components neither raises IndexError nor drops components.
        columns = ['{}{}'.format(name, i + 1) for i in range(embedding.shape[1])]
        df_e = pd.DataFrame(embedding, index = df_onehot.index, columns = columns)

        df_list_embeddings.append(df_e)
        print('\tFinished {}'.format(name))

    ## Merge all frames for this congress/session (side by side, same index)
    df_session = pd.concat(df_list_embeddings, axis = 1)
    print('Final shape: {}\n'.format(df_session.shape))
    df_list.append(df_session)

## Merge for all sessions (stack the per-session frames row-wise)
df = pd.concat(df_list, axis = 0)
print('Final shape for all sessions & embeddings: {}'.format(df.shape))

Processing 115-2017
	Finished PCA
	Finished ISOMAP
	Finished MDS
	Finished SE
	Finished TSNE
Final shape: (542, 17)

Processing 114-2016
	Finished PCA
	Finished ISOMAP
	Finished MDS
	Finished SE
	Finished TSNE
Final shape: (538, 17)

Processing 114-2015
	Finished PCA
	Finished ISOMAP
	Finished MDS




	Finished SE
	Finished TSNE
Final shape: (537, 17)

Processing 113-2014
	Finished PCA
	Finished ISOMAP
	Finished MDS




	Finished SE
	Finished TSNE
Final shape: (539, 17)

Processing 113-2013
	Finished PCA
	Finished ISOMAP
	Finished MDS
	Finished SE
	Finished TSNE
Final shape: (542, 17)

Processing 112-2012
	Finished PCA
	Finished ISOMAP
	Finished MDS
	Finished SE
	Finished TSNE
Final shape: (541, 17)

Processing 112-2011
	Finished PCA
	Finished ISOMAP
	Finished MDS
	Finished SE
	Finished TSNE
Final shape: (540, 17)

Processing 111-2010
	Finished PCA
	Finished ISOMAP
	Finished MDS
	Finished SE
	Finished TSNE
Final shape: (551, 17)

Processing 111-2009
	Finished PCA
	Finished ISOMAP
	Finished MDS
	Finished SE
	Finished TSNE
Final shape: (550, 17)

Final shape for all sessions & embeddings: (4880, 17)


## Meta info

Now we take the dataframe with all embeddings and merge it with the metadata frame.

In [6]:
# Load the legislator metadata table; its first column becomes the index.
meta_path = 'data/meta/meta_info.tsv'
df_meta = pd.read_table(meta_path, index_col=0)

# Report the table size, then preview the first rows as the cell output.
print(df_meta.shape)
df_meta.head()

(12406, 57)


Unnamed: 0_level_0,bio.birthday,bio.gender,bio.religion,family,id.ballotpedia,id.bioguide,id.bioguide_previous,id.cspan,id.fec,id.google_entity_id,...,chamber2012,party2011,state2011,chamber2011,party2010,state2010,chamber2010,party2009,state2009,chamber2009
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B000944,1952-11-09,M,Lutheran,,Sherrod Brown,B000944,,5051.0,"['H2OH13033', 'S6OH00163']",kg:/m/034s80,...,sen,Democrat,OH,sen,Democrat,OH,sen,Democrat,OH,sen
C000127,1958-10-13,F,Roman Catholic,,Maria Cantwell,C000127,,26137.0,"['S8WA00194', 'H2WA01054']",kg:/m/01x68t,...,sen,Democrat,WA,sen,Democrat,WA,sen,Democrat,WA,sen
C000141,1943-10-05,M,Jewish,,Ben Cardin,C000141,,4004.0,"['H6MD03177', 'S6MD03177']",kg:/m/025k3k,...,sen,Democrat,MD,sen,Democrat,MD,sen,Democrat,MD,sen
C000174,1947-01-23,M,Presbyterian,,Tom Carper,C000174,,663.0,['S8DE00079'],kg:/m/01xw7t,...,sen,Democrat,DE,sen,Democrat,DE,sen,Democrat,DE,sen
C001070,1960-04-13,M,,,"Bob Casey, Jr.",C001070,,47036.0,['S6PA00217'],kg:/m/047ymw,...,sen,Democrat,PA,sen,Democrat,PA,sen,Democrat,PA,sen


There is a minor inconvenience in the downloaded data: in some years one form of ID code is used for senators, and in other years a different form is used. We handle each
case separately and then concatenate the results to get the total. Note that, across all years, there are still ambiguous ID codes for 6 of the 4880 embedded rows (4880 rows before merging vs. 4874 after). We exclude these cases from the final results since the number is negligible.

In [7]:
# Match embedding rows to metadata in two passes, one per ID scheme:
#   1) embedding index against the metadata index,
#   2) embedding index against the metadata 'id.lis' column.
df_bioguide = df.merge(df_meta, how = 'inner', left_index = True, right_index = True)
df_lis = df.merge(df_meta, how = 'inner', left_index = True, right_on = 'id.lis')

# Stack both sets of matches and persist the result as a TSV file.
df_all = pd.concat([df_bioguide, df_lis], axis = 0)
df_all.to_csv('data/all.tsv', sep = '\t', encoding = 'utf-8')

# Report how many rows each step kept.
shape_report = (
    ('\tOriginal: {}', df),
    ('\tBioguide: {}', df_bioguide),
    ('\tLIS: {}', df_lis),
    ('\tCombined: {}', df_all),
)
print('Shape of')
for template, frame in shape_report:
    print(template.format(frame.shape))
df_all.head()

Shape of
	Original: (4880, 17)
	Bioguide: (3956, 74)
	LIS: (918, 74)
	Combined: (4874, 74)


Unnamed: 0,congress,session,PCA1,PCA2,PCA3,ISOMAP1,ISOMAP2,ISOMAP3,MDS1,MDS2,...,chamber2012,party2011,state2011,chamber2011,party2010,state2010,chamber2010,party2009,state2009,chamber2009
A000014,111,2010,17.886417,-0.364373,2.622794,78.772331,-3.869093,35.18005,-21.465385,6.633417,...,rep,Democrat,HI,rep,Democrat,HI,rep,Democrat,HI,rep
A000014,111,2009,-9.76414,-11.426675,0.14099,20.383217,-52.312587,0.142307,-0.879846,-20.393423,...,rep,Democrat,HI,rep,Democrat,HI,rep,Democrat,HI,rep
A000022,112,2012,-4.062458,14.678786,0.137008,56.733401,-0.09208,-3.762569,-1.029217,10.211547,...,rep,Democrat,NY,rep,Democrat,NY,rep,Democrat,NY,rep
A000022,112,2011,-3.890014,20.377974,-1.725114,53.018176,-40.21742,24.310566,-7.12026,4.194647,...,rep,Democrat,NY,rep,Democrat,NY,rep,Democrat,NY,rep
A000022,111,2010,-8.424168,-8.607798,0.026182,-13.160872,-30.183936,-3.222374,2.727669,-13.902509,...,rep,Democrat,NY,rep,Democrat,NY,rep,Democrat,NY,rep
