In [1]:
import pyreadr
import pickle

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Preprocessed speech table; later cells filter it by the "congress",
# "committee_code2" and "govtrack" columns.
speech_data = pd.read_csv("data/interim/preprocessed_speech_1m.csv")

# Pickled LDA output; later cells index it by speech row position and sum the
# selected rows into per-member topic vectors.
# NOTE: pickle.load executes arbitrary code from the file, so only load
# pickles produced by this project.
lda_path = "data/interim/lda/1m_40_out.pickle"
with open(lda_path, "rb") as lda_file:
    lda = pickle.load(lda_file)


In [48]:

# Build one summed LDA topic vector per (congress, committee, member).
# Output columns: the three identifying keys followed by topic_0 .. topic_39.
columns = ["congress", "committee_code2", "govtrack"]
topics = ["topic_" + str(i) for i in range(40)]
columns.extend(topics)

# Rows are collected in a plain list and turned into a DataFrame once at the
# end. Concatenating onto an (initially empty) DataFrame for every member
# copies the whole frame on each iteration and relies on pandas' deprecated
# handling of empty entries in concat.
rows = []

for congress in speech_data["congress"].unique():

    # filter by congress
    data = speech_data[speech_data["congress"] == congress]

    # filter by committee (a NaN committee code matches no rows and is skipped)
    for committee in data["committee_code2"].unique():

        # select this committee's speeches once instead of once per member
        committee_speeches = data[data["committee_code2"] == committee]

        # unique member ids; dropna skips the few NaN ids that were not
        # cleared out in preprocessing
        committee_members = committee_speeches["govtrack"].dropna().unique()
        for member in committee_members:

            # row labels of the speeches this member gave in this committee
            speech_indices = committee_speeches.index[committee_speeches["govtrack"] == member].to_numpy()

            # fix off-by-one for merging with lda
            # NOTE(review): speech_data is read with the default 0-based index,
            # so label 0 becomes -1 here, which would select the LAST row of lda
            # if lda is a numpy array -- confirm the offset is really needed.
            speech_indices = speech_indices - 1

            # sum the lda rows of the speeches to get the member-committee topic vector
            member_lda = np.sum(lda[speech_indices], axis=0)

            rows.append([congress, committee, member, *member_lda])

committee_member_topic_vector = pd.DataFrame(rows, columns=columns)
committee_member_topic_vector.to_csv("data/interim/cosine_similarity/committee_member_topic_vector.csv")

committee_member_topic_vector

  committee_member_topic_vector = pd.concat([committee_member_topic_vector, df], ignore_index=True)


Unnamed: 0,congress,committee_code2,govtrack,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,topic_30,topic_31,topic_32,topic_33,topic_34,topic_35,topic_36,topic_37,topic_38,topic_39
0,105.0,HSGO,400370.0,15.373259,48.832767,29.497267,45.782071,9.762976,40.433059,134.143818,...,13.189415,69.396316,18.716273,9.808625,8.899720,319.820319,25.832019,102.688481,93.343865,12.350457
1,105.0,HSGO,400409.0,0.831803,6.741148,3.180852,4.278852,1.596537,3.984502,11.104069,...,2.061198,4.729946,1.157338,1.680351,1.315626,33.341808,5.234024,8.942163,11.822646,1.130560
2,105.0,HSGO,400506.0,13.789877,33.140710,6.416145,86.848920,32.408956,60.598376,65.786584,...,22.021183,49.744541,12.863472,13.725634,7.557203,155.807164,60.652807,13.811489,97.514167,21.659063
3,105.0,HSGO,408459.0,0.290816,3.237879,0.966131,1.773632,0.220074,1.319891,3.596687,...,0.225619,1.360627,0.193425,0.171879,0.319693,8.330383,2.168726,1.218421,2.576874,0.541891
4,105.0,HSGO,400357.0,4.995966,7.496153,3.729243,7.989488,3.586591,15.048056,17.852724,...,1.751897,16.590928,4.249577,1.547787,1.066028,49.039622,3.298147,32.800083,21.610789,2.293917
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7878,114.0,HLZI,412431.0,0.813689,0.255348,0.160967,0.545852,0.345621,15.593550,4.406780,...,0.346335,4.082794,0.148369,0.639590,3.702682,7.418487,7.190725,0.119983,6.732256,0.541623
7879,114.0,HLZI,400361.0,0.141531,0.134826,0.068427,0.386751,0.019726,4.828424,0.527384,...,0.150983,2.681315,0.195355,0.019726,1.115770,1.593359,1.250216,0.135850,1.705405,0.220164
7880,114.0,HLZI,400627.0,0.728932,0.655900,0.129575,1.070381,0.487712,10.725561,3.303954,...,0.090547,3.221258,0.406043,0.511329,4.400740,7.098118,3.928900,0.202545,6.012605,0.423365
7881,114.0,HLZI,400355.0,0.386265,0.477518,0.053251,0.784670,0.595569,5.815316,2.173034,...,0.053251,1.092792,0.108675,0.053251,1.442328,4.275186,2.422586,0.053251,1.088948,0.285135


In [29]:
# Create a cosine similarity matrix and a network for each (congress, committee).
# For every pair the cell writes:
#   data/interim/cosine_similarity/<congress>_<committee>_cos_sim.csv
#   data/interim/networks/<congress>_<committee>.graphml
# Nodes are members (with their attributes stored as strings); every pair of
# distinct members is joined by an edge carrying their cosine similarity.

committee_member_topic_vector = pd.read_csv("data/interim/cosine_similarity/committee_member_topic_vector.csv", index_col=0)

# Loop-invariant column lists, defined once instead of inside the loops.
data_cols = ["topic_" + str(i) for i in range(40)]
attrs = ["thomas_name", "govtrack", "powercmt", "security", "year", "minority", "unified", "minuni", "partyloyalty",
         "votepct100", "votepct_sq100", "seniority_rs", "seniority_sq_rs", "abs_dwnom1_rs", "dem", "freshman",
         "female", "leader", "polar", "polar_rs"]

for congress in committee_member_topic_vector["congress"].unique():

    # filter by congress
    data = committee_member_topic_vector[committee_member_topic_vector["congress"] == congress]

    # First speech row of every member in this congress. Node attributes are
    # looked up here so the full speech table is scanned once per congress
    # rather than once per member per committee.
    congress_member_attrs = (
        speech_data.loc[speech_data["congress"] == congress, attrs]
        .drop_duplicates(subset="govtrack")
    )

    # filter by committee; a NaN code could not be used in the output file names
    committees = data["committee_code2"].dropna().unique()
    for committee in committees:

        # one row per member of this committee, in order of first appearance
        committee_data = (
            data[data["committee_code2"] == committee]
            .dropna(subset=["govtrack"])
            .drop_duplicates(subset="govtrack")
        )
        committee_members = committee_data["govtrack"].tolist()

        G = nx.Graph()

        # add member and attributes as node to graph
        for member in committee_members:
            member_attrs = congress_member_attrs[congress_member_attrs["govtrack"] == member].iloc[0]
            # values are stringified, as before, for the GraphML export
            G.add_node(member, **{attr: str(value) for attr, value in member_attrs.items()})

        # All pairwise similarities in a single call. cosine_similarity returns
        # a 2-D array with one row/column per member, in committee_members order.
        if committee_members:
            similarity = cosine_similarity(committee_data[data_cols].to_numpy())
        else:
            similarity = np.empty((0, 0))

        # Build the matrix directly instead of filling it cell by cell with
        # cos_sim[a][b] = value: that chained assignment is not guaranteed to
        # write into the frame and does nothing under pandas Copy-on-Write.
        cos_sim = pd.DataFrame(similarity, index=committee_members, columns=committee_members)

        # The graph is undirected and the similarity is symmetric, so each
        # unordered pair needs to be added only once.
        for i, member_1 in enumerate(committee_members):
            for j in range(i + 1, len(committee_members)):
                G.add_edge(member_1, committee_members[j], cosine_similarity=similarity[i, j])

        cos_sim_out = "data/interim/cosine_similarity/" + str(int(congress)) + "_" + committee + "_cos_sim.csv"
        cos_sim.to_csv(cos_sim_out)

        G_out = "data/interim/networks/" + str(int(congress)) + "_" + committee + ".graphml"
        nx.write_graphml(G, G_out)

