In [2]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import seaborn as sns
import itertools
%matplotlib inline

In [4]:
ls

gephi_congressional_votes.ipynb


# Step 1
Import the congressional voting data, assign short names to the voting topics, and encode the votes numerically: yes = 1, no or unknown (`?`) = 0.

In [3]:
# Load the raw roll-call file (it has no header row) and label the columns:
# the party first, followed by one short name per voting topic.
votes = pd.read_csv(
    '/Users/sarah/ds/metis/challenges/mcnulty/house_votes.csv',
    header=None,
    names=[
        'party', 'handicapped', 'water', 'budget', 'physician', 'elsalvador',
        'religious', 'satellite', 'nicaraguan', 'missile', 'immigration',
        'synfuels', 'education', 'superfund', 'crime', 'exports', 'southafrica',
    ],
)
# Encode the ballots numerically: 'y' -> 1, while both 'n' and '?' (unknown) -> 0.
votes = votes.replace({'n': 0, 'y': 1, '?': 0})

In [6]:
# Total the yes votes (encoded as 1) per party for every topic and write the
# summary table to votes_by_party.csv in the working directory.
votes.groupby('party').sum().to_csv('votes_by_party.csv')

# Step 2
Build a data set in which the vote topics (plus the two party indicators) are the nodes, then prepare the nodes and edges CSV files for import into Gephi.

In [164]:
# Replace the text 'party' column with two 0/1 indicator columns, one per party,
# working on a copy so the original `votes` frame stays untouched.
votes_tag = votes.copy()
for party_name in ('democrat', 'republican'):
    votes_tag[party_name] = (votes_tag.party == party_name).astype('int64')
votes_tag = votes_tag.drop('party', axis=1)

In [165]:
# Inspect column dtypes and non-null counts after the party column was replaced by indicators.
votes_tag.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 435 entries, 0 to 434
Data columns (total 18 columns):
handicapped    435 non-null int64
water          435 non-null int64
budget         435 non-null int64
physician      435 non-null int64
elsalvador     435 non-null int64
religious      435 non-null int64
satellite      435 non-null int64
nicaraguan     435 non-null int64
missile        435 non-null int64
immigration    435 non-null int64
synfuels       435 non-null int64
education      435 non-null int64
superfund      435 non-null int64
crime          435 non-null int64
exports        435 non-null int64
southafrica    435 non-null int64
democrat       435 non-null int64
republican     435 non-null int64
dtypes: int64(18)
memory usage: 64.6 KB


In [166]:
# Every column of votes_tag (the vote topics and the two party indicators) becomes
# a node in the network graph. Each unordered pair of columns becomes a candidate
# edge, representing a representative scoring 1 (yes / member) on both columns.
vote_topics = list(votes_tag.columns)
topic_pairs = itertools.combinations(vote_topics, 2)
tag_combos = list(topic_pairs)

In [167]:
# Peek at the first ten (topic, topic) pairs to confirm their shape.
tag_combos[:10]

[('handicapped', 'water'),
 ('handicapped', 'budget'),
 ('handicapped', 'physician'),
 ('handicapped', 'elsalvador'),
 ('handicapped', 'religious'),
 ('handicapped', 'satellite'),
 ('handicapped', 'nicaraguan'),
 ('handicapped', 'missile'),
 ('handicapped', 'immigration'),
 ('handicapped', 'synfuels')]

In [168]:
# For every topic pair (edge) add a column keyed by the pair itself. Since all
# inputs are 0/1, the product is 1 only for representatives who have a 1 on both
# columns of the pair and 0 otherwise.
for topic_a, topic_b in tag_combos:
    votes_tag[(topic_a, topic_b)] = votes_tag[topic_a].mul(votes_tag[topic_b])

In [169]:
# Preview the first rows; the pair columns now follow the original topic columns.
votes_tag.head()

Unnamed: 0,handicapped,water,budget,physician,elsalvador,religious,satellite,nicaraguan,missile,immigration,...,"(crime, exports)","(crime, southafrica)","(crime, democrat)","(crime, republican)","(exports, southafrica)","(exports, democrat)","(exports, republican)","(southafrica, democrat)","(southafrica, republican)","(democrat, republican)"
0,0,1,0,1,1,1,0,0,0,1,...,0,1,0,1,0,0,0,0,1,0
1,0,1,0,1,1,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,1,1,0,1,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,1,1,0,1,1,0,0,0,0,...,1,1,1,0,1,1,0,1,0,0


In [170]:
# Nodes for Gephi: one entry per topic, with the Series index serving as node id.
nodes = pd.Series(vote_topics)

# Lookup from topic label to node id, used later to translate the edge
# endpoints into Source/Target ids that match the nodes file.
nodes_dict_id = {label: node_id for node_id, label in zip(nodes.index, nodes)}

# Gephi expects the node table columns to be named 'Id' and 'Label'.
nodes_df = (
    nodes.to_frame(name='Label')
    .reset_index()
    .rename(columns={'index': 'Id'})
)

In [180]:
# Preview the node table (Id / Label columns) before exporting it.
nodes_df.head()

Unnamed: 0,Id,Label
0,0,handicapped
1,1,water
2,2,budget
3,3,physician
4,4,elsalvador


In [None]:
# Export nodes file for Gephi import.
# index=False keeps the DataFrame index out of the file; the 'Id' column already
# carries the node ids.
nodes_df.to_csv('votes_nodes.csv', index = False)

In [173]:
# Display the topic label -> node id lookup to verify it matches the node table.
nodes_dict_id

{'budget': 2,
 'crime': 13,
 'democrat': 16,
 'education': 11,
 'elsalvador': 4,
 'exports': 14,
 'handicapped': 0,
 'immigration': 9,
 'missile': 8,
 'nicaraguan': 7,
 'physician': 3,
 'religious': 5,
 'republican': 17,
 'satellite': 6,
 'southafrica': 15,
 'superfund': 12,
 'synfuels': 10,
 'water': 1}

In [176]:
# Create edges file to import into Gephi.
# The first len(vote_topics) columns of votes_tag are the individual topics; every
# column after them is a (topic, topic) pair column, so the column sums are the
# number of representatives scoring 1 on both topics of each pair (edge weight).
# Positional .iloc is used because the .ix indexer was deprecated and later
# removed from pandas; the offset is derived from vote_topics instead of a
# hard-coded 18.
edges = votes_tag.iloc[:, len(vote_topics):].sum()
edges_weights = edges.to_frame(name='Weight').reset_index()
edges_weights.columns = ['Label', 'Weight']
# Each Label is a (source_topic, target_topic) tuple; split it into its endpoints.
edges_weights['Source_Label'] = edges_weights.Label.apply(lambda x: x[0])
edges_weights['Target_Label'] = edges_weights.Label.apply(lambda x: x[1])
# Translate topic labels into the node ids used in the nodes file.
edges_weights['Source'] = edges_weights.Source_Label.apply(lambda x: nodes_dict_id.get(x))
edges_weights['Target'] = edges_weights.Target_Label.apply(lambda x: nodes_dict_id.get(x))

In [179]:
# Preview the edge table: pair label, weight, endpoint labels and endpoint node ids.
edges_weights.head()

Unnamed: 0,Label,Weight,Source_Label,Target_Label,Source,Target
0,"(handicapped, water)",88,handicapped,water,0,1
1,"(handicapped, budget)",151,handicapped,budget,0,2
2,"(handicapped, physician)",34,handicapped,physician,0,3
3,"(handicapped, elsalvador)",54,handicapped,elsalvador,0,4
4,"(handicapped, religious)",77,handicapped,religious,0,5


In [None]:
# Export the topic-to-topic edge table for Gephi import (DataFrame index omitted).
edges_weights.to_csv('votes_edges.csv', index = False)

# Step 3
Build a data set in which the individual voters are the nodes, then prepare the nodes and edges CSV files for import into Gephi.

In [183]:
# Start the voters-as-nodes data set from an independent copy of the encoded
# votes, so later changes to `voters` do not touch `votes`.
voters = votes.copy()

In [184]:
# Inspect column dtypes and non-null counts of the copied frame.
voters.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 435 entries, 0 to 434
Data columns (total 17 columns):
party          435 non-null object
handicapped    435 non-null int64
water          435 non-null int64
budget         435 non-null int64
physician      435 non-null int64
elsalvador     435 non-null int64
religious      435 non-null int64
satellite      435 non-null int64
nicaraguan     435 non-null int64
missile        435 non-null int64
immigration    435 non-null int64
synfuels       435 non-null int64
education      435 non-null int64
superfund      435 non-null int64
crime          435 non-null int64
exports        435 non-null int64
southafrica    435 non-null int64
dtypes: int64(16), object(1)
memory usage: 61.2+ KB


In [185]:
# Create nodes file for export to gephi.
# The row index becomes the voter's node 'Id'. The frame is rebuilt from `votes`
# (of which `voters` was a plain copy) rather than from `voters` itself, so that
# re-running this cell does not keep stacking extra index columns.
voters = votes.reset_index().rename(columns={'index': 'Id'})
# The node table only needs the id and the party; the columns are selected by
# name instead of through the removed positional .ix indexer.
voters_nodes = voters[['Id', 'party']]

In [186]:
# Preview the voter node table (Id and party).
voters_nodes.head()

Unnamed: 0,Id,party
0,0,republican
1,1,republican
2,2,democrat
3,3,democrat
4,4,democrat


In [215]:
# Export the voter node table for Gephi import (DataFrame index omitted; 'Id' carries the node ids).
voters_nodes.to_csv('voters_nodes.csv', index = False)

In [188]:
# Preview the full voter frame, which now includes the 'Id' column.
voters.head()

Unnamed: 0,Id,party,handicapped,water,budget,physician,elsalvador,religious,satellite,nicaraguan,missile,immigration,synfuels,education,superfund,crime,exports,southafrica
0,0,republican,0,1,0,1,1,1,0,0,0,1,0,1,1,1,0,1
1,1,republican,0,1,0,1,1,1,0,0,0,0,0,1,1,1,0,0
2,2,democrat,0,1,1,0,1,1,0,0,0,0,1,0,1,1,0,0
3,3,democrat,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0,1
4,4,democrat,1,1,1,0,1,1,0,0,0,0,1,0,1,1,1,1


In [189]:
# Keep only the vote columns and flip the frame: one column per voter, one row
# per vote topic. This is the layout used to build the voter-to-voter edges.
voters_cols = voters.drop(['party', 'Id'], axis=1).T

In [190]:
# Preview the transposed frame: topics as rows, voter ids as columns.
voters_cols.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,425,426,427,428,429,430,431,432,433,434
handicapped,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,1,0,0,0,0,0
water,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
budget,0,0,1,1,1,1,0,0,0,1,...,1,1,0,0,1,1,1,0,0,0
physician,1,1,0,0,0,0,1,1,1,0,...,0,0,1,0,0,1,0,1,1,1
elsalvador,1,1,1,0,1,1,1,1,1,0,...,0,0,1,0,0,1,0,1,1,1


In [195]:
# Enumerate every unordered pair of voter ids; each pair is a candidate edge.
voters_list = voters_cols.columns.tolist()
voter_pairs = itertools.combinations(voters_list, 2)
voter_combos = list(voter_pairs)

In [199]:
# Peek at the first ten (voter, voter) pairs.
voter_combos[:10]

[(0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (0, 6),
 (0, 7),
 (0, 8),
 (0, 9),
 (0, 10)]

In [200]:
# For every voter pair add a column keyed by the pair itself: per topic, the
# product of the two 0/1 vote columns is 1 only when both voters voted yes.
for voter_a, voter_b in voter_combos:
    voters_cols[(voter_a, voter_b)] = voters_cols[voter_a].mul(voters_cols[voter_b])

In [202]:
# Preview the frame again; the voter-pair columns now follow the individual voter columns.
voters_cols.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,"(430, 431)","(430, 432)","(430, 433)","(430, 434)","(431, 432)","(431, 433)","(431, 434)","(432, 433)","(432, 434)","(433, 434)"
handicapped,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
water,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
budget,0,0,1,1,1,1,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
physician,1,1,0,0,0,0,1,1,1,0,...,0,1,1,1,0,0,0,1,1,1
elsalvador,1,1,1,0,1,1,1,1,1,0,...,0,1,1,1,0,0,0,1,1,1


In [217]:
# Create edges file to import into Gephi, restricting the file to only voter
# pairs that had at least one common yes vote.
# The first len(voters) columns of voters_cols are the individual voters; every
# column after them is a (voter, voter) pair column, so the column sums count the
# topics on which both voters voted yes (edge weight).
# Positional .iloc is used because the .ix indexer was deprecated and later
# removed from pandas; the offset is derived from the number of voters instead
# of a hard-coded 435.
edges_voters = voters_cols.iloc[:, len(voters):].sum()
edges_weights_voters = edges_voters.to_frame(name='Weight').reset_index()
edges_weights_voters.columns = ['Label', 'Weight']
# Each Label is a (source_voter, target_voter) tuple of voter ids.
edges_weights_voters['Source'] = edges_weights_voters.Label.apply(lambda x: x[0])
edges_weights_voters['Target'] = edges_weights_voters.Label.apply(lambda x: x[1])
edges_weights_voters['Type'] = 'Undirected'
# Drop pairs with no shared yes vote.
edges_weights_voters = edges_weights_voters[edges_weights_voters['Weight'] > 0]

In [218]:
# Export the voter-to-voter edge table for Gephi import (DataFrame index omitted).
edges_weights_voters.to_csv('voters_edges.csv', index = False)

In [None]:
# Create new columns conditional upon other columns:
# 'Normalized' is Budget * 0.78125 where Currency == '$', otherwise Budget unchanged.
# NOTE(review): `df` and its 'Currency'/'Budget' columns are not defined anywhere in
# the visible notebook, and this cell has never been executed. It looks like a
# leftover snippet unrelated to the voting analysis and would raise NameError on
# "Run All" unless `df` is defined elsewhere. Confirm and remove or relocate.
df['Normalized'] = np.where(df['Currency'] == '$', df['Budget'] * 0.78125, df['Budget'])