<a href="https://colab.research.google.com/github/anytokin/anytokin/blob/main/Modern_Python_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from typing import Tuple, Iterable, Sequence, List, Dict, DefaultDict
from random import sample
from math import fsum, sqrt
from collections import defaultdict

def partial(func, *args):
    """Bind leading positional arguments to ``func``.

    A minimal rewrite of functools.partial() in a way that doesn't
    confuse mypy.  The returned callable accepts further positional
    arguments, which are passed to ``func`` after the bound ones.
    """
    bound = args

    def call_with_bound(*extra):
        return func(*bound, *extra)

    return call_with_bound

def mean(data: Iterable[float]) -> float:
    """Return the accurate arithmetic mean of ``data``.

    The input is materialized into a list first, so one-shot iterators
    (e.g. generators) are accepted.  Summation uses math.fsum() to avoid
    accumulating floating-point rounding error.
    """
    values = list(data)
    total = fsum(values)
    return total / len(values)

def transpose(matrix: Iterable[Iterable]) -> Iterable[tuple]:
    """Swap rows with columns for a 2-D array.

    Returns a lazy zip iterator that yields one tuple per column.
    """
    rows = tuple(matrix)
    return zip(*rows)

Point = Tuple[float, ...]
Centroid = Point

def dist(p: Point, q: Point, sqrt=sqrt, fsum=fsum, zip=zip) -> float:
    """Return the multi-dimensional euclidean distance between ``p`` and ``q``.

    ``sqrt``, ``fsum`` and ``zip`` are bound as default arguments so the
    body resolves them as fast local names.
    """
    squared_deltas = ((a - b) ** 2.0 for a, b in zip(p, q))
    return sqrt(fsum(squared_deltas))

def assign_data(centroids: Sequence[Centroid], data: Iterable[Point]) -> Dict[Centroid, Sequence[Point]]:
    """Group every data point under its closest centroid.

    Returns a plain dict mapping each centroid that attracted at least
    one point to the list of those points, in the order they were seen.
    Centroids that attract no points do not appear in the result.
    """
    groups: Dict[Centroid, List[Point]] = {}
    for point in data:
        # On ties min() returns the first such centroid in ``centroids``.
        nearest = min(centroids, key=lambda candidate: dist(point, candidate))
        groups.setdefault(nearest, []).append(point)
    return groups

def compute_centroids(groups: Iterable[Sequence[Point]]) -> List[Centroid]:
    """Compute the centroid of each group.

    The centroid is the per-dimension mean of the group's points; one
    centroid is returned per group, in the order the groups are given.
    """
    centroids: List[Centroid] = []
    for group in groups:
        # transpose() turns the points into per-dimension columns.
        column_means = (mean(column) for column in transpose(group))
        centroids.append(tuple(column_means))
    return centroids

def k_means(data: Iterable[Point], k:int=2, iterations:int=10) -> List[Point]:
    """Return k centroids for the data.

    The initial centroids are ``k`` points sampled at random from
    ``data``; they are then refined ``iterations`` times by assigning
    each point to its nearest centroid and moving every centroid to the
    mean of the points assigned to it.

    A centroid that attracts no points in a round is kept where it is
    instead of being dropped, so the result always has ``k`` entries.

    Raises ValueError (from random.sample) if ``k`` exceeds the number
    of data points.
    """
    points = list(data)
    centroids = sample(points, k)
    for _ in range(iterations):
        labeled = assign_data(centroids, points)
        # assign_data() only has keys for centroids that received points;
        # recomputing from labeled.values() alone would silently shrink
        # the centroid list below k.
        centroids = [
            compute_centroids([labeled[c]])[0] if c in labeled else c
            for c in centroids
        ]
    return centroids

def quality(labeled: Dict[Centroid, Sequence[Point]]) -> float:
    """Return the mean squared distance from each point to its assigned centroid.

    ``labeled`` maps a centroid to the points assigned to it, as
    produced by assign_data().
    """
    squared_distances = []
    for centroid, members in labeled.items():
        for member in members:
            squared_distances.append(dist(centroid, member) ** 2)
    return mean(squared_distances)


if __name__ == '__main__':

    from pprint import pprint

    # Demo 1: cluster a handful of 3-D points into two groups and show
    # which points ended up under which centroid.
    print('Simple example with six 3-D points clustered into two groups')
    points = [
        (10, 41, 23), (22, 30, 29), (11, 42, 5),
        (20, 32, 4), (12, 40, 12), (21, 36, 23),
    ]

    centroids = k_means(points, k=2)
    pprint(assign_data(centroids, points))

    # Demo 2: a 2-D dataset, clustered for k = 1..7 to show how the
    # quality score changes with the number of clusters.
    print('\nExample with a richer dataset.')
    print('See: https://www.datascience.com/blog/introduction-to-k-means-clustering-algorithm-learn-data-science-tutorials')

    data = [
        (10, 30), (12, 50), (14, 70),
        (9, 150), (20, 175), (8, 200), (14, 240),
        (50, 35), (40, 50), (45, 60), (55, 45),
        (60, 130), (60, 220), (70, 150), (60, 190), (90, 160),
    ]

    print('k     quality')
    print('-     -------')
    for k in range(1, 8):
        centroids = k_means(data, k, iterations=20)
        d = assign_data(centroids, data)
        print(f'{k}    {quality(d) :8,.1f}')

Simple example with six 3-D points clustered into two groups
{(13.25, 38.75, 11.0): [(10, 41, 23), (11, 42, 5), (20, 32, 4), (12, 40, 12)],
 (21.5, 33.0, 26.0): [(22, 30, 29), (21, 36, 23)]}

Example with a richer dataset.
See: https://www.datascience.com/blog/introduction-to-k-means-clustering-algorithm-learn-data-science-tutorials
k     quality
-     -------
1     5,583.5
2     1,337.8
3     1,202.2
4       666.6
5       517.9
6       399.8
7       597.3


In [1]:
%%bash
# Install the tools used by this notebook, one quiet pip call per package.
pip install pyflakes --quiet
pip install bottle --quiet
pip install pytest --quiet
pip install hypothesis --quiet

In [2]:
%%bash
# WARNING: this wipes everything in the current working directory
# before downloading, so only run it in a scratch directory.
rm -fr *
# Fetch the CSV export of each 2016 Senate roll-call vote listed below.
# The loader cell later picks the downloads up with glob("*csv*").
for vote in 163 161 159 157 155 154 152 151 143 141 129 127 123 116 \
            98 86 84 83 82 71 54 47 39 38 34 26 22 20; do
  wget "https://www.govtrack.us/congress/votes/114-2016/s${vote}/export/csv" --quiet
done

In [10]:
import csv
from collections import namedtuple, defaultdict, Counter
from pprint import pprint
import glob
#from kmeans import k_means, assign_data
NUM_OF_SENATORS = 100
Senator = namedtuple('Senator', ['name', 'party', 'state'])

# Load the votes from every downloaded CSV export and accumulate, per
# senator, one numeric entry per file (Yea=1, Nay=-1, Not Voting=0).
vote_value = {'Yea': 1, 'Nay': -1, 'Not Voting': 0}
accumulated_record = defaultdict(list)
for filename in glob.glob("*csv*"):
    # newline='' is what the csv module documentation asks for, so that
    # the reader does its own newline handling.
    # NOTE(review): the platform default encoding is used here; the
    # senator names contain non-ASCII characters -- confirm it is right.
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        # The first two rows are consumed before the data rows
        # (presumably a title line and the column headers).
        vote_topic = next(reader)
        headers = next(reader)
        for person, state, district, vote, name, party in reader:
            senator = Senator(name, party, state)
            accumulated_record[senator].append(vote_value[vote])

# Transform the record into a plain dict that maps each senator to a tuple of votes
record = {senator: tuple(votes) for senator, votes in accumulated_record.items()}

# Use k-means to locate the cluster centroids from the pattern of votes,
# then assign each vote history to the nearest cluster
centroids = k_means(record.values(), k=3)
clustered_votes = assign_data(centroids, record.values())

# Build a reverse mapping from a vote history to the list of senators who voted that way
votes_to_senators = defaultdict(list)
for senator, votehistory in record.items():
    votes_to_senators[votehistory].append(senator)
# Sanity check: every senator must appear exactly once in the reverse mapping
assert sum((len(cluster)) for cluster in votes_to_senators.values()) == NUM_OF_SENATORS

# Display the clusters and the members (senators) of each cluster
for i, votes_in_cluster in enumerate(clustered_votes.values(), start=1):
    print(f'=========== Voting Cluster #{i} ===========')
    party_totals = Counter()
    # set() collapses identical vote histories so that senators sharing
    # one history are listed (and counted) only once.
    for votes in set(votes_in_cluster):
        for senator in votes_to_senators[votes]:
            party_totals[senator.party] += 1
            print(senator)
    print(party_totals)


#pprint(accumulated_record, width=500)
# for centroid in centroids:
#   for x in centroid:
#     print(f'{x:.2f}', end =" ")
#   print()

Senator(name='Sen. Mike Rounds [R, 2015-2026]', party='Republican', state='SD')
Senator(name='Sen. Dean Heller [R, 2011-2018]', party='Republican', state='NV')
Senator(name='Sen. Mark Kirk [R, 2010-2016]', party='Republican', state='IL')
Senator(name='Sen. Richard Shelby [R, 1987-2022]', party='Republican', state='AL')
Senator(name='Sen. Thad Cochran [R, 1979-2018]', party='Republican', state='MS')
Senator(name='Sen. Pat Roberts [R, 1997-2020]', party='Republican', state='KS')
Senator(name='Sen. Jerry Moran [R, 2011-2022]', party='Republican', state='KS')
Senator(name='Sen. Robert “Bob” Casey Jr. [D, 2007-2024]', party='Democrat', state='PA')
Senator(name='Sen. Kelly Ayotte [R, 2011-2016]', party='Republican', state='NH')
Senator(name='Sen. Tom Cotton [R, 2015-2026]', party='Republican', state='AR')
Senator(name='Sen. Tammy Baldwin [D, 2013-2024]', party='Democrat', state='WI')
Senator(name='Sen. James Risch [R, 2009-2026]', party='Republican', state='ID')
Senator(name='Sen. Debbie Sta