# Clustering

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd 
import os
import xml.etree.ElementTree as et 

In [2]:
# read all files from one directory 
file_list = os.listdir('FT-data-DSpace/')
file_list

['EdixiXMLExport_20091.xml',
 'EdixiXMLExport_20101.xml',
 'EdixiXMLExport_20102.xml',
 'EdixiXMLExport_20111.xml',
 'EdixiXMLExport_20121.xml',
 'EdixiXMLExport_20131.xml',
 'EdixiXMLExport_20141.xml',
 'EdixiXMLExport_20142.xml',
 'EdixiXMLExport_20151.xml',
 'EdixiXMLExport_20161.xml']

In [3]:
for file in file_list:
    file_path = 'FT-data-DSpace/' + file
    print(file_path)

FT-data-DSpace/EdixiXMLExport_20091.xml
FT-data-DSpace/EdixiXMLExport_20101.xml
FT-data-DSpace/EdixiXMLExport_20102.xml
FT-data-DSpace/EdixiXMLExport_20111.xml
FT-data-DSpace/EdixiXMLExport_20121.xml
FT-data-DSpace/EdixiXMLExport_20131.xml
FT-data-DSpace/EdixiXMLExport_20141.xml
FT-data-DSpace/EdixiXMLExport_20142.xml
FT-data-DSpace/EdixiXMLExport_20151.xml
FT-data-DSpace/EdixiXMLExport_20161.xml


In [4]:
data = []

for file in file_list:
    xmlfile = 'FT-data-DSpace/' + file
    xtree = et.parse(xmlfile)
    xroot = xtree.getroot()
    for leaf in xroot:
        if leaf.findall('Møde'): ## if we have meetings in the leaf node
            for meeting in leaf:
                if meeting.tag == 'Møde':

                    meeting_id = meeting.find('MeetingId').text
                    meeting_date = meeting.find('DateOfSitting').text
                    meeting_number = meeting.find('Mødenummer').text
                    for agenda in meeting:
                        if agenda.tag == 'Dagsordenpunkt':
                            ### extract basic information about the meeting
                            try:
                                agenda_number = agenda.find('Punktnummer').text
                            except:
                                agenda_number = ''
                            try:
                                names = agenda.find('Mødetitel').findall('Exitus')
                                agenda_name = []
                                for name in names:
                                    agenda_name.append(name[0][0].text)
                                ## missing PreText
                            except Exception as e:
                                agenda_name = ''
                                #print(e)
                            try:
                                file_type = agenda.find('Sagstype').text
                            except:
                                file_type = ''
                            try:
                                file_number = agenda.find('Sagsnummer').text
                            except:
                                file_number = ''
                            try:
                                file_step = agenda.find('Sagstrin').text
                            except:
                                file_step = ''
                            ### extract speakers from the meeting
                            for speech in agenda.findall('Tale'):
                                try:
                                    person = speech.find('Navn').text
                                except:
                                    person = ''
                                try:
                                    role = speech.find('Rolle').text
                                except:
                                    role = ''
                                try:
                                    start = speech.find('Starttid').text
                                except:
                                    start = ''
                                try:
                                    end = speech.find('Sluttid').text
                                except:
                                    end = ''
                                try:
                                    text = speech.find('Tekst').text
                                except:
                                    text = ''
                                #print(person, role, start, end, text)
                                data.append([meeting_id, meeting_date, meeting_number, agenda_name, agenda_number,
                                          file_type, file_number, file_step, person, role, start, end, text])



df = pd.DataFrame(data,  columns=['ID', 'Date', 'MeetingNumber', 'AgendaName', 'AgendaNumber', 'FileType', 
                          'FileNumber', 'FileStep', 'SpeakerName', 'SpeakerRole', 'SpeakerStart',
                          'SpeakerEnd', 'Speech'])

###ALL DATA IS HERE
df.head(10)

Unnamed: 0,ID,Date,MeetingNumber,AgendaName,AgendaNumber,FileType,FileNumber,FileStep,SpeakerName,SpeakerRole,SpeakerStart,SpeakerEnd,Speech
0,CBAEA326-B201-41AB-A756-029BA1F58D82,2009-10-30T10:00:00,9. møde,[None],0,Formandens meddelelser,,,,formand,2009-10-30T10:00:18.063,2009-10-30T10:01:04.997,Mødet er åbnet.I dag er der følgende anmeldels...
1,CBAEA326-B201-41AB-A756-029BA1F58D82,2009-10-30T10:00:00,9. møde,[None],0,Formandens meddelelser,,,Mogens Lykketoft,formand,2009-10-30T11:32:26.200,2009-10-30T11:33:02.087,Der er ikke mere at foretage i dette møde.Folk...
2,CBAEA326-B201-41AB-A756-029BA1F58D82,2009-10-30T10:00:00,9. møde,"[1) 1. behandling af lovforslag nr. L 43:, For...",1,Lovforslag,43.0,BEH1,,formand,2009-10-30T10:01:04.997,2009-10-30T10:01:18.080,"Forhandlingen er åbnet, og den første, der får..."
3,CBAEA326-B201-41AB-A756-029BA1F58D82,2009-10-30T10:00:00,9. møde,"[1) 1. behandling af lovforslag nr. L 43:, For...",1,Lovforslag,43.0,BEH1,Peter Juel Jensen,medlem,2009-10-30T10:01:18.080,2009-10-30T10:02:53.290,Jeg holder ordførertalen for Venstre og på veg...
4,CBAEA326-B201-41AB-A756-029BA1F58D82,2009-10-30T10:00:00,9. møde,"[1) 1. behandling af lovforslag nr. L 43:, For...",1,Lovforslag,43.0,BEH1,,formand,2009-10-30T10:02:53.290,2009-10-30T10:03:07.543,Tak til hr. Peter Juel Jensen. Der er ikke øns...
5,CBAEA326-B201-41AB-A756-029BA1F58D82,2009-10-30T10:00:00,9. møde,"[1) 1. behandling af lovforslag nr. L 43:, For...",1,Lovforslag,43.0,BEH1,Niels Sindal,medlem,2009-10-30T10:03:07.543,2009-10-30T10:03:26.793,Socialdemokratiet kan tilslutte sig lovforslag...
6,CBAEA326-B201-41AB-A756-029BA1F58D82,2009-10-30T10:00:00,9. møde,"[1) 1. behandling af lovforslag nr. L 43:, For...",1,Lovforslag,43.0,BEH1,,formand,2009-10-30T10:03:26.793,2009-10-30T10:03:38.597,Tak til hr. Niels Sindal. Så er det hr. Henrik...
7,CBAEA326-B201-41AB-A756-029BA1F58D82,2009-10-30T10:00:00,9. møde,"[1) 1. behandling af lovforslag nr. L 43:, For...",1,Lovforslag,43.0,BEH1,Henrik Brodersen,medlem,2009-10-30T10:03:38.597,2009-10-30T10:04:24.267,"Jeg skal på Dansk Folkepartis vegne sige, at v..."
8,CBAEA326-B201-41AB-A756-029BA1F58D82,2009-10-30T10:00:00,9. møde,"[1) 1. behandling af lovforslag nr. L 43:, For...",1,Lovforslag,43.0,BEH1,,formand,2009-10-30T10:04:24.267,2009-10-30T10:04:36.837,Tak. Var det en venlig hilsen fra hr. Niels He...
9,CBAEA326-B201-41AB-A756-029BA1F58D82,2009-10-30T10:00:00,9. møde,"[1) 1. behandling af lovforslag nr. L 43:, For...",1,Lovforslag,43.0,BEH1,Høgni Hoydal,medlem,2009-10-30T10:04:36.837,2009-10-30T10:05:13.277,Jeg vil stille et spørgsmål til ordføreren for...


In [5]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering
from kmodes.kmodes import KModes
from sklearn.cluster import AgglomerativeClustering

In [6]:
data = df[['SpeakerRole', 'SpeakerName']]
data

Unnamed: 0,SpeakerRole,SpeakerName
0,formand,
1,formand,Mogens Lykketoft
2,formand,
3,medlem,Peter Juel Jensen
4,formand,
...,...,...
380878,minister,Anders Samuelsen
380879,formand,Henrik Dam Kristensen
380880,medlem,René Gade
380881,minister,Anders Samuelsen


In [7]:
X = pd.get_dummies(data)
X

Unnamed: 0,SpeakerRole_aldersformanden,SpeakerRole_formand,SpeakerRole_fungerende minister,SpeakerRole_medlem,SpeakerRole_midlertidig formand,SpeakerRole_minister,"SpeakerName_, fg.",SpeakerName_Aaja Chemnitz Larsen,SpeakerName_Aleqa Hammond,SpeakerName_Alex Ahrendtsen,...,SpeakerName_Ulla Sandbæk,SpeakerName_Ulla Tørnæs,SpeakerName_Vibeke Grave,SpeakerName_Villum Christensen,SpeakerName_Villy Søvndal,SpeakerName_Vivi Kier,SpeakerName_Yildiz Akdogan,SpeakerName_Zenia Stampe,SpeakerName_fg.,SpeakerName_Özlem Sara Cekic
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380878,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
380879,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
380880,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
380881,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## K-Means

In [16]:
kmeans = KMeans(n_clusters=6)
kmeans.fit(X.values)
y_kmeans = kmeans.predict(X)
centers = kmeans.cluster_centers_
shape = centers.shape

In [17]:
for i in range(shape[0]):
    if sum(centers[i,:]) == 0:
        print("\ncluster " + str(i) + ": ")
        print("no politician")
    else:
        print("\ncluster " + str(i) + ": ")
        cent = centers[i,:]
        for j in X.columns[np.nonzero(cent)]:
            print(j)


cluster 0: 
SpeakerRole_aldersformanden
SpeakerRole_formand
SpeakerRole_fungerende minister
SpeakerRole_medlem
SpeakerRole_midlertidig formand
SpeakerRole_minister
SpeakerName_, fg.
SpeakerName_Aaja Chemnitz Larsen
SpeakerName_Aleqa Hammond
SpeakerName_Alex Ahrendtsen
SpeakerName_Anders Johansson
SpeakerName_Anders Samuelsen
SpeakerName_Andreas Steenberg
SpeakerName_Ane Halsboe-Jørgensen
SpeakerName_Ane Halsboe-Larsen
SpeakerName_Anita Christensen
SpeakerName_Anita Knakkergaard
SpeakerName_Anna Kirsten Olesen
SpeakerName_Anne Baastrup
SpeakerName_Anne Grete Holmsgaard
SpeakerName_Anne Marie Geisler Andersen
SpeakerName_Anne Paulin
SpeakerName_Anne Sina
SpeakerName_Anne-Marie Meldgaard
SpeakerName_Anne-Mette Winther Christiansen
SpeakerName_Annette Lind
SpeakerName_Annette Vilhelmsen
SpeakerName_Anni Matthiesen
SpeakerName_Annika Smith
SpeakerName_Astrid Krag
SpeakerName_Benedikte Kiær
SpeakerName_Benny Engelbrecht
SpeakerName_Bent Bøgsted
SpeakerName_Bente Dahl
SpeakerName_Bente Kronb

SpeakerName_Sanne Rubinke
SpeakerName_Sara Olsvig
SpeakerName_Sarah Glerup
SpeakerName_Sarah Nørris
SpeakerName_Simon Emil Ammitzbøll
SpeakerName_Simon Kollerup
SpeakerName_Sisse Marie Welling
SpeakerName_Sjúrður Skaale
SpeakerName_Sofia Rossen
SpeakerName_Sofie Carsten Nielsen
SpeakerName_Sophie Hæstorp Andersen
SpeakerName_Sophie Løhde
SpeakerName_Steen Gade
SpeakerName_Steen Holm Iversen
SpeakerName_Steen Konradsen
SpeakerName_Stine Brix
SpeakerName_Susanne Eilersen
SpeakerName_Søren Egge Rasmussen
SpeakerName_Søren Espersen
SpeakerName_Søren Gade
SpeakerName_Søren Krarup
SpeakerName_Søren Pape Poulsen
SpeakerName_Søren Pind
SpeakerName_Søren Søndergaard
SpeakerName_Tage Leegaard
SpeakerName_Thomas Danielsen
SpeakerName_Thomas Jensen
SpeakerName_Thor Möger Pedersen
SpeakerName_Thor Pedersen
SpeakerName_Thyra Frank
SpeakerName_Tilde Bork
SpeakerName_Tina Nedergaard
SpeakerName_Tina Petersen
SpeakerName_Tom Behnke
SpeakerName_Torben Hansen
SpeakerName_Torsten Gejl
SpeakerName_Torsten 

SpeakerName_Kenneth Kristensen Berth
SpeakerName_Kim Andersen
SpeakerName_Kim Christiansen
SpeakerName_Kim Mortensen
SpeakerName_Kirsten Brosbøl
SpeakerName_Kirsten Normann Andersen
SpeakerName_Kisser Franciska Lehnert
SpeakerName_Klaus Hækkerup
SpeakerName_Klaus Markussen
SpeakerName_Knud Kristensen
SpeakerName_Kristen Touborg
SpeakerName_Kristian Hegaard
SpeakerName_Kristian Jensen
SpeakerName_Kristian Pihl Lorentzen
SpeakerName_Kristian Thulesen Dahl
SpeakerName_Kurt Scheelsbeck
SpeakerName_Lars Aslan Rasmussen
SpeakerName_Lars Barfoed
SpeakerName_Lars Christian Lilleholt
SpeakerName_Lars Dohn
SpeakerName_Lars Løkke Rasmussen
SpeakerName_Lars-Emil Johansen
SpeakerName_Laura Lindahl
SpeakerName_Lea Wermelin
SpeakerName_Leif Lahn Jensen
SpeakerName_Leif Mikkelsen
SpeakerName_Lene Espersen
SpeakerName_Lennart Damsbo-Andersen
SpeakerName_Linda Kristiansen
SpeakerName_Line Barfod
SpeakerName_Lisbeth Bech Poulsen
SpeakerName_Lise Bech
SpeakerName_Lise von Seelen
SpeakerName_Liselott Blixt

In [18]:
cols = X.columns
bt = X.apply(lambda x: x > 0)
nonzero = bt.apply(lambda x: list(cols[x.values]), axis=1)
nonzero

0                                     [SpeakerRole_formand]
1         [SpeakerRole_formand, SpeakerName_Mogens Lykke...
2                                     [SpeakerRole_formand]
3         [SpeakerRole_medlem, SpeakerName_Peter Juel Je...
4                                     [SpeakerRole_formand]
                                ...                        
380878    [SpeakerRole_minister, SpeakerName_Anders Samu...
380879    [SpeakerRole_formand, SpeakerName_Henrik Dam K...
380880          [SpeakerRole_medlem, SpeakerName_René Gade]
380881    [SpeakerRole_minister, SpeakerName_Anders Samu...
380882    [SpeakerRole_formand, SpeakerName_Henrik Dam K...
Length: 380883, dtype: object

In [19]:
kmeans.labels_

array([0, 0, 0, ..., 1, 3, 0])

In [20]:
labels = pd.Series(kmeans.labels_)
labels

0         0
1         0
2         0
3         1
4         0
         ..
380878    3
380879    0
380880    1
380881    3
380882    0
Length: 380883, dtype: int32

In [21]:
kmeans.cluster_centers_

array([[ 9.99837691e-17,  1.00000000e+00, -2.42286659e-15, ...,
        -2.38004061e-15,  7.99870153e-16,  2.49193027e-15],
       [ 4.66480075e-05, -1.58351110e-12,  1.63934426e-03, ...,
         3.78515261e-03,  9.59616154e-04,  4.91136879e-03],
       [-3.08319993e-18,  1.00000000e+00, -8.13151629e-17, ...,
         1.99926881e-16, -2.46655994e-17,  2.60208521e-18],
       [ 3.44911816e-17,  2.55240273e-13,  2.38849739e-16, ...,
        -7.00177763e-16,  2.75929453e-16, -1.63757896e-15],
       [ 8.45243584e-04,  9.99154756e-01, -1.53739868e-16, ...,
         3.92047506e-16,  5.59990422e-17,  2.81892565e-18],
       [-3.20517267e-18,  1.00000000e+00, -1.28152697e-16, ...,
         2.77989437e-16, -2.56413814e-17,  2.60208521e-18]])

In [22]:
clusters = pd.concat([labels, nonzero], axis = 1)
clusters

Unnamed: 0,0,1
0,0,[SpeakerRole_formand]
1,0,"[SpeakerRole_formand, SpeakerName_Mogens Lykke..."
2,0,[SpeakerRole_formand]
3,1,"[SpeakerRole_medlem, SpeakerName_Peter Juel Je..."
4,0,[SpeakerRole_formand]
...,...,...
380878,3,"[SpeakerRole_minister, SpeakerName_Anders Samu..."
380879,0,"[SpeakerRole_formand, SpeakerName_Henrik Dam K..."
380880,1,"[SpeakerRole_medlem, SpeakerName_René Gade]"
380881,3,"[SpeakerRole_minister, SpeakerName_Anders Samu..."


## K-Modes

In [15]:
# # define the k-modes model
# km = KModes(n_clusters=6, init='Huang', n_init=7, verbose=1)
# # fit the clusters to the skills dataframe
# clusters = km.fit_predict(X)
# # get an array of cluster modes
# kmodes = km.cluster_centroids_
# shape = kmodes.shape
# # For each cluster mode (a vector of "1" and "0")
# # find and print the column headings where "1" appears.
# # If no "1" appears, assign to "no-skills" cluster.
# for i in range(shape[0]):
#     if sum(kmodes[i,:]) == 0:
#         print("\ncluster " + str(i) + ": ")
#         print("no-skills cluster")
#     else:
#         print("\ncluster " + str(i) + ": ")
#         cent = kmodes[i,:]
#         for j in X.columns[np.nonzero(cent)]:
#             print(j)