In [36]:
import pandas as pd
import os
import numpy as np
from sklearn.cluster import KMeans
import sklearn.decomposition as skl
import pyecharts as pch

In [37]:
# Directory that holds Records.txt (read by preprocess via a relative path).
# Set the VOTING_DATA_DIR environment variable to run the notebook on another
# machine; the fallback is the original author's local path.
DATA_DIR=os.environ.get(
    'VOTING_DATA_DIR',
    '/Users/chenzhikai/AnacondaProjects/1984 Voting Records and the Third Way/data/')
os.chdir(DATA_DIR)

In [38]:
def normalize(arr):
    """Scale the entries of ``arr`` so that they sum to 1.

    Parameters
    ----------
    arr : pandas Series or numpy array
        Any iterable of numbers that also exposes ``.sum()``.

    Returns
    -------
    list
        One value per entry of ``arr``: the entry divided by ``arr.sum()``.
        When the sum is zero there is nothing to scale by, so a list of
        ``0.0`` of the same length is returned instead of dividing by zero.
    """
    # Compute the total once instead of once per element.
    total = float(arr.sum())
    if total == 0:
        return [0.0 for _ in arr]
    return [n / total for n in arr]

In [39]:

def preprocess():
    """Load ``Records.txt`` and build the tables used for clustering.

    The file is read from the current working directory, without a header
    row; its first column is the label and the remaining 16 columns are the
    votes. A vote of ``'y'`` is encoded as 1 and every other value (``'n'``,
    ``'?'``, empty) as 0.

    Returns
    -------
    pivot : DataFrame
        One row per sample (index named ``'Sample'``), one column per issue.
        Each row is divided by its own sum, so a sample's 'yes' votes share
        a total weight of 1. Rows without any 'yes' vote are all NaN.
    SampFeat : DataFrame
        Long-form table with columns ``'Sample'``, ``'Feature'`` and
        ``'value'``, restricted to the rows where ``value == 1``.
    label : Series
        The label column of the file, indexed by sample position.
    """
    df = pd.read_csv('Records.txt', header=None)
    df.columns = ['label', 'handicapped-infants', 'water-project-cost-sharing',
                  'adoption-of-the-budget-resolution', 'physician-fee-freeze',
                  'el-salvador-aid', 'religious-groups-in-schools',
                  'anti-satellite-test-ban', 'aid-to-nicaraguan-contras',
                  'mx-missile', 'immigration', 'synfuels-corporation-cutback',
                  'education-spending', 'superfund-right-to-sue', 'crime',
                  'duty-free-exports', 'export-administration-act-south-africa']
    label = df['label']

    # 1 for a 'yes' vote, 0 for anything else; built without mutating df.
    votes = df.drop('label', axis=1).eq('y').astype(int)
    votes.index.name = 'Sample'

    SampFeat = (pd.melt(votes.reset_index(), id_vars='Sample', var_name='Feature')
                .sort_values('Sample')
                .reset_index(drop=True))
    SampFeat = SampFeat[SampFeat['value'] == 1]

    # Vectorised row normalisation. DataFrame.apply with a list-returning
    # function gives a Series of lists on pandas >= 0.23, which would break
    # every consumer that expects a DataFrame here.
    pivot = votes.div(votes.sum(axis=1), axis=0)
    return pivot, SampFeat, label

In [40]:
# Load the records once: pivot = row-normalised vote matrix, SampFeat = long-form
# table of 'yes' votes, label = the label column of the file.
pivot,SampFeat,label=preprocess()

  from ipykernel import kernelapp as app


In [48]:

class Cluster(object):
    """Cluster samples with K-Means and render the result with pyecharts.

    Parameters
    ----------
    SampFeat : DataFrame
        Long-form table with ``'Sample'``, ``'Feature'`` and ``'value'``
        columns.
    pivot : DataFrame
        One row per sample (the index holds the sample ids), one numeric
        column per feature. NaN entries are treated as 0 when clustering.
    numcluster : int
        Number of K-Means clusters.
    random_state : int or None, optional
        Forwarded to ``KMeans``. The default ``None`` keeps the original
        unseeded behaviour; pass an integer to make cluster ids repeatable.

    Attributes set by ``draw``
    --------------------------
    clusters : DataFrame with columns ``'Sample'`` and ``'cluster'``.
    pivotNorm : DataFrame indexed by (cluster, Sample), one column per
        feature, each row scaled to sum to 1.
    SampFeatNorm : long-form version of ``pivotNorm`` (columns ``'Sample'``,
        ``'Feature'``, ``'value'``) restricted to ``value > 0``.
    switch : DataFrame indexed by feature, ordered by how often the feature
        occurs in ``SampFeat``.
    """

    # Radar series colours; reused cyclically when there are more clusters
    # than colours.
    PALETTE = ['#d94e5d', '#4e79a7', '#eac763', '#50a3ba', '#f9713c', '#b3e4a1']

    def __init__(self, SampFeat, pivot, numcluster, random_state=None):
        self.SampFeat = SampFeat
        self.pivot = pivot
        self.numcluster = numcluster
        self.random_state = random_state

    @staticmethod
    def _color_for(i):
        """Return the palette colour for cluster ``i``, wrapping around."""
        return Cluster.PALETTE[i % len(Cluster.PALETTE)]

    def draw(self):
        """Fit K-Means, derive the per-cluster tables and write both charts.

        Writes 'Clustering people by stance on issues_radar.html' and
        'people issues_scatter.html' to the current working directory.
        """
        mat = self.pivot.fillna(0).values
        kmeans = KMeans(n_clusters=self.numcluster,
                        random_state=self.random_state).fit(mat)
        # Use the instance's own pivot (not a notebook-level global) so the
        # sample ids always match the matrix that was clustered.
        self.clusters = pd.DataFrame({'Sample': self.pivot.index,
                                      'cluster': kmeans.labels_})

        # Inspect the results of clustering: per cluster, pivot back to a
        # Sample x Feature table and scale every row to sum to 1.
        def normalise_group(group):
            wide = group.pivot(index='Sample', columns='Feature', values='value')
            # Vectorised division; apply() with a list-returning function
            # yields a Series of lists on pandas >= 0.23.
            return wide.div(wide.sum(axis=1), axis=0)

        grouped = pd.merge(self.SampFeat, self.clusters,
                           how='left', on='Sample').groupby('cluster')
        self.pivotNorm = grouped.apply(normalise_group)
        long_norm = self.pivotNorm.reset_index(level=1).melt(id_vars='Sample',
                                                             var_name='Feature')
        self.SampFeatNorm = long_norm[long_norm.value > 0]

        # 3-D scatter of the samples on their first three principal components.
        pca = skl.PCA(n_components=3).fit(mat)
        xyz = pca.transform(mat)
        scatter = pch.Scatter3D('Clustering people by stance on issues',
                                is_grid=True, width=1200, height=600)
        # self.clusters has a positional index, so each group's index selects
        # the matching rows of xyz.
        for cls, group in self.clusters.groupby('cluster'):
            # NOTE(review): 'grid_eight' looks like a typo for a height
            # option; kept as-is because the intended keyword cannot be
            # confirmed from this file.
            scatter.add('Cluster' + str(cls), xyz[group.index.values].tolist(),
                        grid_eight='75%', legend_top='10%', legend_pos="0%",
                        is_grid3D_rotate=True, grid3D_rotate_speed=40)

        freqFeat = self.SampFeat['Feature'].value_counts()
        self.switch = pd.DataFrame({'Feature': freqFeat, 'boo': True})

        # Radar chart: one axis per feature, all sharing a fixed maximum of 0.25.
        radar = pch.Radar('Radar Chart of ' + 'Party Affliation' + ' on ' + 'issues',
                          is_grid=True)
        features = list(self.switch.index)
        schema = [(feature, 0.25) for feature in features]
        radar.config(schema)

        present = set(self.pivotNorm.index.get_level_values(0))
        for i in range(self.numcluster):
            if i not in present:
                # No member of this cluster has a row in SampFeat, so there
                # is nothing to plot for it.
                continue
            # Put the columns in schema order so every value lands on the
            # axis that carries its feature name.
            data = self.pivotNorm.loc[i].reindex(columns=features).values.tolist()
            color = self._color_for(i)
            radar.add('Cluster ' + str(i), data, item_color=color,
                      legend_pos='0%', legend_top="10%",
                      is_area_show=True, area_color=color, area_opacity=0.5)

        radar.render('Clustering people by stance on issues_radar.html')
        scatter.render('people issues' + '_scatter.html')
        

In [53]:
# Six clusters, matching the six radar colours hard-coded in Cluster.draw.
a=Cluster(SampFeat,pivot,6)

In [54]:
# Fit K-Means and write the radar and 3-D scatter charts as HTML files
# (relative file names, so they land in the current working directory).
a.draw()

In [9]:
# Mean normalised weight over all (sample, feature) rows kept in SampFeatNorm,
# i.e. those with a value greater than 0.
a.SampFeatNorm.value.mean()

0.12686349020754223

In [10]:
# Analysis
# Put each sample's K-Means cluster id next to its original label so the two
# groupings can be compared row by row.
CluLab=pd.concat([a.clusters,label.to_frame('label')],axis=1)

In [27]:
# Count the original labels among the samples assigned to cluster 1.
CluLab.set_index('cluster').sort_index().loc[1,'label'].value_counts()

democrat      218
republican      6
Name: label, dtype: int64

In [14]:
# Samples labelled 'democrat' that K-Means assigned to cluster 0, the cluster
# this notebook reads as the Republican one (49 samples in the recorded run).
idx=CluLab.index[((CluLab.cluster==0)&CluLab.label.str.match('democrat')).values]
len(idx)

49

In [15]:
# The top issues of these Reagan Democrats: summed normalised weight per issue,
# largest first.
# Rows are selected with isin() rather than .loc[idx, :] because a sample
# without any 'yes' vote has no rows in SampFeatNorm, and .loc raises KeyError
# for missing labels on current pandas.
top_issues=(a.SampFeatNorm[a.SampFeatNorm['Sample'].isin(idx)]
            .groupby('Feature')['value'].sum()
            .sort_values(ascending=False))

In [16]:
# Row labels of CluLab for each original party label. CluLab's row labels
# coincide with the Sample ids, so these are used as sample ids further down.
dem=CluLab.index[CluLab.label.str.contains('democrat').values]
rep=CluLab.index[CluLab.label.str.contains('republican').values]

In [17]:
def issue_totals(samp_feat_norm, samples):
    """Sum the normalised weight per issue over the given sample ids.

    ``samp_feat_norm`` needs 'Sample', 'Feature' and 'value' columns. The
    result is a Series indexed by 'Feature', sorted from largest to smallest.
    Rows are selected with isin() instead of .loc because sample ids that have
    no rows in the table would make .loc raise KeyError on current pandas.
    """
    selected = samp_feat_norm[samp_feat_norm['Sample'].isin(samples)]
    return selected.groupby('Feature')['value'].sum().sort_values(ascending=False)

dem_issues=issue_totals(a.SampFeatNorm,dem)
rep_issues=issue_totals(a.SampFeatNorm,rep)


In [60]:
# Side-by-side issue rankings: each column lists the issues of one group from
# the largest to the smallest summed weight, so row i holds every group's
# i-th ranked issue. 'Traditional Democrats' is built from dem_issues, which
# covers every sample labelled democrat.
analysis=pd.DataFrame({'Traditional Democrats':dem_issues.index,
              'Democrats (Emerging Wing)':top_issues.index,
              'Republicans':rep_issues.index})

In [61]:
# Name the row index; rows run from the highest-ranked issue downwards.
analysis.index.name='Favored issues'

In [64]:
# Save the ranking table; the relative file name resolves against the current
# working directory.
analysis.to_csv('Topic issues.csv')

In [18]:
# Summed normalised weight per issue over all samples labelled democrat.
dem_issues

Feature
adoption-of-the-budget-resolution         30.485015
aid-to-nicaraguan-contras                 28.998399
anti-satellite-test-ban                   26.635004
mx-missile                                25.533319
export-administration-act-south-africa    21.624265
duty-free-exports                         20.767940
handicapped-infants                       20.417111
synfuels-corporation-cutback              15.744106
religious-groups-in-schools               15.186999
immigration                               14.719900
water-project-cost-sharing                14.669070
crime                                     10.888586
superfund-right-to-sue                     8.460376
el-salvador-aid                            6.908381
education-spending                         4.329773
physician-fee-freeze                       1.631757
Name: value, dtype: float64

In [19]:
# Summed normalised weight per issue over all samples labelled republican.
rep_issues

Feature
crime                                     21.203630
physician-fee-freeze                      21.125455
el-salvador-aid                           20.486133
religious-groups-in-schools               19.291689
superfund-right-to-sue                    17.529412
education-spending                        17.411891
export-administration-act-south-africa    11.275852
immigration                               11.036169
water-project-cost-sharing                 9.381421
anti-satellite-test-ban                    4.159185
handicapped-infants                        3.388900
aid-to-nicaraguan-contras                  2.524229
synfuels-corporation-cutback               2.315487
adoption-of-the-budget-resolution          2.215065
mx-missile                                 2.074254
duty-free-exports                          1.581227
Name: value, dtype: float64

In [35]:
# Summed normalised weight per issue over the democrat-labelled samples that
# were assigned to cluster 0.
top_issues

<bound method Series.item of Feature
religious-groups-in-schools               6.258777
crime                                     5.701634
el-salvador-aid                           5.550444
superfund-right-to-sue                    4.606396
synfuels-corporation-cutback              4.253619
water-project-cost-sharing                3.860761
adoption-of-the-budget-resolution         3.490920
export-administration-act-south-africa    2.990884
handicapped-infants                       2.797258
education-spending                        2.535329
immigration                               1.948821
duty-free-exports                         1.430988
physician-fee-freeze                      1.409535
anti-satellite-test-ban                   0.805988
mx-missile                                0.751166
aid-to-nicaraguan-contras                 0.607479
Name: value, dtype: float64>