In [41]:
import pandas as pd 
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

In [42]:
# Load the county-level clusters/voting table and discard the 'Unnamed: 0' column.
df = (
    pd.read_csv("ClustersCountiesVoting.csv")
    .drop(labels='Unnamed: 0', axis=1)
)

In [43]:
# One-hot encode each cluster-assignment column of `df`.
# For a source column named e.g. 'Age_Cluster', the dummy columns are added to
# `df` as 'Age_Cluster0', 'Age_Cluster1', ... where the numeric suffix is the
# POSITION of the dummy column in the get_dummies output (not necessarily the
# cluster label itself). The number of dummies per source column is printed.
for i in ['Current_Cluster',
          'Demographic_Cluster', 'Economic_Cluster', 'Owned_Cluster',
          'Age_Cluster', 'Education_Cluster', 'Housing_Cluster', 'Combined_Cluster']:
    cols = pd.get_dummies(df[i])
    for j, label in enumerate(cols.columns):
        # Assigning by name overwrites an existing column, so re-running the
        # cell does not create duplicates.
        df[i + str(j)] = cols[label]
    print(i, len(cols.columns))

Current_Cluster 6
Demographic_Cluster 7
Economic_Cluster 9
Owned_Cluster 7
Age_Cluster 5
Education_Cluster 6
Housing_Cluster 7
Combined_Cluster 10


In [44]:
# Binary winner flags: 1 where the per-point difference for that year is
# strictly positive, 0 otherwise (rows with a missing difference stay 0 because
# the comparison evaluates to False for NaN).
# `.loc` is used instead of `.ix`, which was deprecated in pandas 0.20 and
# removed in pandas 1.0.
df['Winner2016'] = 0
df.loc[df['per_point_diff_2016'] > 0, 'Winner2016'] = 1
df['Winner2012'] = 0
df.loc[df['per_point_diff_2012'] > 0, 'Winner2012'] = 1

# Subset of rows flagged 1 for 2016.
# NOTE(review): the name suggests a positive difference means a Democratic
# win -- confirm against how per_point_diff_* was computed.
dems = df[df['Winner2016'] == 1]

In [45]:
# Names of the one-hot cluster columns, grouped by the clustering they come
# from. Each list is "<prefix><k>" for k = 0 .. count-1.
demographic = ["Demographic_Cluster" + str(k) for k in range(7)]
economic = ["Economic_Cluster" + str(k) for k in range(9)]
# Single-column variant: only the first economic cluster dummy.
econ = ["Economic_Cluster0"]
education = ["Education_Cluster" + str(k) for k in range(6)]

In [84]:
def get_classifier_metrics(truth_2012, truth_2016, truth_2012_sub, truth_2016_sub, predict_maps: dict):
    """Score each set of predictions against the matching ground truth.

    The ground truth for an entry is chosen from its key:

    * key contains "2012" -> a 2012 truth vector, otherwise a 2016 one;
    * key contains "all"  -> the full-data truth vector, otherwise the
      ``*_sub`` (subset) one.

    Parameters
    ----------
    truth_2012, truth_2016 : array-like
        True labels for the full data set.
    truth_2012_sub, truth_2016_sub : array-like
        True labels for the subset.
    predict_maps : dict
        Maps a name such as ``"svc_2012_all"`` to the predicted labels.

    Returns
    -------
    dict
        Same keys as ``predict_maps``; each value is whatever
        ``precision_recall_fscore_support(truth, predicted, average="binary")``
        returns for that entry.
    """
    # Indexed by (key mentions 2012, key mentions "all").
    truth_for = {
        (True, True): truth_2012,
        (True, False): truth_2012_sub,
        (False, True): truth_2016,
        (False, False): truth_2016_sub,
    }
    return {
        name: precision_recall_fscore_support(
            truth_for[("2012" in name, "all" in name)], predicted, average="binary"
        )
        for name, predicted in predict_maps.items()
    }
    

In [110]:
def clusteringalgo_fscore(x, y, y2, df, df2):
    """Fit an SVC and a KNN classifier and print the third metric per scenario.

    Both models are trained on ``df[x]`` -> ``df[y]``. Their predictions for
    ``df`` ("all") and for ``df2`` ("sub") are then scored against both ``y``
    and ``y2``. For every scenario one line ``<name>  -  <value>`` is printed
    in sorted name order, where ``<value>`` is element 2 of the
    ``precision_recall_fscore_support`` result (the F-score).

    Parameters
    ----------
    x : list of str
        Predictor column names.
    y : str
        Response column the models are trained on. Results for it are
        labelled "2012" regardless of the column passed.
    y2 : str
        Secondary response column the same predictions are scored against.
        Results for it are labelled "2016".
    df : pandas.DataFrame
        Frame used both for training and for the "all" scores.
    df2 : pandas.DataFrame
        Frame used for the "sub" scores.

    Returns
    -------
    None
        Results are printed only.

    Notes
    -----
    The "all" scores are computed on the same rows the models were fitted on;
    there is no held-out split.
    """
    svc = SVC()
    knn = KNeighborsClassifier()
    # Train on the primary response only.
    svc.fit(df[x], df[y])
    knn.fit(df[x], df[y])

    # The predictors are the same for both responses, so each model needs to
    # predict only once per frame; the result is reused for y and y2.
    svc_all = svc.predict(df[x])
    knn_all = knn.predict(df[x])
    svc_sub = svc.predict(df2[x])
    knn_sub = knn.predict(df2[x])

    predictions = {
        "svc_2012_all": svc_all,
        "knn_2012_all": knn_all,
        "svc_2016_all": svc_all,
        "knn_2016_all": knn_all,
        "svc_2012_sub": svc_sub,
        "knn_2012_sub": knn_sub,
        "svc_2016_sub": svc_sub,
        "knn_2016_sub": knn_sub,
    }

    results = get_classifier_metrics(df[y], df[y2], df2[y], df2[y2], predictions)
    for k, result in sorted(results.items()):
        print(k, " - ", result[2])
    
    
    

In [111]:
def clusteringalgo(x, y, y2, df, df2):
    """Fit an SVC and a KNN classifier and print their accuracies.

    Both models are trained on ``df[x]`` -> ``df[y]`` and then scored (mean
    accuracy via ``.score``) on four combinations: ``df`` vs ``y``, ``df`` vs
    ``y2``, ``df2`` vs ``y`` and ``df2`` vs ``y2``.

    Parameters
    ----------
    x : list of str
        Predictor column names.
    y : str
        Response column the models are trained on.
    y2 : str
        Secondary response column the trained models are also scored against.
    df : pandas.DataFrame
        Original dataframe (training data and "overall" scores).
    df2 : pandas.DataFrame
        Subset dataframe ("subset" scores).

    Returns
    -------
    None
        Results are printed only.
    """
    model = SVC()
    model2 = KNeighborsClassifier()
    model.fit(df[x], df[y])
    model2.fit(df[x], df[y])

    # (message fragment, frame to score on, response column to score against)
    evaluations = [
        ("overall accuracy for training is", df, y),
        ("overall accuracy for predictive testing is", df, y2),
        ("subset accuracy for testing is", df2, y),
        ("subset accuracy for predictive testing is", df2, y2),
    ]
    for description, frame, target in evaluations:
        # The second model is a k-nearest-neighbours classifier, so it is
        # labelled "KNN" (the previous "KMeans" label named the wrong algorithm).
        for label, fitted in (("SVC", model), ("KNN", model2)):
            print(label + " " + description + str(fitted.score(frame[x], frame[target])))

In [112]:
# Module-level SVC instance. The helper functions in this notebook build their
# own classifiers, so this object is not what they fit.
model = SVC()

In [113]:
# Predictors: the six education-cluster dummies (same names as `education`).
# Subset: `dems`.
clusteringalgo_fscore(
    x=education,
    y='Winner2012',
    y2="Winner2016",
    df=df,
    df2=dems,
)

knn_2012_all  -  0.297684674752
knn_2012_sub  -  0.4332247557
knn_2016_all  -  0.411847672779
knn_2016_sub  -  0.460567823344
svc_2012_all  -  0.297684674752
svc_2012_sub  -  0.4332247557
svc_2016_all  -  0.411847672779
svc_2016_sub  -  0.460567823344


In [114]:
# Predictors: the six education-cluster dummies plus "White%". Subset: `dems`.
clusteringalgo_fscore(
    x=education + ["White%"],
    y='Winner2012',
    y2="Winner2016",
    df=df,
    df2=dems,
)

knn_2012_all  -  0.629370629371
knn_2012_sub  -  0.795918367347
knn_2016_all  -  0.668076109937
knn_2016_sub  -  0.786069651741
svc_2012_all  -  0.538538538539
svc_2012_sub  -  0.684283727399
svc_2016_all  -  0.626716604245
svc_2016_sub  -  0.679296346414


In [115]:
# Module-level KNN instance; the call below builds its own classifiers.
model2 = KNeighborsClassifier()

# Predictors: the seven demographic-cluster dummies followed by the six
# education-cluster dummies. Subset: `dems`.
clusteringalgo_fscore(
    x=demographic + education,
    y='Winner2012',
    y2="Winner2016",
    df=df,
    df2=dems,
)

knn_2012_all  -  0.594117647059
knn_2012_sub  -  0.841854934602
knn_2016_all  -  0.641996557659
knn_2016_sub  -  0.86643437863
svc_2012_all  -  0.543739279588
svc_2012_sub  -  0.737254901961
svc_2016_all  -  0.613636363636
svc_2016_sub  -  0.756687898089


In [116]:
# Predictors: the seven demographic-cluster dummies plus "MedianIncome".
# Subset: `dems`.
clusteringalgo_fscore(
    x=demographic + ["MedianIncome"],
    y='Winner2012',
    y2="Winner2016",
    df=df,
    df2=dems,
)

knn_2012_all  -  0.407294832827
knn_2012_sub  -  0.503987240829
knn_2016_all  -  0.403041825095
knn_2016_sub  -  0.491499227202
svc_2012_all  -  0.891935483871
svc_2012_sub  -  0.906542056075
svc_2016_all  -  0.74472168906
svc_2016_sub  -  0.885844748858


In [None]:
# Predictors: the nine economic-cluster dummies (same names as `economic`).
# Subset: `dems`.
clusteringalgo_fscore(
    x=economic,
    y="Winner2012",
    y2="Winner2016",
    df=df,
    df2=dems,
)

knn_2012_all  -  0.115384615385
knn_2012_sub  -  0.170542635659
knn_2016_all  -  0.164948453608
knn_2016_sub  -  0.179104477612
svc_2012_all  -  0.0340909090909
svc_2012_sub  -  0.05
svc_2016_all  -  0.0474308300395
svc_2016_sub  -  0.048


In [None]:
# Predictors: 49 raw (non-cluster) columns, listed in their original order.
# Subset: `dems`.
clusteringalgo_fscore(
    x=[
        "Pop2014",
        "Pop2010EST",
        "Pop%Change",
        "Pop2010",
        "Under5%",
        "Under18%",
        "Over65%",
        "Female%",
        "White%",
        "Black%",
        "NativeA%",
        "Asian%",
        "PIsland%",
        "TwoRace%",
        "Latino",
        "WhiteNLat%",
        "SameHouse1yr%",
        "ForeignBorn%",
        "NonEnglish%",
        "HighSchoolGrad%",
        "Bachelor%",
        "Veteran",
        "TravelTime",
        "HousingUnits",
        "OwnershipRate",
        "UnitsinMultiUnit%",
        "MedianValueHousing",
        "Households",
        "PerCapitaIncome",
        "MedianIncome",
        "%BelowPoverty",
        "PrivateNotFarmsEstablish",
        "PrivateNotFarmEmploy",
        "NotFarm%Change",
        "NonEmployerEstablish",
        "Black-owned%",
        "Native-owned%",
        "Asian-owned%",
        "PIslnder-owned%",
        "Hispanic-owned%",
        "Woman-owned",
        "Manfact shipments 1k",
        "Merchant sales 1k",
        "Retail sales 1k",
        "Retail sales/capita",
        "Accomd/FoodServ sales 1k",
        "BuildingPermits",
        "LandArea",
        "Pop/SqMile",
    ],
    y="Winner2012",
    y2="Winner2016",
    df=df,
    df2=dems,
)

In [131]:
# Predictors: "Female%" plus the eight race/ethnicity columns. Subset: `dems`.
clusteringalgo_fscore(
    x=[
        "Female%",
        "White%", "Black%", "NativeA%", "Asian%",
        "PIsland%", "TwoRace%", "Latino", "WhiteNLat%",
    ],
    y="Winner2012",
    y2="Winner2016",
    df=df,
    df2=dems,
)

knn_2012_all  -  0.682969432314
knn_2012_sub  -  0.826405867971
knn_2016_all  -  0.739176346357
knn_2016_sub  -  0.835322195704
svc_2012_all  -  0.751123090746
svc_2012_sub  -  0.877990430622
svc_2016_all  -  0.804371584699
svc_2016_sub  -  0.859813084112


In [None]:
# Predictors: "Female%" plus the eight race/ethnicity columns.
# Subset: rows of `df` whose Demographic_Cluster4 dummy is non-zero.
clusteringalgo_fscore(
    x=[
        "Female%",
        "White%", "Black%", "NativeA%", "Asian%",
        "PIsland%", "TwoRace%", "Latino", "WhiteNLat%",
    ],
    y="Winner2012",
    y2="Winner2016",
    df=df,
    df2=df[df['Demographic_Cluster4'] != 0],
)

In [None]:
# Predictors: "Female%" plus the eight race/ethnicity columns.
# Subset: rows of `df` whose Demographic_Cluster0 dummy is non-zero.
clusteringalgo_fscore(
    x=[
        "Female%",
        "White%", "Black%", "NativeA%", "Asian%",
        "PIsland%", "TwoRace%", "Latino", "WhiteNLat%",
    ],
    y="Winner2012",
    y2="Winner2016",
    df=df,
    df2=df[df['Demographic_Cluster0'] != 0],
)

In [None]:
# Predictors: "Female%" plus the eight race/ethnicity columns.
# Subset: rows of `df` whose Demographic_Cluster3 dummy is non-zero.
clusteringalgo_fscore(
    x=[
        "Female%",
        "White%", "Black%", "NativeA%", "Asian%",
        "PIsland%", "TwoRace%", "Latino", "WhiteNLat%",
    ],
    y="Winner2012",
    y2="Winner2016",
    df=df,
    df2=df[df['Demographic_Cluster3'] != 0],
)

In [None]:
# State abbreviations treated as swing states in the cells that follow.
swing_states = [
    "AZ", "FL", "GA", "NH",
    "NC", "CO", "MI", "NV",
    "VA", "PA", "WI", "OH",
]

In [None]:
# Rows of `df` whose state abbreviation is one of `swing_states`.
swingers = df.loc[df['state_abbreviation'].isin(swing_states)]

In [None]:
# Trained on the swing-state rows only.
# Predictors: "Female%" plus the eight race/ethnicity columns.
# Subset: swing-state rows whose Demographic_Cluster0 dummy equals 1.
clusteringalgo_fscore(
    x=[
        "Female%",
        "White%", "Black%", "NativeA%", "Asian%",
        "PIsland%", "TwoRace%", "Latino", "WhiteNLat%",
    ],
    y="Winner2012",
    y2="Winner2016",
    df=swingers,
    df2=swingers[swingers['Demographic_Cluster0'] == 1],
)

In [None]:
# Trained on the swing-state rows only.
# Predictors: "Female%" plus the eight race/ethnicity columns.
# Subset: swing-state rows whose Demographic_Cluster1 dummy equals 1.
clusteringalgo_fscore(
    x=[
        "Female%",
        "White%", "Black%", "NativeA%", "Asian%",
        "PIsland%", "TwoRace%", "Latino", "WhiteNLat%",
    ],
    y="Winner2012",
    y2="Winner2016",
    df=swingers,
    df2=swingers[swingers['Demographic_Cluster1'] == 1],
)

In [None]:
# Trained on the swing-state rows only.
# Predictors: "Female%" plus the eight race/ethnicity columns.
# Subset: swing-state rows whose Demographic_Cluster3 dummy equals 1.
clusteringalgo_fscore(
    x=[
        "Female%",
        "White%", "Black%", "NativeA%", "Asian%",
        "PIsland%", "TwoRace%", "Latino", "WhiteNLat%",
    ],
    y="Winner2012",
    y2="Winner2016",
    df=swingers,
    df2=swingers[swingers['Demographic_Cluster3'] == 1],
)

In [None]:
# Trained on the swing-state rows only.
# Predictors: "Female%", the eight race/ethnicity columns and three income /
# poverty columns.
# Subset: swing-state rows whose Combined_Cluster0 dummy equals 1.
clusteringalgo_fscore(
    x=[
        'Female%',
        'White%', 'Black%', 'NativeA%', 'Asian%',
        'PIsland%', 'TwoRace%', 'Latino', 'WhiteNLat%',
        'PerCapitaIncome', 'MedianIncome', '%BelowPoverty',
    ],
    y="Winner2012",
    y2="Winner2016",
    df=swingers,
    df2=swingers[swingers['Combined_Cluster0'] == 1],
)

In [None]:
# Trained on the swing-state rows only.
# Predictors: "Female%", the eight race/ethnicity columns and three income /
# poverty columns.
# Subset: swing-state rows whose Combined_Cluster4 dummy equals 1.
clusteringalgo_fscore(
    x=[
        'Female%',
        'White%', 'Black%', 'NativeA%', 'Asian%',
        'PIsland%', 'TwoRace%', 'Latino', 'WhiteNLat%',
        'PerCapitaIncome', 'MedianIncome', '%BelowPoverty',
    ],
    y="Winner2012",
    y2="Winner2016",
    df=swingers,
    df2=swingers[swingers['Combined_Cluster4'] == 1],
)

In [None]:
# Trained on the swing-state rows only.
# Predictors: "Female%", the eight race/ethnicity columns and three income /
# poverty columns.
# Subset: swing-state rows whose Combined_Cluster5 dummy equals 1.
clusteringalgo_fscore(
    x=[
        'Female%',
        'White%', 'Black%', 'NativeA%', 'Asian%',
        'PIsland%', 'TwoRace%', 'Latino', 'WhiteNLat%',
        'PerCapitaIncome', 'MedianIncome', '%BelowPoverty',
    ],
    y="Winner2012",
    y2="Winner2016",
    df=swingers,
    df2=swingers[swingers['Combined_Cluster5'] == 1],
)