In [227]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import os

pd.set_option('max_columns', None)
pd.set_option('max_rows', None)
from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [228]:
rootfolder = '/Users/amandahutter/Documents/CS_7641/HW1'

os.chdir(rootfolder)

# Data from: https://www.kaggle.com/devvret/congressional-voting-records
data = pd.read_csv("Data/datasets_500979_928273_house-votes-84.csv")

In [229]:
data.head()

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [230]:
data.describe()

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
count,435,435,435,435,435,435,435,435,435,435,435,435,435,435,435,435,435
unique,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
top,democrat,n,y,y,n,y,y,y,y,y,y,n,n,y,y,n,y
freq,267,236,195,253,247,212,272,239,242,207,216,264,233,209,248,233,269


In [231]:
## TODO: CONVERT TO NUMERIC FOR ALGORITHMS 
## EITEHR DROP ? or make categorical columns 

data.loc[ data['Class Name'] == "democrat", "Class Name"] = 1
data.loc[ data['Class Name'] == 'republican', "Class Name"] = 0
data['Class Name'] = data['Class Name'].astype(int)

In [232]:
data.dtypes

Class Name                                  int64
 handicapped-infants                       object
 water-project-cost-sharing                object
 adoption-of-the-budget-resolution         object
 physician-fee-freeze                      object
 el-salvador-aid                           object
 religious-groups-in-schools               object
 anti-satellite-test-ban                   object
 aid-to-nicaraguan-contras                 object
 mx-missile                                object
 immigration                               object
 synfuels-corporation-cutback              object
 education-spending                        object
 superfund-right-to-sue                    object
 crime                                     object
 duty-free-exports                         object
 export-administration-act-south-africa    object
dtype: object

In [233]:
categorical = [' handicapped-infants', ' water-project-cost-sharing',
       ' adoption-of-the-budget-resolution', ' physician-fee-freeze',
       ' el-salvador-aid', ' religious-groups-in-schools',
       ' anti-satellite-test-ban', ' aid-to-nicaraguan-contras', ' mx-missile',
       ' immigration', ' synfuels-corporation-cutback', ' education-spending',
       ' superfund-right-to-sue', ' crime', ' duty-free-exports',
       ' export-administration-act-south-africa']

In [234]:
# Split Data into Target and Inputs 

X = data.iloc[:,1:]

y = data.iloc[:,0]

In [235]:
print(X.shape)
print(y.shape)

(435, 16)
(435,)


In [236]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)

X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical]))

X_encoded.columns = encoder.get_feature_names(categorical)

X.drop(categorical, axis = 1, inplace = True)

print(X.shape)
print(X_encoded.shape)
OH_X = pd.concat([X, pd.DataFrame(X_encoded, index=X.index)], axis = 1)

print(OH_X.shape)

(435, 0)
(435, 48)
(435, 48)


Change the Target Variable to make it easier to know what is being predicted. Predict that a house member is a democrat 

In [237]:
data['Class Name'].describe()

count    435.000000
mean       0.613793
std        0.487440
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: Class Name, dtype: float64

Use the One-Hot Encoding Data and split into Train and Test Data Sets

In [238]:
X_Train, X_Test, Y_Train, Y_Test = train_test_split(OH_X, y, test_size = .30, train_size = .70, random_state = 42)

print(X_Train.shape)
print(X_Test.shape)
print(Y_Train.shape)
print(Y_Test.shape)

(304, 48)
(131, 48)
(304,)
(131,)


#### 1) Decision Trees

In [251]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import confusion_matrix 

accuracy_list = []
f1_list = []
precision_1_list = []
precision_0_list = []
recall_1_list = []
recall_0_list = []


def make_decision_tree(criterion = 'gini', splitter = 'best', max_depth = 10):

    classifier = DecisionTreeClassifier(criterion, splitter, max_depth)
    classifier.fit(X_Train, Y_Train)

    Y_Pred_Test = classifier.predict(X_Test)

    cm = confusion_matrix(Y_Test, Y_Pred_Test)

    tn, fp, fn, tp = confusion_matrix(Y_Test, Y_Pred_Test).ravel()

    accuracy = (tn + tp) / (tn + tp + fn + fp)
    f1_score = (2*tp) / (2*tp + fp + fn)
    precision_1 = tp / (tp + fp)
    precision_0 = tn / (tn + fn)
    recall_1 =  tp / (tp + fn)
    recall_0 =  tn / (tn + fp)
    return(accuracy, f1_score, precision_1, precision_0, recall_1, recall_0, tp, tn, fp, fn)

In [256]:
accuracy_list = []
f1_list = []
precision_1_list = []
precision_0_list = []
recall_1_list = []
recall_0_list = []
tp_list = []
tn_list = []
fp_list = []
fn_list = []

criterions = ['gini', 'entropy']
splitters = ['best', 'random']
max_depths = list(np.arange(1, X_Test.shape[1], 5))

c_list = []
s_list = []
d_list = []

for c in criterions:
    for s in splitters:
        for d in max_depths:  
            accuracy, f1_score, precision_1, precision_0, recall_1, recall_0, tp, tn, fp, fn = make_decision_tree(criterion='gini', splitter='best',max_depth=5)
            
            accuracy_list.append(accuracy)
            f1_list.append(f1_score)
            precision_1_list.append(precision_1)
            precision_0_list.append(precision_0)
            recall_1_list.append(recall_1)
            recall_0_list.append(recall_0)
            c_list.append(c)
            s_list.append(s)
            d_list.append(d)
            tp_list.append(tp)
            tn_list.append(tn)
            fp_list.append(fp)
            fn_list.append(fn)

results2 = pd.DataFrame(zip(c_list, s_list, d_list, accuracy_list, f1_list, precision_1_list,precision_0_list,recall_1_list,recall_0_list, tp_list, tn_list, fp_list, fn_list), 
                        columns = ["Criterion", 'Splitter','Max_Depth','Accuracy','F1_Score','Precision_1','Precision_0','Recall_1','Recall_0', 'TP', 'TN', 'FP', 'FN'])
results2

Unnamed: 0,Criterion,Splitter,Max_Depth,Accuracy,F1_Score,Precision_1,Precision_0,Recall_1,Recall_0,TP,TN,FP,FN
0,gini,best,1,0.954198,0.965517,0.94382,0.97619,0.988235,0.891304,84,41,5,1
1,gini,best,6,0.938931,0.954545,0.923077,0.975,0.988235,0.847826,84,39,7,1
2,gini,best,11,0.961832,0.971098,0.954545,0.976744,0.988235,0.913043,84,42,4,1
3,gini,best,16,0.954198,0.965517,0.94382,0.97619,0.988235,0.891304,84,41,5,1
4,gini,best,21,0.931298,0.949153,0.913043,0.974359,0.988235,0.826087,84,38,8,1
5,gini,best,26,0.931298,0.949153,0.913043,0.974359,0.988235,0.826087,84,38,8,1
6,gini,best,31,0.954198,0.965517,0.94382,0.97619,0.988235,0.891304,84,41,5,1
7,gini,best,36,0.931298,0.949153,0.913043,0.974359,0.988235,0.826087,84,38,8,1
8,gini,best,41,0.931298,0.949153,0.913043,0.974359,0.988235,0.826087,84,38,8,1
9,gini,best,46,0.931298,0.949153,0.913043,0.974359,0.988235,0.826087,84,38,8,1


In [241]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import confusion_matrix 

accuracy_list = []
f1_list = []
precision_1_list = []
precision_0_list = []
recall_1_list = []
recall_0_list = []


def make_decision_tree(criterion = 'gini', splitter = 'best', max_depth = 10):

    classifier = DecisionTreeClassifier(criterion, splitter, max_depth)
    classifier.fit(X_Train, Y_Train)
    
    Y_Pred_Test = classifier.predict(X_Test)
    
    cm = confusion_matrix(Y_Test, Y_Pred_Test)
    
    tn, fp, fn, tp = confusion_matrix(Y_Test, Y_Pred_Test).ravel()

    accuracy = (tn + tp) / (tn + tp + fn + fp)
    accuracy_list.append(accuracy)
    
    f1_score = (2*tp) / (2*tp + fp + fn)
    f1_list.append(f1_score)
    
    precision_1 = tp / (tp + fp)
    precision_1_list.append(precision_1)
    
    precision_0 = tn / (tn + fn)
    precision_0_list.append(precision_0)
    
    recall_1 =  tp / (tp + fn)
    recall_1_list.append(recall_1)
    
    recall_0 =  tn / (tn + fp)
    recall_0_list.append(recall_0) 
    

In [242]:
criterions = ['gini', 'entropy']
splitters = ['best', 'random']
max_depths = [5, 10]

rows =  len(criterions)*len(splitters)*len(max_depths)

results = pd.DataFrame(index=range(rows), columns = ['Criterion', 'Splitter', 'Max_Depth'])

# Parameter 1 
results.loc[0:(rows/2)-1,'Criterion'] = 'gini' 
results.loc[rows/2:, 'Criterion'] = 'entropy'

# Parameter 2 
inds3 = np.arange(0,2**1)
inds5 = list(inds3) + list(inds3 + 2**2)
inds4 = np.arange(2,2**2)
inds6 = list(inds4) + list(inds4+2**2)
results.loc[inds5, 'Splitter'] = 'best'
results.loc[inds6, 'Splitter'] = 'random'

# Parameter 3 

inds = np.arange(0,rows,2)
inds2 = np.arange(1,rows,2)
results.loc[inds, 'Max_Depth'] = 5
results.loc[inds2, 'Max_Depth'] = 10

results.apply(lambda x: make_decision_tree(criterion = x.Criterion, splitter = x.Splitter, max_depth = x.Max_Depth), axis=1)

results['Accuracy'] = accuracy_list
results['F1_Score'] = f1_list
results['Precision_1'] = precision_1_list
results['Precision_0'] = precision_0_list
results['Recall_1'] = recall_1_list
results['Recall_0'] = recall_0_list

print(results)


  Criterion Splitter Max_Depth  Accuracy  F1_Score  Precision_1  Precision_0  \
0      gini     best         5  0.954198  0.965517     0.943820     0.976190   
1      gini     best        10  0.938931  0.954023     0.932584     0.952381   
2      gini   random         5  0.931298  0.949153     0.913043     0.974359   
3      gini   random        10  0.938931  0.954545     0.923077     0.975000   
4   entropy     best         5  0.961832  0.971098     0.954545     0.976744   
5   entropy     best        10  0.938931  0.954023     0.932584     0.952381   
6   entropy   random         5  0.954198  0.965517     0.943820     0.976190   
7   entropy   random        10  0.938931  0.954545     0.923077     0.975000   

   Recall_1  Recall_0  
0  0.988235  0.891304  
1  0.976471  0.869565  
2  0.988235  0.826087  
3  0.988235  0.847826  
4  0.988235  0.913043  
5  0.976471  0.869565  
6  0.988235  0.891304  
7  0.988235  0.847826  


In [243]:
from sklearn.tree import DecisionTreeClassifier 
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_Train, Y_Train)

Y_Pred = classifier.predict(X_Test)
print(Y_Pred)

[1 1 0 0 1 0 1 0 1 0 0 1 1 1 0 1 0 1 0 1 1 1 0 1 0 1 1 1 1 0 0 1 1 1 1 1 1
 1 1 0 1 1 1 0 1 0 1 1 0 1 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 1 0 0 1 1 0 0 1 1 0 1 1 0 1 0 1 1 0 1 1 1 0 1 0 1 0 1 1 1
 0 1 1 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1 0 1]


In [157]:
from sklearn.metrics import confusion_matrix 

cm = confusion_matrix(Y_Test, Y_Pred)
print(cm)

[[39  7]
 [ 1 84]]


In [158]:
tn, fp, fn, tp = confusion_matrix(Y_Test, Y_Pred).ravel()

In [159]:
print("TN",tn)
print("FN",fn)
print("TP",tp)
print("FP",fp)

TN 39
FN 1
TP 84
FP 7


In [160]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_true = Y_Test, y_pred = Y_Pred))

0.9389312977099237


In [161]:
print("Accuracy:")
print((tn + tp) / (tp + tn + fp + fn))
print()

print("Precision of predicting Democrat:")
print(tp / (tp + fp))
print("When we predict Democrat, we get it right "+ str(round(tp/(tp+fp)*100)) +" percent of the time")
print()

print("Precision of predicting Republican:")
print(tn / (tn + fn))
print("When we predict Democrat, we get it right " + str(round(tn/(tn+fn)*100)) + " percent of the time")
print()

print("Recall of Democrat:")
print(tp / (tp + fn))
print("We find "+ str(round(tp/(tp+fn)*100))  + " percent of Democrats with our predictions")
print()

print("Recall of Republicans:")
print(tn / (tn + fp))
print("We find " + str(round(tn/(tn+fp)*100)) +" percent of Democrat with our predictions")
print()

print("F-1 Score:")
print((2*tp) / (2*tp + fp + fn))

Accuracy:
0.9389312977099237

Precision of predicting Democrat:
0.9230769230769231
When we predict Democrat, we get it right 92.0 percent of the time

Precision of predicting Republican:
0.975
When we predict Democrat, we get it right 98.0 percent of the time

Recall of Democrat:
0.9882352941176471
We find 99.0 percent of Democrats with our predictions

Recall of Republicans:
0.8478260869565217
We find 85.0 percent of Democrat with our predictions

F-1 Score:
0.9545454545454546


In [162]:
from sklearn.metrics import classification_report 

print(classification_report(Y_Test, Y_Pred))

              precision    recall  f1-score   support

           0       0.97      0.85      0.91        46
           1       0.92      0.99      0.95        85

    accuracy                           0.94       131
   macro avg       0.95      0.92      0.93       131
weighted avg       0.94      0.94      0.94       131



#### 2) ANN - Artifical Neural Network

In [166]:
# Feature Scaling 
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_Train = sc.fit_transform(X_Train)
X_Test = sc.transform(X_Test)

In [167]:
X_Train.shape

(304, 48)

In [168]:
Y_Train.dtypes
Y_Test.dtypes

dtype('int64')

In [169]:
# Import Keras. Based on Tensorflow. 
import keras 

from keras.models import Sequential 
from keras.layers import Dense

# Initialize the Artificial Neural Net 
classifier = Sequential()

# Adding the input layer and first hidden layer 
num_inputs = X_Train.shape[1]
classifier.add(Dense(units = 6, activation = 'relu', input_dim = num_inputs, kernel_initializer = 'uniform')) # Use Dense function
# Output layer - use the average of number of nodes input layer + output layer. 
# Number of Nodes in input layer = Number of Independent Variables = 11 
# Number of Nodes in output layer = 1 = binary 
# So,... use 6 nodes for output_dimension 'units'
# Activation Function is Relu, or 'Rectifier' function for the hidden layers. 

# Adding the second hidden layer. Knows what to expect. Don't specify input parameter. 
classifier.add(Dense(units = 6, activation = 'relu', kernel_initializer = 'uniform'))

# Adding the output layer 
classifier.add(Dense(units = 1, activation = 'sigmoid', kernel_initializer = 'uniform'))
# Activation layer for output layer, we want probabilities, replace Activation Function with SIGMOID function. 

# Compile the ANN 
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
# optimizer - determines the optimal number of weights in neural network
# loss - similar to logistic regression. Logarithmic loss. 
# metrics - used to evaluate your model 

# Fit to training set - Chose number of Epochs - see how each round improves accruacy 
classifier.fit(x = X_Train, y = Y_Train, batch_size = 10, epochs = 100 )


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7fcfbab9aa90>

In [170]:
# Predict the DV Test set using the classifer 
Y_Pred = classifier.predict(X_Test)
print(Y_Pred) # Predictions based on Y Test characteristics 

#Predict the test set results - Bin Them Into Categories 
Y_Pred = classifier.predict(X_Test)
Y_Pred =  (Y_Pred > 0.5)

[[0.99607337]
 [0.99997973]
 [0.01439175]
 [0.01439175]
 [0.8921393 ]
 [0.01439175]
 [0.16803959]
 [0.01439175]
 [0.9996543 ]
 [0.0529508 ]
 [0.01439175]
 [0.9938512 ]
 [0.9999977 ]
 [0.999999  ]
 [0.01439175]
 [1.        ]
 [0.05085152]
 [0.999987  ]
 [0.01439175]
 [1.        ]
 [1.        ]
 [1.        ]
 [0.01439175]
 [1.        ]
 [0.01439175]
 [1.        ]
 [1.        ]
 [1.        ]
 [1.        ]
 [0.0529508 ]
 [0.01439175]
 [1.        ]
 [0.01439175]
 [1.        ]
 [0.9995704 ]
 [1.        ]
 [1.        ]
 [0.9999918 ]
 [1.        ]
 [0.01439175]
 [1.        ]
 [1.        ]
 [0.99999714]
 [0.01439175]
 [1.        ]
 [0.01439175]
 [0.99985874]
 [1.        ]
 [0.7884794 ]
 [1.        ]
 [1.        ]
 [0.01439175]
 [1.        ]
 [1.        ]
 [0.01439175]
 [1.        ]
 [0.01439175]
 [1.        ]
 [0.01439175]
 [0.08566928]
 [1.        ]
 [1.        ]
 [1.        ]
 [1.        ]
 [0.02714428]
 [0.01439175]
 [0.9999977 ]
 [1.        ]
 [1.        ]
 [1.        ]
 [1.        ]
 [0.81

In [171]:
# Making the confusion matrix 
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true = Y_Test, y_pred = Y_Pred) # create an instance of confusion matrix class 
print(cm)

tn, fp, fn, tp = confusion_matrix(Y_Test, Y_Pred).ravel()

print("TN",tn)
print("FN",fn)
print("TP",tp)
print("FP",fp)

[[43  3]
 [ 1 84]]
TN 43
FN 1
TP 84
FP 3


In [172]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_true = Y_Test, y_pred = Y_Pred))

0.9694656488549618


In [173]:
print("Accuracy:")
print((tn + tp) / (tp + tn + fp + fn))
print()

print("Precision of predicting Democrat:")
print(tp / (tp + fp))
print("When we predict Democrat, we get it right "+ str(round(tp/(tp+fp)*100)) +" percent of the time")
print()

print("Precision of predicting Republican:")
print(tn / (tn + fn))
print("When we predict Democrat, we get it right " + str(round(tn/(tn+fn)*100)) + " percent of the time")
print()

print("Recall of Democrat:")
print(tp / (tp + fn))
print("We find "+ str(round(tp/(tp+fn)*100))  + " percent of Democrats with our predictions")
print()

print("Recall of Republicans:")
print(tn / (tn + fp))
print("We find " + str(round(tn/(tn+fp)*100)) +" percent of Democrat with our predictions")
print()

print("F-1 Score:")
print((2*tp) / (2*tp + fp + fn))

Accuracy:
0.9694656488549618

Precision of predicting Democrat:
0.9655172413793104
When we predict Democrat, we get it right 97.0 percent of the time

Precision of predicting Republican:
0.9772727272727273
When we predict Democrat, we get it right 98.0 percent of the time

Recall of Democrat:
0.9882352941176471
We find 99.0 percent of Democrats with our predictions

Recall of Republicans:
0.9347826086956522
We find 93.0 percent of Democrat with our predictions

F-1 Score:
0.9767441860465116


In [174]:
from sklearn.metrics import classification_report 

print(classification_report(Y_Test, Y_Pred))

              precision    recall  f1-score   support

           0       0.98      0.93      0.96        46
           1       0.97      0.99      0.98        85

    accuracy                           0.97       131
   macro avg       0.97      0.96      0.97       131
weighted avg       0.97      0.97      0.97       131



#### 3) Boosted Decision Tree

In [175]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

#classifier = AdaBoostClassifier(DecisionTreeClassifier,n_estimators = 50, learning_rate = 1.0, random_state = 42)
classifier = AdaBoostClassifier()
# Defaults: 
# base_estimator: DecisionTreeClassifier(max_depth=1)
# n_estimators: 50
# learning_Rate: 1.0 
# algorithm: SAMME.R 
# random_state: None 
classifier.fit(X_Train, Y_Train)

Y_Pred = classifier.predict(X_Test)
print(Y_Pred)

[1 1 0 0 0 0 1 0 1 0 0 1 1 1 0 1 0 1 0 1 1 1 0 1 0 1 1 1 1 0 0 1 0 1 1 1 1
 1 1 0 1 1 1 0 1 0 1 1 1 1 1 0 1 1 0 1 0 1 0 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1
 1 0 0 1 1 1 1 0 1 0 0 1 1 0 0 1 1 0 1 1 0 1 0 1 1 0 1 1 1 0 0 0 1 0 1 1 1
 0 1 1 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1 0 1]


In [176]:
# Making the confusion matrix 
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true = Y_Test, y_pred = Y_Pred) # create an instance of confusion matrix class 
print(cm)

tn, fp, fn, tp = confusion_matrix(Y_Test, Y_Pred).ravel()

print("TN",tn)
print("FN",fn)
print("TP",tp)
print("FP",fp)

[[44  2]
 [ 1 84]]
TN 44
FN 1
TP 84
FP 2


In [177]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_true = Y_Test, y_pred = Y_Pred))

print("Accuracy:")
print((tn + tp) / (tp + tn + fp + fn))
print()

print("Precision of predicting Democrat:")
print(tp / (tp + fp))
print("When we predict Democrat, we get it right "+ str(round(tp/(tp+fp)*100)) +" percent of the time")
print()

print("Precision of predicting Republican:")
print(tn / (tn + fn))
print("When we predict Democrat, we get it right " + str(round(tn/(tn+fn)*100)) + " percent of the time")
print()

print("Recall of Democrat:")
print(tp / (tp + fn))
print("We find "+ str(round(tp/(tp+fn)*100))  + " percent of Democrats with our predictions")
print()

print("Recall of Republicans:")
print(tn / (tn + fp))
print("We find " + str(round(tn/(tn+fp)*100)) +" percent of Democrat with our predictions")
print()

print("F-1 Score:")
print((2*tp) / (2*tp + fp + fn))

0.9770992366412213
Accuracy:
0.9770992366412213

Precision of predicting Democrat:
0.9767441860465116
When we predict Democrat, we get it right 98.0 percent of the time

Precision of predicting Republican:
0.9777777777777777
When we predict Democrat, we get it right 98.0 percent of the time

Recall of Democrat:
0.9882352941176471
We find 99.0 percent of Democrats with our predictions

Recall of Republicans:
0.9565217391304348
We find 96.0 percent of Democrat with our predictions

F-1 Score:
0.9824561403508771


In [178]:
from sklearn.metrics import classification_report 

print(classification_report(Y_Test, Y_Pred))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97        46
           1       0.98      0.99      0.98        85

    accuracy                           0.98       131
   macro avg       0.98      0.97      0.97       131
weighted avg       0.98      0.98      0.98       131



#### 4) SVM

##### Linear Kernel

In [179]:
# Use Feature scaling 
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_Train = sc_X.fit_transform(X_Train)
X_Test = sc_X.transform(X_Test)


In [180]:
# Fit SVM Classifier to the training data set  
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_Train, Y_Train)

# Predict the DV Test set using the classifer 
Y_Pred = classifier.predict(X_Test)
print(Y_Pred) # Predictions based on Y Test characteristics 



[1 1 0 0 1 0 0 0 1 0 0 0 1 1 0 1 0 1 0 1 1 1 0 1 0 1 1 1 1 0 0 1 0 1 1 1 1
 1 1 0 1 1 1 0 1 0 1 1 0 1 1 0 1 1 0 1 0 1 0 0 1 1 1 1 0 0 1 1 1 1 1 0 1 1
 1 0 0 1 1 1 1 0 1 0 0 1 1 0 0 1 1 0 1 1 0 1 0 1 1 0 1 1 1 0 1 0 1 0 1 1 1
 0 1 1 0 1 1 1 0 0 1 0 1 1 1 0 1 1 1 0 1]


In [181]:
# Making the confusion matrix 
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true = Y_Test, y_pred = Y_Pred) # create an instance of confusion matrix class 
print(cm)

tn, fp, fn, tp = confusion_matrix(Y_Test, Y_Pred).ravel()

print("TN",tn)
print("FN",fn)
print("TP",tp)
print("FP",fp)

[[44  2]
 [ 5 80]]
TN 44
FN 5
TP 80
FP 2


In [182]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_true = Y_Test, y_pred = Y_Pred))

print("Accuracy:")
print((tn + tp) / (tp + tn + fp + fn))
print()

print("Precision of predicting Democrat:")
print(tp / (tp + fp))
print("When we predict Democrat, we get it right "+ str(round(tp/(tp+fp)*100)) +" percent of the time")
print()

print("Precision of predicting Republican:")
print(tn / (tn + fn))
print("When we predict Democrat, we get it right " + str(round(tn/(tn+fn)*100)) + " percent of the time")
print()

print("Recall of Democrat:")
print(tp / (tp + fn))
print("We find "+ str(round(tp/(tp+fn)*100))  + " percent of Democrats with our predictions")
print()

print("Recall of Republicans:")
print(tn / (tn + fp))
print("We find " + str(round(tn/(tn+fp)*100)) +" percent of Democrat with our predictions")
print()

print("F-1 Score:")
print((2*tp) / (2*tp + fp + fn))

0.9465648854961832
Accuracy:
0.9465648854961832

Precision of predicting Democrat:
0.975609756097561
When we predict Democrat, we get it right 98.0 percent of the time

Precision of predicting Republican:
0.8979591836734694
When we predict Democrat, we get it right 90.0 percent of the time

Recall of Democrat:
0.9411764705882353
We find 94.0 percent of Democrats with our predictions

Recall of Republicans:
0.9565217391304348
We find 96.0 percent of Democrat with our predictions

F-1 Score:
0.9580838323353293


In [183]:
from sklearn.metrics import classification_report 

print(classification_report(Y_Test, Y_Pred))

              precision    recall  f1-score   support

           0       0.90      0.96      0.93        46
           1       0.98      0.94      0.96        85

    accuracy                           0.95       131
   macro avg       0.94      0.95      0.94       131
weighted avg       0.95      0.95      0.95       131



##### SVM - RBF Kernel

In [184]:
print(X_Test.shape)
print(X_Train.shape)

(131, 48)
(304, 48)


In [185]:
# Fit SVM Classifier to the training data set  
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_Train, Y_Train)

# Predict the DV Test set using the classifer 
Y_Pred = classifier.predict(X_Test)
print(Y_Pred) # Predictions based on Y Test characteristics 


[1 1 0 0 0 0 1 0 1 0 0 0 1 1 0 1 0 1 0 1 1 1 0 1 0 1 1 1 1 0 0 1 0 1 1 1 1
 1 1 0 1 1 1 0 1 0 1 1 0 1 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1
 1 0 0 1 1 1 1 0 1 0 0 1 1 0 0 1 1 0 1 1 0 1 0 1 1 0 1 1 1 0 0 0 1 1 1 1 1
 0 1 1 0 1 1 1 0 0 1 1 1 1 1 0 1 1 1 0 1]


In [186]:
# Making the confusion matrix 
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true = Y_Test, y_pred = Y_Pred) # create an instance of confusion matrix class 
print(cm)

tn, fp, fn, tp = confusion_matrix(Y_Test, Y_Pred).ravel()

print("TN",tn)
print("FN",fn)
print("TP",tp)
print("FP",fp)

[[43  3]
 [ 3 82]]
TN 43
FN 3
TP 82
FP 3


In [187]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_true = Y_Test, y_pred = Y_Pred))


print("Accuracy:")
print((tn + tp) / (tp + tn + fp + fn))
print()

print("Precision of predicting Democrat:")
print(tp / (tp + fp))
print("When we predict Democrat, we get it right "+ str(round(tp/(tp+fp)*100)) +" percent of the time")
print()

print("Precision of predicting Republican:")
print(tn / (tn + fn))
print("When we predict Democrat, we get it right " + str(round(tn/(tn+fn)*100)) + " percent of the time")
print()

print("Recall of Democrat:")
print(tp / (tp + fn))
print("We find "+ str(round(tp/(tp+fn)*100))  + " percent of Democrats with our predictions")
print()

print("Recall of Republicans:")
print(tn / (tn + fp))
print("We find " + str(round(tn/(tn+fp)*100)) +" percent of Democrat with our predictions")
print()

print("F-1 Score:")
print((2*tp) / (2*tp + fp + fn))

0.9541984732824428
Accuracy:
0.9541984732824428

Precision of predicting Democrat:
0.9647058823529412
When we predict Democrat, we get it right 96.0 percent of the time

Precision of predicting Republican:
0.9347826086956522
When we predict Democrat, we get it right 93.0 percent of the time

Recall of Democrat:
0.9647058823529412
We find 96.0 percent of Democrats with our predictions

Recall of Republicans:
0.9347826086956522
We find 93.0 percent of Democrat with our predictions

F-1 Score:
0.9647058823529412


In [188]:
from sklearn.metrics import classification_report 

print(classification_report(Y_Test, Y_Pred))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        46
           1       0.96      0.96      0.96        85

    accuracy                           0.95       131
   macro avg       0.95      0.95      0.95       131
weighted avg       0.95      0.95      0.95       131



#### 5) KNN Classifier

In [189]:
# Use Feature scaling 
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_Train = sc_X.fit_transform(X_Train)
X_Test = sc_X.transform(X_Test)

In [190]:
# Fit KNN Classifier to the training data set  
from sklearn.neighbors import KNeighborsClassifier
# Standard Euclidean distance metric 
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_Train, Y_Train)

# Predict the DV Test set using the classifer 
Y_Pred = classifier.predict(X_Test)
print(Y_Pred) # Predictions based on Y Test characteristics 

[1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 1 0 1 1 1 0 1 0 1 1 1 1 0 0 1 0 1 1 1 1
 1 1 0 1 1 0 0 1 0 1 1 0 1 1 0 1 1 0 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
 1 0 1 1 1 1 1 0 1 0 0 1 1 0 0 1 1 0 1 1 0 1 0 1 1 0 0 1 1 0 0 0 1 1 1 1 1
 0 1 1 0 1 1 1 0 0 1 1 1 1 1 0 1 1 1 1 1]


In [191]:
# Making the confusion matrix 
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true = Y_Test, y_pred = Y_Pred) # create an instance of confusion matrix class 
print(cm)

tn, fp, fn, tp = confusion_matrix(Y_Test, Y_Pred).ravel()

print("TN",tn)
print("FN",fn)
print("TP",tp)
print("FP",fp)

[[40  6]
 [ 6 79]]
TN 40
FN 6
TP 79
FP 6


In [192]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_true = Y_Test, y_pred = Y_Pred))


print("Accuracy:")
print((tn + tp) / (tp + tn + fp + fn))
print()

print("Precision of predicting Democrat:")
print(tp / (tp + fp))
print("When we predict Democrat, we get it right "+ str(round(tp/(tp+fp)*100)) +" percent of the time")
print()

print("Precision of predicting Republican:")
print(tn / (tn + fn))
print("When we predict Democrat, we get it right " + str(round(tn/(tn+fn)*100)) + " percent of the time")
print()

print("Recall of Democrat:")
print(tp / (tp + fn))
print("We find "+ str(round(tp/(tp+fn)*100))  + " percent of Democrats with our predictions")
print()

print("Recall of Republicans:")
print(tn / (tn + fp))
print("We find " + str(round(tn/(tn+fp)*100)) +" percent of Democrat with our predictions")
print()

print("F-1 Score:")
print((2*tp) / (2*tp + fp + fn))

0.9083969465648855
Accuracy:
0.9083969465648855

Precision of predicting Democrat:
0.9294117647058824
When we predict Democrat, we get it right 93.0 percent of the time

Precision of predicting Republican:
0.8695652173913043
When we predict Democrat, we get it right 87.0 percent of the time

Recall of Democrat:
0.9294117647058824
We find 93.0 percent of Democrats with our predictions

Recall of Republicans:
0.8695652173913043
We find 87.0 percent of Democrat with our predictions

F-1 Score:
0.9294117647058824


In [193]:
from sklearn.metrics import classification_report 

print(classification_report(Y_Test, Y_Pred))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87        46
           1       0.93      0.93      0.93        85

    accuracy                           0.91       131
   macro avg       0.90      0.90      0.90       131
weighted avg       0.91      0.91      0.91       131

