# SVM and Random Forest

## Support Vector Machines (SVM) 

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
%matplotlib inline

In [15]:
# Load the CSV into a DataFrame, using the file's first column as the index.
df = pd.read_csv(
    'clean_data/2019-fec-contr-census.csv',
    index_col=0,
)

In [16]:
# Target labels: the 'target' column of the loaded frame.
y = df['target']

In [17]:
# Build the predictor matrix X from three columns of df.
# The ZIP code column is cast to str first so that get_dummies treats it as a
# categorical (string) column and one-hot encodes it.
df['contbr_zip'] = df['contbr_zip'].astype(str)
X_feats = ['contbr_zip', 'converted_date', 'contb_receipt_amt']

# drop_first=True drops the first level of every encoded categorical column.
X = pd.get_dummies(df[X_feats], drop_first=True)
print(X.shape)

(11502, 23)


In [18]:
# Hold out 20% of the rows as a test set. The split is stratified on y and
# seeded (random_state=0) so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [19]:
# Fit the scaler on the training split only, then standardize that split.
scaler = StandardScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_train.shape

(9201, 23)

In [21]:
# Linear-kernel support vector classifier with 'balanced' class weights.
svm = SVC(class_weight='balanced', kernel='linear')

In [22]:
# Train the SVM on the scaled training split and predict that same split.
y_hat_train = svm.fit(scaled_X_train, y_train).predict(scaled_X_train)

# Accuracy of the model on the data it was trained on.
score = accuracy_score(y_true=y_train, y_pred=y_hat_train)
print('SVM:', score)

SVM: 0.35876535159221823


In [24]:
# Evaluate the already-fitted linear SVM on the held-out test split.
# The scaler fitted on the training data is reused so the test features get
# the same transformation; it is never refitted on test data.
scaled_X_test = scaler.transform(X_test)

# `svm` was fitted on (scaled_X_train, y_train) in the preceding cell.
# Fitting it again here on the same data only repeated that (slow) work and
# produced the same model, so the refit is dropped.
y_hat_test = svm.predict(scaled_X_test)
score = accuracy_score(y_test, y_hat_test)

# print test score
print('SVM:', score)

SVM: 0.3550630160799652


In [28]:
# Swap the linear kernel for a radial basis function (RBF) kernel, keeping
# the 'balanced' class weights.
svm = SVC(class_weight='balanced', kernel='rbf')

# Train on the scaled training split and predict that same split.
y_hat_train = svm.fit(scaled_X_train, y_train).predict(scaled_X_train)

# Accuracy of the model on the data it was trained on.
score = accuracy_score(y_true=y_train, y_pred=y_hat_train)
print('SVM Train:', score)

SVM Train: 0.3845234213672427


In [29]:
# Evaluate the already-fitted RBF SVM on the held-out test split.
# The scaler fitted on the training data is reused so the test features get
# the same transformation; it is never refitted on test data.
scaled_X_test = scaler.transform(X_test)

# `svm` was fitted on (scaled_X_train, y_train) in the preceding cell.
# Fitting it again here on the same data only repeated that (slow) work and
# produced the same model, so the refit is dropped.
y_hat_test = svm.predict(scaled_X_test)
score = accuracy_score(y_test, y_hat_test)

# print test score
print('SVM Test:', score)

SVM Test: 0.37244676227727075


## Random Forest

In [30]:
# Fit a random forest on the scaled training split and score it on that split.
# A random forest is built with randomness, so random_state is pinned to make
# the fitted model (and the scores printed from it) repeatable across kernel
# restarts. 0 matches the seed used for the train/test split.
rf = RandomForestClassifier(random_state=0)
rf.fit(scaled_X_train, y_train)
y_hat_train = rf.predict(scaled_X_train)
score = accuracy_score(y_train, y_hat_train)

# print train score
print('RF Train:', score)

RF Train: 0.7002499728290403




In [31]:
# Score the fitted random forest on the held-out test split.
# The scaler fitted on the training data is reused so the test features get
# the same transformation as the training features.
scaled_X_test = scaler.transform(X_test)

# Deliberately no rf.fit(...) here: `rf` was fitted in the preceding cell.
# Refitting a forest whose random_state is not fixed builds a different
# model, so the test score would not describe the model whose train score
# was just reported.
y_hat_test = rf.predict(scaled_X_test)
score = accuracy_score(y_test, y_hat_test)

# print test score
print('RF Test:', score)

RF Test: 0.5241199478487614
