# Support Vector Machines

Here we explore the use of support vector machines (SVMs).

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Raise pandas' display limits so that wide/long frames are shown in full
# (up to 4000 rows and 4000 columns) instead of being truncated.
pd.set_option("display.max_rows", 4000)
pd.set_option("display.max_columns", 4000)

In [None]:
from sklearn.model_selection import train_test_split
# Labelled data: the feature table and the label table, both indexed by
# respondent_id.
raw_all_features_df, all_labels_df = (
    pd.read_csv(csv_name, index_col="respondent_id")
    for csv_name in ("training_set_features.csv", "training_set_labels.csv")
)

# Separate test-set feature file and the submission format file, indexed the
# same way.
test_set_features_df, submission_df = (
    pd.read_csv(csv_name, index_col="respondent_id")
    for csv_name in ("test_set_features.csv", "submission_format.csv")
)

# Hold out 33% of the labelled rows. The split is shuffled with a fixed seed
# and stratified on the label frame.
split_options = dict(
    test_size=0.33,
    shuffle=True,
    stratify=all_labels_df,
    random_state=25519,
)
(
    raw_train_features_df,
    raw_test_features_df,
    train_labels_df,
    test_labels_df,
) = train_test_split(raw_all_features_df, all_labels_df, **split_options)

# Last expression of the cell: display the training part of the features.
raw_train_features_df

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
def ordinalize(features: pd.DataFrame, raw: pd.DataFrame, name: str) -> pd.DataFrame:
    """Return a copy of ``features`` with column ``name`` ordinal-encoded.

    The codes are produced by a fresh ``OrdinalEncoder`` fitted on
    ``raw[name]``.

    Args:
        features: Frame that receives the encoded column. It is not modified.
        raw: Frame holding the original values of ``name``. Its rows must
            line up positionally with the rows of ``features``.
        name: Name of the column to encode.

    Returns:
        A new frame equal to ``features`` except that column ``name`` holds
        the encoder output (the column is appended if it was not present).
    """
    # The encoder needs a 2-D input and returns an (n_rows, 1) result;
    # flatten it so it can be stored as a single column.
    codes = np.ravel(OrdinalEncoder().fit_transform(raw[[name]]))

    # ``assign`` builds a new frame, so the caller's ``features`` object is
    # never mutated; an existing column keeps its position.
    return features.assign(**{name: codes})

    
def onehotize(features: pd.DataFrame, raw: pd.DataFrame, name: "str | list[str]") -> pd.DataFrame:
    """Replace the column(s) ``name`` of ``features`` with one-hot columns.

    A fresh ``OneHotEncoder`` is fitted on ``raw[name]``; the resulting
    indicator columns are named by ``get_feature_names_out()`` and indexed
    like ``raw``.

    Args:
        features: Frame from which the original column(s) are dropped.
        raw: Frame holding the original values to encode.
        name: A single column name or a list of column names.

    Returns:
        A new frame: ``features`` without the ``name`` column(s), with the
        one-hot columns appended on the right.
    """
    # A bare string would select a 1-D Series, which the encoder rejects;
    # always select a 2-D frame.
    columns = [name] if isinstance(name, str) else list(name)

    # Do not pass ``sparse=False``: that keyword was renamed to
    # ``sparse_output`` in scikit-learn 1.2 and later removed. Densifying the
    # result instead works regardless of which spelling is supported.
    enc = OneHotEncoder()
    encoded = enc.fit_transform(raw[columns])
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()

    onehot_df = pd.DataFrame(
        encoded,
        columns=enc.get_feature_names_out(),
        index=raw.index,
    )
    return pd.concat(
        [features.drop(columns=columns), onehot_df],
        axis=1,
    )


def normalize(in_df: pd.DataFrame) -> np.ndarray:
    """Encode the categorical columns of ``in_df`` and standardize the result.

    Steps, in order:
      1. one-hot encode the nominal columns;
      2. ordinal-encode the remaining categorical columns;
      3. fill missing values with the column mean of the encoded frame;
      4. scale with a ``StandardScaler`` fitted on that frame.

    Args:
        in_df: Raw feature frame containing every column named below.

    Returns:
        The scaled feature matrix as returned by
        ``StandardScaler.fit_transform`` (an array, not a DataFrame).

    NOTE(review): every encoder and the scaler are fitted on ``in_df`` itself,
    so two calls on different frames (e.g. a train and a test split) use
    independently fitted transforms. If a category is missing from one of the
    frames the one-hot columns will not line up -- consider fitting once on
    the training data and reusing the fitted transformers.
    """
    nominal_columns = [
        "employment_industry",
        "employment_occupation",
        "race",
        "employment_status",
        "hhs_geo_region",
        "census_msa",
    ]
    ordinal_columns = [
        "age_group",
        "education",
        "sex",
        "income_poverty",
        "marital_status",
        "rent_or_own",
    ]

    out_df = onehotize(in_df, in_df, nominal_columns)
    for column in ordinal_columns:
        out_df = ordinalize(out_df, in_df, column)

    # Impute after encoding so that every column is numeric and has a mean.
    return StandardScaler().fit_transform(out_df.fillna(out_df.mean()))

# Preprocess both halves of the split with ``normalize`` (train first, then
# test); each call works on its own frame.
train_features_df, test_features_df = map(
    normalize,
    (raw_train_features_df, raw_test_features_df),
)

# Last expression of the cell: display the processed training features.
train_features_df

In [34]:
%%time
from sklearn.svm import SVC
from sklearn.cluster import DBSCAN
from sklearn.multioutput import MultiOutputClassifier

# Linear-kernel SVC with probability estimates enabled and a fixed seed;
# MultiOutputClassifier fits one copy of it per label column.
base_svc = SVC(kernel="linear", probability=True, random_state=31415)
svclassifier = MultiOutputClassifier(base_svc)
svclassifier.fit(train_features_df, train_labels_df)

In [None]:
# Hard class predictions, one column per target.
preds = svclassifier.predict(test_features_df)

# The prediction columns follow the order of the label columns the classifier
# was fitted on, so take the column names from the label frame rather than
# hard-coding them; this keeps predictions and labels aligned by name as well
# as by position.
preds_df = pd.DataFrame(preds, columns=test_labels_df.columns, index=test_labels_df.index)
preds_df

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC ranks samples by score, so it needs continuous scores rather than
# thresholded 0/1 predictions. ``predict_proba`` on a MultiOutputClassifier
# returns one (n_samples, n_classes) array per target; keep column 1 of each.
# Assumes binary 0/1 targets, so that column 1 is the positive class -- TODO confirm.
positive_scores = np.column_stack(
    [target_proba[:, 1] for target_proba in svclassifier.predict_proba(test_features_df)]
)
roc_auc_score(test_labels_df, positive_scores)