# Support Vector Machines | Exercise Solution

In [1]:
import pandas as pd

# Read the holiday-rental segmentation CSV and preview its first rows.
csv_path = "data/holiday_rental_consumer_segmentation.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Price,Size,Bedroom,Bathroom,Floor,Car_Parking,Internet,Location,Pet,Event
0,420.0,292.5,3,1,2,Yes,No,Area_2,No,Spa
1,660.0,208.0,3,1,1,Yes,No,Area_1,No,Spa
2,660.0,194.0,3,2,2,Yes,No,Area_3,No,Spa
3,838.0,240.0,3,1,1,Yes,No,Area_1,No,Spa
4,900.0,360.0,3,2,1,Yes,No,Area_4,No,Spa


In [2]:
# Print each column's dtype and non-null count to spot missing values and
# to see which columns are numeric versus object (categorical strings).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191 entries, 0 to 190
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Price        191 non-null    float64
 1   Size         191 non-null    float64
 2   Bedroom      191 non-null    int64  
 3   Bathroom     191 non-null    int64  
 4   Floor        191 non-null    int64  
 5   Car_Parking  191 non-null    object 
 6   Internet     191 non-null    object 
 7   Location     191 non-null    object 
 8   Pet          191 non-null    object 
 9   Event        191 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 15.1+ KB


In [3]:
from sklearn.utils import resample

In [4]:
# Count rows per Event class; the "Spa" count is reused below as the
# target size when upsampling the smaller classes.
df_label = df["Event"].value_counts()
df_label

Event
Spa                   116
Shows                  55
Outdoor activities     20
Name: count, dtype: int64

In [5]:
# Split the rentals into one frame per Event class using boolean masks.
event = df["Event"]
df_spa = df[event == "Spa"]
df_shows = df[event == "Shows"]
df_outdoor = df[event == "Outdoor activities"]

In [6]:
# Upsample the two minority classes (sampling with replacement) until each
# has as many rows as the "Spa" class. A fixed random_state makes the draw,
# and every score computed from it, repeatable across kernel restarts.
# NOTE(review): the duplicated rows are created before cross-validation, so
# copies of the same rental can land in both the training and validation
# folds and inflate accuracy; consider resampling inside each training fold.
resample_seed = 100  # same value as random_seed_val used for the models below
n_majority = df_label["Spa"]
df_shows_upsampled = resample(
    df_shows, replace=True, n_samples=n_majority, random_state=resample_seed
)
df_outdoor_upsampled = resample(
    df_outdoor, replace=True, n_samples=n_majority, random_state=resample_seed
)

In [7]:
# Stack the majority class and both upsampled minority classes into one
# balanced frame, renumbering the index from 0.
balanced_parts = [df_spa, df_shows_upsampled, df_outdoor_upsampled]
df2 = pd.concat(balanced_parts, ignore_index=True)

In [8]:
# Sanity check: after upsampling, every Event class should have the same count.
df2["Event"].value_counts()

Event
Spa                   116
Shows                 116
Outdoor activities    116
Name: count, dtype: int64

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Feature matrix: every column except the last (Event), with the object
# columns one-hot encoded (first level dropped) and the result as an array.
features = df2.iloc[:, :-1]
encoded = pd.get_dummies(features, drop_first=True, dtype="int8")
x = encoded.to_numpy()

# Standardize only the five leading columns (the numeric ones per df.info());
# the dummy columns after them are left as 0/1.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# validation-fold statistics leak into training; a Pipeline would avoid this.
numeric_part = x[:, :5]
x[:, :5] = StandardScaler().fit_transform(numeric_part)
print(x)

# Target vector: Event labels mapped to integer codes.
y = LabelEncoder().fit_transform(df2["Event"])
print(y)

[[-1.08053762  0.34827273  0.01881522 ...  0.          0.
   0.        ]
 [-0.18941657 -0.42777794  0.01881522 ...  0.          0.
   0.        ]
 [-0.18941657 -0.55635438  0.01881522 ...  1.          0.
   0.        ]
 ...
 [-0.45303988  0.44470506 -1.29072386 ...  0.          0.
   0.        ]
 [-0.96914749 -0.50767901 -1.29072386 ...  0.          0.
   0.        ]
 [-0.85775736 -0.87549947 -1.29072386 ...  1.          0.
   0.        ]]
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [10]:
# Shared seed passed as random_state to the estimators registered below.
random_seed_val = 100
# Registry mapping a short display name to an unfitted estimator; the
# cross-validation loop iterates over it.
model_list = {}

In [11]:
from sklearn.linear_model import LogisticRegression

# Register a logistic-regression classifier (library defaults plus the shared seed).
model_list["LR"] = LogisticRegression(random_state=random_seed_val)

In [12]:
from sklearn.neural_network import MLPClassifier

# Register a multi-layer perceptron with logistic (sigmoid) hidden activations.
# max_iter=10000 gives the optimizer a large iteration budget -- presumably
# chosen to avoid convergence warnings on this small dataset.
model_list["MLP"] = MLPClassifier(
    random_state=random_seed_val, activation="logistic", max_iter=10000
)

In [13]:
from sklearn.tree import DecisionTreeClassifier

# Register a decision tree that splits on Gini impurity (hence the "CART" label).
model_list["DT (CART)"] = DecisionTreeClassifier(
    criterion="gini", random_state=random_seed_val
)

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Register a random forest whose trees split on entropy (information gain).
model_list["RF"] = RandomForestClassifier(
    criterion="entropy", random_state=random_seed_val
)

In [15]:
from sklearn.ensemble import AdaBoostClassifier

# Register AdaBoost with library defaults plus the shared seed.
# NOTE(review): the recorded CV output shows AB at ~0.54, far below the other
# tree-based models -- worth checking whether the defaults suit this data.
model_list["AB"] = AdaBoostClassifier(random_state=random_seed_val)

In [16]:
from sklearn.svm import SVC

# Register one support-vector classifier per kernel; all share gamma="auto".
# Insertion order (Linear, RBF, Poly) matches the order used for reporting.
svm_kernels = {
    "SVM (Linear)": "linear",
    "SVM (RBF)": "rbf",
    "SVM (Poly)": "poly",
}
for svm_label, svm_kernel in svm_kernels.items():
    model_list[svm_label] = SVC(gamma="auto", kernel=svm_kernel)

In [17]:
from mixed_naive_bayes import MixedNB

# Register a mixed naive Bayes model that treats feature columns 5..10 as
# categorical and the rest as continuous.
# NOTE(review): presumably columns 5..10 are the one-hot dummy columns that
# follow the five scaled numeric columns -- confirm x has exactly 11 columns.
model_list["MNB"] = MixedNB(categorical_features=list(range(5, 11)))

In [18]:
from sklearn import neighbors

# Register a k-nearest-neighbours classifier: 5 neighbours, uniform vote
# weights, Minkowski metric with p=2 (i.e. Euclidean distance).
model_list["KNN"] = neighbors.KNeighborsClassifier(
    n_neighbors=5, p=2, metric="minkowski", weights="uniform", algorithm="auto"
)

In [19]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Score every registered model with k-fold cross-validation and keep the
# mean of the per-fold scores, keyed by the model's display name.
k_fold = 5
results_cv = {}
for algo, estimator in model_list.items():
    print(algo)  # progress marker: shows which model is being evaluated
    fold_scores = cross_val_score(estimator, x, y, cv=k_fold)
    results_cv[algo] = np.mean(fold_scores)

LR
MLP
DT (CART)
RF
AB
SVM (Linear)
SVM (RBF)
SVM (Poly)
MNB
KNN


In [20]:
# Display the mean cross-validated score for each model.
results_cv

{'LR': np.float64(0.9224844720496893),
 'MLP': np.float64(0.9454658385093169),
 'DT (CART)': np.float64(0.9913871635610766),
 'RF': np.float64(0.9856314699792961),
 'AB': np.float64(0.5409523809523809),
 'SVM (Linear)': np.float64(0.9282815734989647),
 'SVM (RBF)': np.float64(0.9052173913043479),
 'SVM (Poly)': np.float64(0.7787163561076603),
 'MNB': np.float64(0.5939544513457558),
 'KNN': np.float64(0.8307246376811595)}

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline
import seaborn as sns

sns.set_style("whitegrid")

# Rank the models once by (mean score, name) in ascending order, then split
# the pairs into the label and value sequences used by the chart.
ranked = sorted(zip(results_cv.values(), results_cv.keys()))
x_plot = [name for _, name in ranked]
y_plot = [score for score, _ in ranked]
x_pos = list(range(len(x_plot)))

# Horizontal bar chart, one bar per model, labelled with the model names.
plt.barh(x_pos, y_plot, color="royalblue")
plt.yticks(x_pos, x_plot)
plt.title("Model performance (CV=5)")
plt.xlabel("Accuracy")
plt.ylabel("Model")
plt.show()