In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide seaborn styling for every figure drawn afterwards.
# NOTE(review): "MS GOTHIC" is presumably requested so Japanese glyphs render;
# confirm the font is installed on the machine that runs this notebook.
sns.set_theme(
    context="talk",
    style="darkgrid",
    font="MS GOTHIC",
)

read csv

In [55]:
# Load the raw dataset; the first CSV column is used as the row index.
csv_path = './data/TravelInsurancePrediction.csv'
df = pd.read_csv(csv_path, index_col=0)
# Preview the first rows.
df.head()

Unnamed: 0,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,31,Government Sector,Yes,400000,6,1,No,No,0
1,31,Private Sector/Self Employed,Yes,1250000,7,0,No,No,0
2,34,Private Sector/Self Employed,Yes,500000,4,1,No,No,1
3,28,Private Sector/Self Employed,Yes,700000,3,1,No,No,0
4,28,Private Sector/Self Employed,Yes,700000,8,1,Yes,No,0


In [56]:
# Summarise column dtypes, non-null counts and memory usage of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1987 entries, 0 to 1986
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Age                  1987 non-null   int64 
 1   Employment Type      1987 non-null   object
 2   GraduateOrNot        1987 non-null   object
 3   AnnualIncome         1987 non-null   int64 
 4   FamilyMembers        1987 non-null   int64 
 5   ChronicDiseases      1987 non-null   int64 
 6   FrequentFlyer        1987 non-null   object
 7   EverTravelledAbroad  1987 non-null   object
 8   TravelInsurance      1987 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 155.2+ KB


In [57]:
# For every column, print "<name>: <number of distinct values>" followed by
# the distinct values themselves.
for col_name, column in df.items():
    print(f"{col_name}: {column.nunique()}")
    print(column.unique())

Age: 11
[31 34 28 25 33 26 32 29 35 30 27]
Employment Type: 2
['Government Sector' 'Private Sector/Self Employed']
GraduateOrNot: 2
['Yes' 'No']
AnnualIncome: 30
[ 400000 1250000  500000  700000 1150000 1300000 1350000 1450000  800000
 1400000  850000 1500000 1050000  350000 1100000  600000  900000  550000
  300000  750000 1200000 1000000  950000 1700000 1750000  650000  450000
 1650000 1800000 1550000]
FamilyMembers: 8
[6 7 4 3 8 9 5 2]
ChronicDiseases: 2
[1 0]
FrequentFlyer: 2
['No' 'Yes']
EverTravelledAbroad: 2
['No' 'Yes']
TravelInsurance: 2
[0 1]


Separate categorical features and numeric features (`ChronicDiseases` is stored as 0/1 integers but is treated as categorical)

In [58]:
# Columns treated as categories. ChronicDiseases is stored as 0/1 integers
# but is a two-valued flag, so it is grouped with the categorical columns.
cat_features = [
    "Employment Type",
    "GraduateOrNot",
    "ChronicDiseases",
    "FrequentFlyer",
    "EverTravelledAbroad",
]

# Columns treated as numeric quantities.
num_features = [
    "Age",
    "AnnualIncome",
    "FamilyMembers",
]

Analyze categorical features

In [59]:
# Print each categorical column's name and distinct values, followed by a
# two-line separator.
rule = "-----------------------------------------------------------"
for name in cat_features:
    print(name, df[name].unique(), rule, rule, sep="\n")

Employment Type
['Government Sector' 'Private Sector/Self Employed']
-----------------------------------------------------------
-----------------------------------------------------------
GraduateOrNot
['Yes' 'No']
-----------------------------------------------------------
-----------------------------------------------------------
ChronicDiseases
[1 0]
-----------------------------------------------------------
-----------------------------------------------------------
FrequentFlyer
['No' 'Yes']
-----------------------------------------------------------
-----------------------------------------------------------
EverTravelledAbroad
['No' 'Yes']
-----------------------------------------------------------
-----------------------------------------------------------


In [60]:
from sklearn.model_selection import train_test_split

In [61]:
# Split the frame positionally: every column except the last is a feature,
# and the last column is the target.
df_X, df_y = df.iloc[:, :-1], df.iloc[:, -1]

In [63]:
# Hold out 20% of the rows for testing. The split is shuffled, seeded for
# reproducibility, and stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_y,
    test_size=0.2,
    shuffle=True,
    stratify=df_y,
    random_state=17,
)

In [64]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [65]:
# Scaler for the numeric columns (standardises to zero mean, unit variance).
sc = StandardScaler()
# One-hot encoder for the categorical columns; drop="first" omits the first
# category of each feature, so a two-valued column yields a single indicator.
enc = OneHotEncoder(drop="first")

ColumnTransformer: apply all preprocessing in one step (一気に前処理を行う)

In [66]:
from sklearn.compose import ColumnTransformer

In [68]:
# Route each column group to its transformer: numeric columns go to the
# scaler and categorical columns to the one-hot encoder. Any column not named
# in either list is passed through unchanged.
preprocessing_steps = [
    ("num_process", sc, num_features),
    ("cat_process", enc, cat_features),
]
ct = ColumnTransformer(transformers=preprocessing_steps, remainder="passthrough")

In [79]:
# Fit the preprocessing on the training split only, then reuse the fitted
# transformer on the test split so no test-set statistics leak into it.
X_train_transformed = ct.fit_transform(X_train)
X_test_transformed = ct.transform(X_test)
# Show the first three transformed test rows as a sanity check.
X_test_transformed[:3]

array([[-0.91253204, -0.37974587,  0.14760243,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-1.25371167, -1.18198321, -0.47452026,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [ 1.47572534, -0.64715832,  0.76972512,  1.        ,  1.        ,
         0.        ,  1.        ,  0.        ]])

training

In [80]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [81]:
# Baseline classifiers: k-nearest neighbours with library defaults, and an
# SVC with library defaults apart from a fixed random_state.
kn_model = KNeighborsClassifier()
sv_model = SVC(random_state=17)

In [82]:
# Train both baseline classifiers on the preprocessed training split.
# The second call is the cell's last expression, so its repr is displayed.
kn_model.fit(X_train_transformed, y_train)
sv_model.fit(X_train_transformed, y_train)

SVC(random_state=17)

In [85]:
# Test-set accuracy of the baselines, printed in the order KNN, SVC.
for fitted_model in (kn_model, sv_model):
    print(fitted_model.score(X_test_transformed, y_test))

0.7889447236180904
0.8015075376884422


In [89]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [88]:
# Hyperparameter grids to search, one per estimator.
kn_param_grid = {
    "n_neighbors": [10, 15, 20, 25],
    "weights": ["uniform", "distance"],
}
sv_param_grid = {
    "C": [0.1, 1, 10, 20],
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "gamma": ["auto", "scale"],
}

# Each entry pairs an estimator ("model") with the grid to search ("params").
models = [
    {"model": kn_model, "params": kn_param_grid},
    {"model": sv_model, "params": sv_param_grid},
]

In [90]:
# Cross-validation splitter: 5 stratified folds, shuffled with a fixed seed.
k = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)

In [96]:
# Grid-search every candidate in `models` with the shared CV splitter `k`,
# collect the five best-ranked parameter sets of each search in `dfs`, and
# print the winning parameters.
dfs = []
for model in models:
    grid = GridSearchCV(
        estimator=model["model"],
        param_grid=model["params"],
        cv=k,
    )
    grid.fit(X_train_transformed, y_train)
    # Do not bind this summary to `df`: that name holds the raw dataset loaded
    # earlier, and rebinding it breaks re-running the cells that read it.
    # Only the columns from "params" onwards are kept.
    top_results = (
        pd.DataFrame(grid.cv_results_)
        .loc[:, "params":]
        .sort_values("rank_test_score", ascending=True)
        .head(5)
    )
    dfs.append(top_results)
    print(f"best params of {model['model']} : {grid.best_params_}")

best params of KNeighborsClassifier() : {'n_neighbors': 25, 'weights': 'uniform'}
best params of SVC(random_state=17) : {'C': 1, 'gamma': 'scale', 'kernel': 'poly'}


In [99]:
# Hyperparameters hard-coded to match the "best params" printed by the grid
# search; update them by hand if that search is re-run with another outcome.
kn_best_params = {"n_neighbors": 25, "weights": "uniform"}
sv_best_params = {"C": 1, "gamma": "scale", "kernel": "poly"}

kn_best_model = KNeighborsClassifier(**kn_best_params)
sv_best_model = SVC(**sv_best_params)

In [100]:
# Refit the tuned classifiers on the full preprocessed training split.
# The second call is the cell's last expression, so its repr is displayed.
kn_best_model.fit(X_train_transformed, y_train)
sv_best_model.fit(X_train_transformed, y_train)

SVC(C=1, kernel='poly')

In [101]:
# Test-set accuracy of the tuned models, printed in the order KNN, SVC.
for tuned_model in (kn_best_model, sv_best_model):
    print(tuned_model.score(X_test_transformed, y_test))

0.8015075376884422
0.8040201005025126
