<a href="https://colab.research.google.com/github/allyj92/deeplearning/blob/main/travel_insurance_ml_modeling_ipynb%EC%9D%98_%EC%82%AC%EB%B3%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Travel Insurance Prediction

- 데이터 다운로드 : https://www.kaggle.com/datasets/tejashvi14/travel-insurance-prediction-data

- 제공된 데이터
    - train.csv
    - test.csv
    - target columns: `TravelInsurance`
    - 0: 미가입, 1: 가입

- 주어진 학습 데이터세트를 사용하여 고객별 여행 보험 가입 여부를 예측하시오

## 데이터 읽기

In [4]:
# 판다스 임포트
import pandas as pd


In [5]:
# 데이터 불러오기
X = pd.read_csv('train.csv')
X_test = pd.read_csv('test.csv')
y_test = pd.read_csv('y_test.csv')
y_test

Unnamed: 0.1,Unnamed: 0,TravelInsurance
0,1115,0
1,1193,0
2,1804,0
3,1461,0
4,332,1
...,...,...
393,563,1
394,32,1
395,1349,1
396,1400,0


In [6]:
X_test

Unnamed: 0.1,Unnamed: 0,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad
0,1115,34,Private Sector/Self Employed,Yes,700000,2,0,No,No
1,1193,34,Private Sector/Self Employed,No,950000,3,0,No,No
2,1804,28,Private Sector/Self Employed,Yes,1100000,5,0,No,No
3,1461,31,Government Sector,No,400000,8,0,No,No
4,332,31,Private Sector/Self Employed,Yes,1650000,7,1,Yes,No
...,...,...,...,...,...,...,...,...,...
393,563,33,Private Sector/Self Employed,Yes,550000,3,1,No,No
394,32,25,Government Sector,Yes,750000,4,0,No,No
395,1349,26,Private Sector/Self Employed,Yes,1400000,3,0,No,Yes
396,1400,29,Private Sector/Self Employed,Yes,1050000,6,1,Yes,No


## 둘러보기

In [7]:
# train head: 타겟 컬럼을 포함하고 있음
y_train = X['TravelInsurance']
X_train = X.drop(columns = ['TravelInsurance'])

In [8]:
# test head
X_test = X_test.drop(columns=['Unnamed: 0'])

In [9]:
y_test = y_test.drop(columns=['Unnamed: 0'])

In [10]:
# 타겟 분포 확인: 0, 1이 균일 한지 판단


In [11]:
# 수치형 변수 기초 통계 획인


In [12]:
# ChronicDiseases컬럼은 기초 통계를 보면
# 0, 1로 인코딩 되어 있지만 그냥 범주형으로 보는게 맞음
X['ChronicDiseases'].value_counts()

Unnamed: 0_level_0,count
ChronicDiseases,Unnamed: 1_level_1
0,1149
1,440


In [13]:
# 범주형 변수 기초 통계 확인


In [14]:
# 데이터 타입 및 결측치 확인
y_train.isnull().sum()

0

## 데이터 전처리

In [15]:
# 컬럼 분리
# 둘러보기에서 범주형 변수로 처리하기로 한 ChronicDiseases 도 범주형 변수로 함께 처리

# 다음 변수를 ['Employment Type', 'GraduateOrNot', 'FrequentFlyer', 'EverTravelledAbroad', 'ChronicDiseases']
# 로 설정
CAT_VARS = ['Employment Type', 'GraduateOrNot', 'FrequentFlyer', 'EverTravelledAbroad', 'ChronicDiseases']

# 다음 변수를 ['Age', 'AnnualIncome', 'FamilyMembers']
# 로 설정
NUM_VARS = ['Age', 'AnnualIncome', 'FamilyMembers']

print(CAT_VARS)
print(NUM_VARS)

['Employment Type', 'GraduateOrNot', 'FrequentFlyer', 'EverTravelledAbroad', 'ChronicDiseases']
['Age', 'AnnualIncome', 'FamilyMembers']


In [16]:
# train, valid 분리, test_size=0.2, stratify 옵션 지정
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X_train,y_train, test_size=0.2)

### 범주형 변수 인코딩



In [17]:
# CAT_VARS encoding

X_train = pd.get_dummies(X_train, columns= ['Employment Type', 'GraduateOrNot', 'FrequentFlyer', 'EverTravelledAbroad', 'ChronicDiseases'],drop_first=True)
X_train

Unnamed: 0.1,Unnamed: 0,Age,AnnualIncome,FamilyMembers,Employment Type_Private Sector/Self Employed,GraduateOrNot_Yes,FrequentFlyer_Yes,EverTravelledAbroad_Yes,ChronicDiseases_1
646,1936,33,1500000,7,True,True,True,True,False
593,559,29,1200000,3,True,True,False,False,False
927,995,27,850000,5,False,True,False,False,False
1512,243,26,750000,6,True,True,False,False,False
535,1439,33,400000,7,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...
621,280,32,800000,6,False,True,True,False,False
235,1313,29,900000,7,True,True,False,False,False
635,327,34,1300000,3,False,True,False,False,False
703,587,29,900000,9,True,True,False,False,True


In [18]:
X_test = pd.get_dummies(X_test, columns= ['Employment Type', 'GraduateOrNot', 'FrequentFlyer', 'EverTravelledAbroad', 'ChronicDiseases'],drop_first=True)

In [19]:
X_valid = pd.get_dummies(X_valid, columns= ['Employment Type', 'GraduateOrNot', 'FrequentFlyer', 'EverTravelledAbroad', 'ChronicDiseases'],drop_first=True)

### 수치형 변수 스케일링

In [20]:
X_train.dtypes

Unnamed: 0,0
Unnamed: 0,int64
Age,int64
AnnualIncome,int64
FamilyMembers,int64
Employment Type_Private Sector/Self Employed,bool
GraduateOrNot_Yes,bool
FrequentFlyer_Yes,bool
EverTravelledAbroad_Yes,bool
ChronicDiseases_1,bool


In [21]:
# NUM_VARS scaling, StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train[NUM_VARS] = scaler.fit_transform(X_train[NUM_VARS])

X_train

Unnamed: 0.1,Unnamed: 0,Age,AnnualIncome,FamilyMembers,Employment Type_Private Sector/Self Employed,GraduateOrNot_Yes,FrequentFlyer_Yes,EverTravelledAbroad_Yes,ChronicDiseases_1
646,1936,1.139153,1.510535,1.397868,True,True,True,True,False
593,559,-0.230957,0.709190,-1.113355,True,True,False,False,False
927,995,-0.916012,-0.225713,0.142257,False,True,False,False,False
1512,243,-1.258539,-0.492829,0.770062,True,True,False,False,False
535,1439,1.139153,-1.427732,1.397868,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...
621,280,0.796626,-0.359271,0.770062,False,True,True,False,False
235,1313,-0.230957,-0.092156,1.397868,True,True,False,False,False
635,327,1.481681,0.976305,-1.113355,False,True,False,False,False
703,587,-0.230957,-0.092156,2.653480,True,True,False,False,True


In [22]:
X_test[NUM_VARS] = scaler.fit_transform(X_test[NUM_VARS])

X_test

Unnamed: 0,Age,AnnualIncome,FamilyMembers,Employment Type_Private Sector/Self Employed,GraduateOrNot_Yes,FrequentFlyer_Yes,EverTravelledAbroad_Yes,ChronicDiseases_1
0,1.474105,-0.577815,-1.654038,True,True,False,False,False
1,1.474105,0.069624,-1.047304,True,False,False,False,False
2,-0.599018,0.458088,0.166166,True,True,False,False,False
3,0.437544,-1.354743,1.986370,False,False,False,False,False
4,0.437544,1.882455,1.379636,True,True,True,False,True
...,...,...,...,...,...,...,...,...
393,1.128584,-0.966279,-1.047304,True,True,False,False,True
394,-1.635579,-0.448327,-0.440569,False,True,False,False,False
395,-1.290059,1.235015,-1.047304,True,True,False,True,False
396,-0.253497,0.328600,0.772901,True,True,True,False,True


In [23]:
X_valid[NUM_VARS] = scaler.fit_transform(X_valid[NUM_VARS])

X_valid

Unnamed: 0.1,Unnamed: 0,Age,AnnualIncome,FamilyMembers,Employment Type_Private Sector/Self Employed,GraduateOrNot_Yes,FrequentFlyer_Yes,EverTravelledAbroad_Yes,ChronicDiseases_1
1481,613,0.534760,0.834086,0.798006,True,True,False,False,False
1308,1658,1.914505,0.032355,0.798006,True,False,False,True,False
325,1916,-0.500049,0.433220,0.798006,False,True,False,False,True
1296,924,0.534760,-1.437485,0.798006,False,True,False,False,False
429,212,-0.500049,-0.502132,0.182069,True,True,False,False,True
...,...,...,...,...,...,...,...,...,...
453,1950,-0.500049,0.834086,0.798006,True,True,False,True,False
1261,861,0.189823,1.368573,-1.049804,True,False,True,True,True
165,1537,-0.155113,-1.170241,-0.433867,True,True,False,False,False
19,1369,1.569568,-0.502132,-0.433867,True,False,True,False,False


## 모델링 및 평가

In [24]:
# model import, RandomForestClassifier

from sklearn.ensemble import RandomForestClassifier

In [40]:
# model fit
rf = RandomForestClassifier(n_estimators=5, random_state=0)
rf.fit(X_train, y_train)
rf_predict = rf.predict(X_train)

In [43]:
# roc_auc_score로 X_train, X_valid에 대해서 성능 측정
from sklearn.metrics import roc_auc_score
roc_auc_score=roc_auc_score(y_train, rf_predict)
roc_auc_score

0.9741685144124168

## 예측

In [27]:
# 확률로 예측


In [28]:
# 레이블로 예측


## 채점

- 이 과정은 `submit.csv`를 사용하여 시험 출제자가 채점하는 코드로 시험과는 상관없는 부분


In [29]:
# 예측을 데이터 프레임으로 만들어 csv파일로 저장
pd.DataFrame(
    {'TravelInsurance':rf.predict_proba(X_test)[:,1]},
    index=X_test.index
).to_csv('submit.csv')

# 정답 파일 불러오기
y_test = pd.read_csv('y_test.csv', index_col=0)
y_test

y_pred_test = pd.read_csv('submit.csv', index_col=0)

def score(file):
    submit = pd.read_csv(file, index_col=0)
    return roc_auc_score(
        y_test.sort_index()['TravelInsurance'],
        y_pred_test.sort_index()['TravelInsurance']
    )

print(score('submit.csv'))

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Unnamed: 0


---

In [30]:
# [추가 수행 과제] 그리드 서치

param_grid = {
    'n_estimators':[100, 130, 150],
    'max_depth':[8, 10, 12],
    'min_samples_leaf':[2, 3, 5]
}