In [1]:
import pandas as pd
import numpy as np
import random
import os
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [2]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Stores ``seed`` (as a string) in the ``PYTHONHASHSEED`` environment
    variable and seeds both the standard-library ``random`` module and
    NumPy's global generator with it.

    Args:
        seed: Integer seed applied to every source.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(7)  # fix the seed

In [3]:
# Load the training and test sets (paths are relative to the current working directory).
train = pd.read_csv("./../DAT/train.csv")
test = pd.read_csv("./../DAT/test.csv")

# Preview the training data. Jupyter renders only a cell's final expression,
# so the preview has to be the last statement of the cell.
train.head()

In [4]:
# Target column to predict.
y_train = train['TARGET']
# Model inputs: every training column except the row identifier and the target.
x_train = train.drop(columns=['ID', 'TARGET'])

# Only the identifier is dropped from the test frame.
x_test = test.drop(columns='ID')

In [5]:
# Display the training frame; pandas abbreviates the middle rows of a frame this long.
train

Unnamed: 0,ID,월,요일,시간,소관경찰서,소관지역,사건발생거리,강수량(mm),강설량(mm),적설량(cm),풍향,안개,짙은안개,번개,진눈깨비,서리,연기/연무,눈날림,범죄발생지,TARGET
0,TRAIN_00000,9,화요일,10,137,8.0,2.611124,0.000000,0.0,0.00,245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,2
1,TRAIN_00001,11,화요일,6,438,13.0,3.209093,0.000000,0.0,0.00,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,0
2,TRAIN_00002,8,일요일,6,1729,47.0,1.619597,0.000000,0.0,0.00,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,인도,1
3,TRAIN_00003,5,월요일,6,2337,53.0,1.921615,11.375000,0.0,0.00,225.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,주거지,1
4,TRAIN_00004,9,일요일,11,1439,41.0,1.789721,0.000000,0.0,0.00,255.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,주유소,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84401,TRAIN_84401,4,일요일,7,336,11.0,3.808190,99.111111,0.0,0.00,165.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,1
84402,TRAIN_84402,8,목요일,12,2149,38.0,1.458490,0.000000,0.0,0.00,200.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,차도,0
84403,TRAIN_84403,7,일요일,6,29,46.0,2.944913,105.888889,0.0,0.00,315.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,차도,0
84404,TRAIN_84404,1,화요일,11,536,25.0,0.493679,2.285714,8.6,10.75,330.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,주거지,1


In [5]:
ordinal_features = ['요일', '범죄발생지']

for feature in ordinal_features:
    # Learn the integer codes from the training column, then encode it.
    le = LabelEncoder().fit(x_train[feature])
    x_train[feature] = le.transform(x_train[feature])

    # The test column may hold values that never appeared in training.
    # Register those values as extra classes before transforming, so that
    # transform() does not fail on an unknown label.
    unseen = [label for label in np.unique(x_test[feature]) if label not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    x_test[feature] = le.transform(x_test[feature])

In [6]:
# Hold out 20% of the training rows for validation; random_state makes the split repeatable.
train_X, val_X, train_Y, val_Y = train_test_split(x_train, y_train, test_size = 0.2, random_state = 7)

#### 모델 학습

In [7]:
# Baseline classifier: library-default hyperparameters with the seed fixed at 0,
# fit on the training split. The same estimator object is later handed to GridSearchCV.
xgb_model = xgb.XGBClassifier(random_state=0)
xgb_model.fit(train_X, train_Y)

In [34]:
# Candidate hyperparameter values for the grid search
# (2 x 2 x 4 x 1 = 16 combinations).
params = dict(
    max_depth=[7],
    learning_rate=[0.1, 0.01, 0.001, 0.0001],
    colsample_bytree=[0.8, 0.9],
    subsample=[0.8, 0.9],
)

In [35]:
# Configure the GridSearchCV object: which model to tune, the candidate
# parameters, and the number of cross-validation folds.
gridcv = GridSearchCV(xgb_model, param_grid=params, cv=5)

In [36]:
# Run the hyperparameter search on the training split.
gridcv.fit(train_X, train_Y)

In [38]:
# Best parameter combination found by the search
print(gridcv.best_params_)

{'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'subsample': 0.8}


In [47]:
# Final hyperparameters, copied from the best_params_ printed by the grid search.
params = {"max_depth" : 7,
         "learning_rate" : 0.1,
         "colsample_bytree" : 0.8,
         "subsample" : 0.8}

# subsample / colsample_bytree below 1 make training depend on the random seed.
# Pin it explicitly to the same value (0) used by the estimator that was tuned,
# so the final fit does not rely on the library's default seed.
xgb_model = xgb.XGBClassifier(n_estimators = 100, random_state = 0)
xgb_model.set_params(**params)

# Train the final model on the training split (the validation split stays held out).
xgb_model.fit(train_X, train_Y)

In [48]:
# Predict class labels for the held-out validation split.
pred_val = xgb_model.predict(val_X)

In [49]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score
import seaborn as sns

In [50]:
# Score the validation predictions: micro-averaged F1, then the confusion matrix.
print(f1_score(val_Y, pred_val, average="micro"))
print(confusion_matrix(val_Y, pred_val))

0.5533112190498756
[[5278  917 1086]
 [1923 2325  839]
 [2048  728 1738]]


In [30]:
# Predict class labels for the encoded test features.
pred_test = xgb_model.predict(x_test)

In [31]:
# Load the sample submission file to use as the output template.
submit = pd.read_csv("./../DAT/sample_submission.csv")

In [32]:
# Overwrite the TARGET column with the test predictions and preview the result.
# NOTE(review): assumes sample_submission.csv rows are in the same order as test.csv — confirm.
submit["TARGET"] = pred_test
submit.head()

Unnamed: 0,ID,TARGET
0,TEST_00000,2
1,TEST_00001,0
2,TEST_00002,0
3,TEST_00003,0
4,TEST_00004,0


In [33]:
# Write the submission to the current working directory, without the index column.
submit.to_csv("sample_submission.csv", index=False)