# <b>월간 데이콘 3 게임 행동 데이터 분석</b>

-팀 : 도발하려던건 아니었습니다만-

### <b>라이브러리 및 데이터</b>

In [0]:
#### 라이브러리 설치

In [0]:
import os                                            # 디렉토리 설정
os.chdir("/content/gdrive/My Drive/starcraft/really_last")
import warnings                                      # 경고 메세지 무시
warnings.filterwarnings('ignore')
import pandas as pd                                  # 데이터 조작, 분석
import numpy as np                                   # 행렬 연산
import random                                        # 난수 생성
random.seed(2020)
random_seed = 2020
import time                                          # 시간 측정
import re                                            # 정규표현식

from sklearn.model_selection import train_test_split # train, validation 데이터 나누기
from sklearn import metrics                          # AUC 측정
!pip install catboost
from catboost import CatBoostClassifier, Pool        # CatBoost 모델링
import lightgbm as lgb                               # lightGBM 모델링
from sklearn.model_selection import KFold            # K-fold CV    
!pip install bayesian-optimization
from bayes_opt import BayesianOptimization           # 베이지안 최적화 라이브러리  
from functools import partial                        # 함수 변수 고정

#### 데이터 불러오기

In [0]:
# Load the training and test event logs from the current working directory.
train, test = map(pd.read_csv, ("train.csv", "test.csv"))

### <b>데이터 전처리</b>

X

In [0]:
# Create X with one row per training game.
# NOTE(review): indexing by range(n) assumes train game_ids are exactly
# 0..n-1 with no gaps -- confirm against the data.
n = train.game_id.max() + 1
X = pd.DataFrame(index=range(n))

# time: the `time` value of the last logged row of each game.
X['time'] = train.drop_duplicates(['game_id'], keep='last').time.reset_index(drop=True)
# The conversion reads the raw value as minutes.seconds (e.g. 1.15 -> 75).
# Round to whole hundredths before the integer arithmetic: 1.15 * 100 is
# 114.99999999999999 in floating point, so truncating it directly would
# yield 74 instead of 75.
hundredths = (X.time * 100).round().astype(int)
X['time'] = hundredths // 100 * 60 + hundredths % 100

# Species dummy variables: for each player take the first logged row of every
# game, one-hot encode its species, and place those columns in front of X.
species_columns = (
  (0, {'P':'0_protoss','T':'0_terran','Z':'0_zerg'}),
  (1, {'P':'1_protoss','T':'1_terran','Z':'1_zerg'}),
)
species_dummies = []
for player_id, renamed in species_columns:
  first_rows = train[train.player == player_id].drop_duplicates(['game_id'])
  species = first_rows.set_index(['game_id']).species
  species_dummies.append(pd.get_dummies(species).rename(columns=renamed))
X = pd.concat(species_dummies + [X], axis=1)

# Event count: number of logged rows (non-null `time`) per player and game,
# with `player` moved from the row index into the columns.
per_player_game = train[['player', 'game_id', 'time']].groupby(['player', 'game_id']).count()
contents = per_player_game.unstack(level=0)
contents.columns = ['0_event', '1_event']
for count_col in ('0_event', '1_event'):
  X[count_col] = contents[count_col]

# Event count divided by the game's time feature
for count_col in ('0_event', '1_event'):
  X[count_col + '_per_sec'] = X[count_col] / X.time

# Per-game count of every event type (Ability, AddToControlGroup, Camera,
# ControlGroup, GetControlGroup, Right Click, Selection, SetControlGroup in the
# original note) for each player.
contents = (train.loc[:, ['player', 'event', 'game_id', 'time']]
            .groupby(['player', 'event', 'game_id']).count()
            .unstack(level=[0, 1]).fillna(0).astype(int))
# Name each column from its own (player, event) labels instead of assigning a
# positional list, which only lines up when every player/event pair is present
# and the columns happen to be in sorted order.
contents.columns = [str(int(player)) + '_' + event for _, player, event in contents.columns]
# Make sure both players get a column for every event type (0 when a pair
# never occurs), in the same order the original positional labels used.
event_names = sorted(train.event.unique())
contents = contents.reindex(columns=[prefix + event for prefix in ['0_', '1_'] for event in event_names],
                            fill_value=0)
for i in contents.columns:
  X[i] = contents[i]

# Camera events: Euclidean distances between consecutive 2-D camera positions
# parsed from event_contents, summarised as sum / min / median / max.
def _camera_steps(i):
  """Return the distances between consecutive positions in ``i`` as an array.

  ``i`` is a pandas Series of strings. For each string, x is the text from
  index 4 up to the first comma and y is the text from two characters after
  that comma up to (not including) the last character, i.e. a layout such as
  ``"xxx(12.5, 30.0)"`` with a 4-character prefix is assumed (TODO confirm
  against the raw data). Fewer than two positions give an empty array.
  """
  xs = i.map(lambda s: s[4:s.find(',')]).astype(float).values
  ys = i.map(lambda s: s[s.find(',')+2:len(s)-1]).astype(float).values
  return np.sqrt(np.diff(xs) ** 2 + np.diff(ys) ** 2)

def move_sum(i):
  """Return the total distance travelled; 0 when there are no steps."""
  steps = _camera_steps(i)
  return steps.sum() if steps.size else 0

def move_min(i):
  """Return the shortest single step; 0 when there are no steps."""
  steps = _camera_steps(i)
  return steps.min() if steps.size else 0

def move_median(i):
  """Return the median single step; 0 when there are no steps."""
  steps = _camera_steps(i)
  return np.median(steps) if steps.size else 0

def move_max(i):
  """Return the longest single step; 0 when there are no steps."""
  steps = _camera_steps(i)
  return steps.max() if steps.size else 0
# Apply the four camera-movement summaries to every (player, game) group of
# Camera events, then spread the players across the columns.
camera_rows = train[train.event == 'Camera'][['player', 'game_id', 'event_contents']]
camera_stats = camera_rows.groupby(['player', 'game_id']).agg([move_sum, move_min, move_median, move_max])
contents = camera_stats.unstack(level=0)
contents.columns = [prefix + stat for stat in ['sum','min','median','max'] for prefix in ['0_move_','1_move_']]
for stat_col in list(contents.columns):
  # Missing values are zero-filled before the series is aligned to X's index.
  X[stat_col] = contents[stat_col].fillna(0)

# move_sum restricted to Camera events with time < 0.3 (described as the first
# 30 seconds in the original note).
early_camera = train[(train.event == 'Camera') & (train.time < 0.3)]
early_groups = early_camera[['player', 'game_id', 'event_contents']].groupby(['player', 'game_id'])
contents = early_groups.agg(move_sum).unstack(level=0)
contents.columns = ['0_move_sum_30sec','1_move_sum_30sec']
for early_col in ('0_move_sum_30sec', '1_move_sum_30sec'):
  X[early_col] = contents[early_col]

# Ability events: per-game count of each ability code for each player.
# The code is the text between the first '(' and the first ')' of
# event_contents (a hexadecimal code according to the original note).
ability_text = train.event_contents[train.event == 'Ability']
contents = ability_text.map(lambda text: text[text.find('(')+1:text.find(')')]).to_frame()
contents['game_id'] = train.game_id
contents['player'] = train.player
contents['count'] = 1
# Empty frame listing a column for both players and every code, so that a
# code used by only one player still produces a column for the other one.
contents_X = pd.DataFrame(columns=[prefix + code for prefix in ['0_', '1_']
                                   for code in contents.event_contents.unique()])
contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
contents.columns = [str(player) + '_' + code for _, player, code in contents.columns]
contents_X = pd.concat([contents_X, contents])
for ability_col in contents_X.columns:
  # Align to X's index; games without this ability count as 0.
  X[ability_col] = contents_X[ability_col].reindex(X.index).fillna(0).astype(int)

# Ability counts divided by the game's time feature
for ability_col in contents_X.columns:
  X[ability_col + '_div_time'] = X[ability_col] / X.time

# Selection events: per-game count of each selected item name for each player.
# Cleaning of event_contents: remove every "<whitespace>[.....]" token whose
# bracketed part is 7, then 6, then 5 characters long, then drop the remaining
# brackets, spaces and single quotes. The patterns are raw strings so that
# `\s` and `\[` are regex escapes rather than invalid string escapes.
contents = train[train.event == 'Selection'].event_contents.map(
    lambda x: re.sub(r'\s\[.....\]', '', re.sub(r'\s\[......\]', '', re.sub(r'\s\[.......\]', '', x)))
    .replace('[', '').replace(']', '').replace(' ', '').replace('\'', ''))
contents = contents.str.split(',')
# Flatten the per-event lists: one frame per list position, keeping the
# original row index so game_id / player can be aligned back from train.
lengths = contents.map(len)
contents = pd.concat([contents[lengths > k].map(lambda names, k=k: names[k]).to_frame()
                      for k in range(lengths.max())])
contents['game_id'], contents['player'], contents['count'] = train.game_id, train.player, 1
# Empty frame listing a column for both players and every item name.
contents_X = pd.DataFrame(columns=[x+y for x in ['0_','1_'] for y in contents.event_contents.unique()])
contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
contents.columns = contents.columns.map(lambda x: str(x[1])+'_'+x[2])
contents_X = pd.concat([contents_X, contents])
for i in contents_X.columns:
  X[i] = contents_X[i]
  X[i] = X[i].fillna(0).astype(int)

# Selection counts divided by the game's time feature
for i in contents_X.columns:
  X[i+'_div_time'] = X[i] /X.time

# Selection events with time < 0.3 (the first 30 seconds in the original
# note): per-game count of each selected item name for each player.
# Cleaning of event_contents: remove every "<whitespace>[.....]" token whose
# bracketed part is 7, then 6, then 5 characters long, then drop the remaining
# brackets, spaces and single quotes. The patterns are raw strings so that
# `\s` and `\[` are regex escapes rather than invalid string escapes.
contents = train[(train.time < 0.3) & (train.event == 'Selection')].event_contents.map(
    lambda x: re.sub(r'\s\[.....\]', '', re.sub(r'\s\[......\]', '', re.sub(r'\s\[.......\]', '', x)))
    .replace('[', '').replace(']', '').replace(' ', '').replace('\'', ''))
contents = contents.str.split(',')
# Flatten the per-event lists: one frame per list position, keeping the
# original row index so game_id / player can be aligned back from train.
lengths = contents.map(len)
contents = pd.concat([contents[lengths > k].map(lambda names, k=k: names[k]).to_frame()
                      for k in range(lengths.max())])
contents['game_id'], contents['player'], contents['count'] = train.game_id, train.player, 1
# Empty frame listing a column for both players and every item name.
contents_X = pd.DataFrame(columns=[x+y for x in ['0_','1_'] for y in contents.event_contents.unique()])
contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
contents.columns = contents.columns.map(lambda x: str(x[1])+'_'+x[2])
contents_X = pd.concat([contents_X, contents])
for i in contents_X.columns:
  X[i+'_30sec'] = contents_X[i]
  X[i+'_30sec'] = X[i+'_30sec'].fillna(0).astype(int)

# Right Click events whose event_contents starts with 'Target': per-game count
# of each target name for each player. The name is the text from two
# characters after the first ':' up to the first ' ['.
is_target_click = (train.event == 'Right Click') & (train.event_contents.map(lambda text: str(text)[:6]) == 'Target')
target_names = train.event_contents[is_target_click].map(lambda text: text[text.find(':')+2:text.find(' [')])
contents = target_names.to_frame()
contents['game_id'] = train.game_id
contents['player'] = train.player
contents['count'] = 1
# Empty frame listing a column for both players and every target name.
contents_X = pd.DataFrame(columns=[prefix + target for prefix in ['0_Target_','1_Target_']
                                   for target in contents.event_contents.unique()])
contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
contents.columns = [str(player) + '_Target_' + target for _, player, target in contents.columns]
contents_X = pd.concat([contents_X, contents])
for target_col in contents_X.columns:
  # Align to X's index; games without this target count as 0.
  X[target_col] = contents_X[target_col].reindex(X.index).fillna(0).astype(int)

# Order the columns by name
X = X.loc[:, sorted(X.columns)]

# Build X1, a copy of X with the two players' roles exchanged, and stack it
# under X to double the data. The relabelling is positional: the first c
# column names and the next c column names are swapped and the last column is
# labelled 'time'.
# NOTE(review): this presumes the sorted columns are c '0_' columns, then the
# c matching '1_' columns, then 'time' -- confirm.
c = X.shape[1] // 2
X1 = X.copy()
first_half = list(X.columns[:c])
second_half = list(X.columns[c:2*c])
X1.columns = second_half + first_half + ['time']
X1.index = [n + k for k in range(n)]
X = pd.concat([X, X1])

y

In [0]:
# Label: `winner`, one entry per distinct (game_id, winner) pair in file order.
y = train.drop_duplicates(['game_id', 'winner']).winner.reset_index(drop=True)
# Append the flipped labels (1 - winner) for the player-swapped copy of the
# features. pd.concat is used because Series.append was removed in pandas 2.0.
y = pd.concat([y, 1 - y]).reset_index(drop=True)

test_X

In [0]:
# Create test_X with one row per test game_id between the smallest and the
# largest id. The index starts at the test set's own first id rather than at
# the global `n`: `n` is reassigned by the model cells (n = 1 / n = 2), so
# relying on it breaks this cell when it is re-run later.
first_test_id = int(test.game_id.min())
nn = int(test.game_id.max()) - first_test_id + 1
test_X = pd.DataFrame(index=range(first_test_id, first_test_id + nn))

# time: the `time` value of the last logged row of each game.
test_X['time'] = test.drop_duplicates(['game_id'],keep='last').set_index('game_id').time
# The conversion reads the raw value as minutes.seconds (e.g. 1.15 -> 75).
# Round to whole hundredths before the integer arithmetic: 1.15 * 100 is
# 114.99999999999999 in floating point, so truncating it directly would
# yield 74 instead of 75.
hundredths = (test_X.time * 100).round().astype(int)
test_X['time'] = hundredths // 100 * 60 + hundredths % 100

# Species dummy variables: for each player take the first logged row of every
# game, one-hot encode its species, and place those columns in front of test_X.
species_columns = (
  (0, {'P':'0_protoss','T':'0_terran','Z':'0_zerg'}),
  (1, {'P':'1_protoss','T':'1_terran','Z':'1_zerg'}),
)
species_dummies = []
for player_id, renamed in species_columns:
  first_rows = test[test.player == player_id].drop_duplicates(['game_id'])
  species = first_rows.set_index('game_id').species
  species_dummies.append(pd.get_dummies(species).rename(columns=renamed))
test_X = pd.concat(species_dummies + [test_X], axis=1)

# Event count: number of logged rows (non-null `time`) per player and game,
# with `player` moved from the row index into the columns.
per_player_game = test[['player', 'game_id', 'time']].groupby(['player', 'game_id']).count()
contents = per_player_game.unstack(level=0)
contents.columns = ['0_event', '1_event']
for count_col in ('0_event', '1_event'):
  test_X[count_col] = contents[count_col]

# Event count divided by the game's time feature
for count_col in ('0_event', '1_event'):
  test_X[count_col + '_per_sec'] = test_X[count_col] / test_X.time

# Per-game count of every event type (Ability, AddToControlGroup, Camera,
# ControlGroup, GetControlGroup, Right Click, Selection, SetControlGroup in the
# original note) for each player.
contents = (test.loc[:, ['player', 'event', 'game_id', 'time']]
            .groupby(['player', 'event', 'game_id']).count()
            .unstack(level=[0, 1]).fillna(0).astype(int))
# Name each column from its own (player, event) labels instead of assigning a
# positional list, which only lines up when every player/event pair is present
# and the columns happen to be in sorted order.
contents.columns = [str(int(player)) + '_' + event for _, player, event in contents.columns]
# Make sure both players get a column for every event type (0 when a pair
# never occurs), in the same order the original positional labels used.
event_names = sorted(test.event.unique())
contents = contents.reindex(columns=[prefix + event for prefix in ['0_', '1_'] for event in event_names],
                            fill_value=0)
for i in contents.columns:
  test_X[i] = contents[i]

# Camera events: sum / min / median / max of the Euclidean distances between
# consecutive 2-D camera positions, per (player, game), with the players
# spread across the columns.
camera_rows = test[test.event == 'Camera'][['player', 'game_id', 'event_contents']]
camera_stats = camera_rows.groupby(['player', 'game_id']).agg([move_sum, move_min, move_median, move_max])
contents = camera_stats.unstack(level=0)
contents.columns = [prefix + stat for stat in ['sum','min','median','max'] for prefix in ['0_move_','1_move_']]
for stat_col in list(contents.columns):
  # Missing values are zero-filled before the series is aligned to test_X's index.
  test_X[stat_col] = contents[stat_col].fillna(0)

# move_sum restricted to Camera events with time < 0.3 (described as the first
# 30 seconds in the original note).
early_camera = test[(test.event == 'Camera') & (test.time < 0.3)]
early_groups = early_camera[['player', 'game_id', 'event_contents']].groupby(['player', 'game_id'])
contents = early_groups.agg(move_sum).unstack(level=0)
contents.columns = ['0_move_sum_30sec','1_move_sum_30sec']
for early_col in ('0_move_sum_30sec', '1_move_sum_30sec'):
  test_X[early_col] = contents[early_col]

# Ability events: per-game count of each ability code for each player.
# The code is the text between the first '(' and the first ')' of
# event_contents (a hexadecimal code according to the original note).
ability_text = test.event_contents[test.event == 'Ability']
contents = ability_text.map(lambda text: text[text.find('(')+1:text.find(')')]).to_frame()
contents['game_id'] = test.game_id
contents['player'] = test.player
contents['count'] = 1
# Empty frame listing a column for both players and every code, so that a
# code used by only one player still produces a column for the other one.
contents_X = pd.DataFrame(columns=[prefix + code for prefix in ['0_', '1_']
                                   for code in contents.event_contents.unique()])
contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
contents.columns = [str(player) + '_' + code for _, player, code in contents.columns]
contents_X = pd.concat([contents_X, contents])
for ability_col in contents_X.columns:
  # Align to test_X's index; games without this ability count as 0.
  test_X[ability_col] = contents_X[ability_col].reindex(test_X.index).fillna(0).astype(int)

# Ability counts divided by the game's time feature
for ability_col in contents_X.columns:
  test_X[ability_col + '_div_time'] = test_X[ability_col] / test_X.time

# Selection events: per-game count of each selected item name for each player.
# Cleaning of event_contents: remove every "<whitespace>[.....]" token whose
# bracketed part is 7, then 6, then 5 characters long, then drop the remaining
# brackets, spaces and single quotes. The patterns are raw strings so that
# `\s` and `\[` are regex escapes rather than invalid string escapes.
contents = test[test.event == 'Selection'].event_contents.map(
    lambda x: re.sub(r'\s\[.....\]', '', re.sub(r'\s\[......\]', '', re.sub(r'\s\[.......\]', '', x)))
    .replace('[', '').replace(']', '').replace(' ', '').replace('\'', ''))
contents = contents.str.split(',')
# Flatten the per-event lists: one frame per list position, keeping the
# original row index so game_id / player can be aligned back from test.
lengths = contents.map(len)
contents = pd.concat([contents[lengths > k].map(lambda names, k=k: names[k]).to_frame()
                      for k in range(lengths.max())])
contents['game_id'], contents['player'], contents['count'] = test.game_id, test.player, 1
# Empty frame listing a column for both players and every item name.
contents_X = pd.DataFrame(columns=[x+y for x in ['0_','1_'] for y in contents.event_contents.unique()])
contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
contents.columns = contents.columns.map(lambda x: str(x[1])+'_'+x[2])
contents_X = pd.concat([contents_X, contents])
for i in contents_X.columns:
  test_X[i] = contents_X[i]
  test_X[i] = test_X[i].fillna(0).astype(int)

# Selection counts divided by the game's time feature
for i in contents_X.columns:
  test_X[i+'_div_time'] = test_X[i] /test_X.time

# Selection events with time < 0.3 (the first 30 seconds in the original
# note): per-game count of each selected item name for each player.
# Cleaning of event_contents: remove every "<whitespace>[.....]" token whose
# bracketed part is 7, then 6, then 5 characters long, then drop the remaining
# brackets, spaces and single quotes. The patterns are raw strings so that
# `\s` and `\[` are regex escapes rather than invalid string escapes.
contents = test[(test.time < 0.3) & (test.event == 'Selection')].event_contents.map(
    lambda x: re.sub(r'\s\[.....\]', '', re.sub(r'\s\[......\]', '', re.sub(r'\s\[.......\]', '', x)))
    .replace('[', '').replace(']', '').replace(' ', '').replace('\'', ''))
contents = contents.str.split(',')
# Flatten the per-event lists: one frame per list position, keeping the
# original row index so game_id / player can be aligned back from test.
lengths = contents.map(len)
contents = pd.concat([contents[lengths > k].map(lambda names, k=k: names[k]).to_frame()
                      for k in range(lengths.max())])
contents['game_id'], contents['player'], contents['count'] = test.game_id, test.player, 1
# Empty frame listing a column for both players and every item name.
contents_X = pd.DataFrame(columns=[x+y for x in ['0_','1_'] for y in contents.event_contents.unique()])
contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
contents.columns = contents.columns.map(lambda x: str(x[1])+'_'+x[2])
contents_X = pd.concat([contents_X, contents])
for i in contents_X.columns:
  test_X[i+'_30sec'] = contents_X[i]
  test_X[i+'_30sec'] = test_X[i+'_30sec'].fillna(0).astype(int)

# Right Click events whose event_contents starts with 'Target': per-game count
# of each target name for each player. The name is the text from two
# characters after the first ':' up to the first ' ['.
is_target_click = (test.event == 'Right Click') & (test.event_contents.map(lambda text: str(text)[:6]) == 'Target')
target_names = test.event_contents[is_target_click].map(lambda text: text[text.find(':')+2:text.find(' [')])
contents = target_names.to_frame()
contents['game_id'] = test.game_id
contents['player'] = test.player
contents['count'] = 1
# Empty frame listing a column for both players and every target name.
contents_X = pd.DataFrame(columns=[prefix + target for prefix in ['0_Target_','1_Target_']
                                   for target in contents.event_contents.unique()])
contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
contents.columns = [str(player) + '_Target_' + target for _, player, target in contents.columns]
contents_X = pd.concat([contents_X, contents])
for target_col in contents_X.columns:
  # Align to test_X's index; games without this target count as 0.
  test_X[target_col] = contents_X[target_col].reindex(test_X.index).fillna(0).astype(int)

# Order the columns by name
test_X = test_X.loc[:, sorted(test_X.columns)]

In [0]:
# Keep only the feature columns that X and test_X have in common (in place).
X.drop([col for col in X.columns if col not in test_X.columns], axis=1, inplace=True)
test_X.drop([col for col in test_X.columns if col not in X.columns], axis=1, inplace=True)

### <b>모델 생성, 적용</b>

## depth parameter 깊게 준 모델 
random_seed = 2022

In [0]:
# Seed for this model group, and a zero-filled accumulator (one entry per
# test_X row) that the training cells add their averaged predictions to.
random_seed = 2022
test_pred1 = pd.Series([0] * len(test_X), index=test_X.index)

Catboost - Depthwise \
with random_seed =2022 

In [0]:
# Train CatBoost (Depthwise) with 10-fold cross-validation, repeated n time(s),
# and add each fold model's averaged test prediction to test_pred1.
n = 1
n_splits = 10
for i in range(n):

  # shuffle=True is needed for random_state to have any effect: without it
  # scikit-learn ignores the seed and, from version 0.24, raises ValueError.
  # NOTE(review): X stacks every game and its player-swapped copy, so a game
  # and its copy can land in different folds -- confirm this is acceptable.
  kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed+i)
  for train_index, valid_index in kf.split(X):
    train_X, train_y = X.iloc[train_index], y.iloc[train_index]
    valid_X, valid_y = X.iloc[valid_index], y.iloc[valid_index]

    # CatBoost
    model = CatBoostClassifier(iterations=25000,              # at most 25000 boosting iterations
                               learning_rate=0.01564,           # learning rate (fixed value)
                               depth=12,                       # tree depth (original note: up to 16, default 6)
                               l2_leaf_reg= 49.99,               # L2 regularization
                               random_seed=random_seed+i,       # fixed random seed
                               eval_metric='AUC',             # evaluate with AUC
                               metric_period=25000,            # interval for intermediate metric output
                               early_stopping_rounds=1000,    # stop when AUC has not improved for 1000 iterations
                               grow_policy='Depthwise',   # tree growing policy (original note: SymmetricTree is wide with uniform depth, Lossguide is deep)
                               task_type='GPU'                # train on GPU
                               )
    model.fit(train_X, train_y, eval_set=(valid_X, valid_y))

    # Average over 2 model types (Depthwise + Lossguide) x folds x repeats.
    test_pred1 += model.predict_proba(test_X)[:,1] /(2*n_splits*n)

Catboost - Lossguide \
with random_seed =2022

In [0]:
# Train CatBoost (Lossguide) with 10-fold cross-validation, repeated n time(s),
# and add each fold model's averaged test prediction to test_pred1.
n = 1
n_splits = 10
for i in range(n):

  # shuffle=True is needed for random_state to have any effect: without it
  # scikit-learn ignores the seed and, from version 0.24, raises ValueError.
  # NOTE(review): X stacks every game and its player-swapped copy, so a game
  # and its copy can land in different folds -- confirm this is acceptable.
  kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed+i)
  for train_index, valid_index in kf.split(X):
    train_X, train_y = X.iloc[train_index], y.iloc[train_index]
    valid_X, valid_y = X.iloc[valid_index], y.iloc[valid_index]

    # CatBoost
    model = CatBoostClassifier(iterations=25000,              # at most 25000 boosting iterations
                               learning_rate=0.01213,           # learning rate (fixed value)
                               depth=16,                       # tree depth (original note: up to 16, default 6)
                               l2_leaf_reg= 5.027,               # L2 regularization
                               random_seed=random_seed+i,       # fixed random seed
                               eval_metric='AUC',             # evaluate with AUC
                               metric_period=25000,            # interval for intermediate metric output
                               early_stopping_rounds=1000,    # stop when AUC has not improved for 1000 iterations
                               grow_policy='Lossguide',   # tree growing policy (original note: SymmetricTree is wide with uniform depth, Lossguide is deep)
                               task_type='GPU'                # train on GPU
                               )
    model.fit(train_X, train_y, eval_set=(valid_X, valid_y))

    # Average over 2 model types (Depthwise + Lossguide) x folds x repeats.
    test_pred1 += model.predict_proba(test_X)[:,1] /(2*n_splits*n)

In [0]:
# Arrange the accumulated predictions in the sample submission's layout
# (its index and its column labels).
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)
submission_layout = dict(index=sample_submission.index, columns=sample_submission.columns)
data1 = pd.DataFrame(data=test_pred1, **submission_layout)

## depth parameter 깊게 준 모델 
random_seed = 2023

Catboost - Depthwise \
with random_seed =2023

In [0]:
# Seed for this model group, and a zero-filled accumulator (one entry per
# test_X row) that the training cells add their averaged predictions to.
random_seed = 2023
test_pred2 = pd.Series([0] * len(test_X), index=test_X.index)

In [0]:
# Train CatBoost (Depthwise) with 10-fold cross-validation, repeated n time(s),
# and add each fold model's averaged test prediction to test_pred2.
n = 1
n_splits = 10
for i in range(n):

  # shuffle=True is needed for random_state to have any effect: without it
  # scikit-learn ignores the seed and, from version 0.24, raises ValueError.
  # NOTE(review): X stacks every game and its player-swapped copy, so a game
  # and its copy can land in different folds -- confirm this is acceptable.
  kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed+i)
  for train_index, valid_index in kf.split(X):
    train_X, train_y = X.iloc[train_index], y.iloc[train_index]
    valid_X, valid_y = X.iloc[valid_index], y.iloc[valid_index]

    # CatBoost
    model = CatBoostClassifier(iterations=25000,              # at most 25000 boosting iterations
                               learning_rate=0.01564,           # learning rate (fixed value)
                               depth=12,                       # tree depth (original note: up to 16, default 6)
                               l2_leaf_reg= 49.99,               # L2 regularization
                               random_seed=random_seed+i,       # fixed random seed
                               eval_metric='AUC',             # evaluate with AUC
                               metric_period=25000,            # interval for intermediate metric output
                               early_stopping_rounds=1000,    # stop when AUC has not improved for 1000 iterations
                               grow_policy='Depthwise',   # tree growing policy (original note: SymmetricTree is wide with uniform depth, Lossguide is deep)
                               task_type='GPU'                # train on GPU
                               )
    model.fit(train_X, train_y, eval_set=(valid_X, valid_y))

    # Average over 2 model types (Depthwise + Lossguide) x folds x repeats.
    test_pred2 += model.predict_proba(test_X)[:,1] /(2*n_splits*n)

Catboost - Lossguide \
with random_seed =2023

In [0]:
# Train CatBoost (Lossguide) with 10-fold cross-validation, repeated n time(s),
# and add each fold model's averaged test prediction to test_pred2.
n = 1
n_splits = 10
for i in range(n):

  # shuffle=True is needed for random_state to have any effect: without it
  # scikit-learn ignores the seed and, from version 0.24, raises ValueError.
  # NOTE(review): X stacks every game and its player-swapped copy, so a game
  # and its copy can land in different folds -- confirm this is acceptable.
  kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed+i)
  for train_index, valid_index in kf.split(X):
    train_X, train_y = X.iloc[train_index], y.iloc[train_index]
    valid_X, valid_y = X.iloc[valid_index], y.iloc[valid_index]

    # CatBoost
    model = CatBoostClassifier(iterations=25000,              # at most 25000 boosting iterations
                               learning_rate=0.01213,           # learning rate (fixed value)
                               depth=16,                       # tree depth (original note: up to 16, default 6)
                               l2_leaf_reg= 5.027,               # L2 regularization
                               random_seed=random_seed+i,       # fixed random seed
                               eval_metric='AUC',             # evaluate with AUC
                               metric_period=25000,            # interval for intermediate metric output
                               early_stopping_rounds=1000,    # stop when AUC has not improved for 1000 iterations
                               grow_policy='Lossguide',   # tree growing policy (original note: SymmetricTree is wide with uniform depth, Lossguide is deep)
                               task_type='GPU'                # train on GPU
                               )
    model.fit(train_X, train_y, eval_set=(valid_X, valid_y))

    # Average over 2 model types (Depthwise + Lossguide) x folds x repeats.
    test_pred2 += model.predict_proba(test_X)[:,1] /(2*n_splits*n)

In [0]:
# Arrange the accumulated predictions in the sample submission's layout
# (its index and its column labels).
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)
submission_layout = dict(index=sample_submission.index, columns=sample_submission.columns)
data2 = pd.DataFrame(data=test_pred2, **submission_layout)

## depth parameter 얕게 준 모델들 

In [0]:
# Seed for this model group, and a zero-filled accumulator (one entry per
# test_X row) that the training cells add their averaged predictions to.
random_seed = 2014
test_pred3 = pd.Series([0] * len(test_X), index=test_X.index)

Catboost - Depthwise


In [0]:
# Train CatBoost (Depthwise) with 10-fold cross-validation, repeated n times
# (seeds random_seed, random_seed+1, ...), and add each fold model's averaged
# test prediction to test_pred3.
n = 2
n_splits = 10
for i in range(n):

  # shuffle=True is needed for random_state to have any effect: without it
  # scikit-learn ignores the seed (every repeat would reuse the same folds)
  # and, from version 0.24, raises ValueError.
  # NOTE(review): X stacks every game and its player-swapped copy, so a game
  # and its copy can land in different folds -- confirm this is acceptable.
  kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed+i)
  for train_index, valid_index in kf.split(X):
    train_X, train_y = X.iloc[train_index], y.iloc[train_index]
    valid_X, valid_y = X.iloc[valid_index], y.iloc[valid_index]

    # CatBoost
    model = CatBoostClassifier(iterations=25000,              # at most 25000 boosting iterations
                               learning_rate=0.02423,           # learning rate (fixed value)
                               depth=10,                       # tree depth (original note: up to 16, default 6)
                               l2_leaf_reg= 20.35,               # L2 regularization
                               random_seed=random_seed+i,       # fixed random seed
                               eval_metric='AUC',             # evaluate with AUC
                               metric_period=25000,             # interval for intermediate metric output
                               early_stopping_rounds=1000,    # stop when AUC has not improved for 1000 iterations
                               grow_policy='Depthwise',   # tree growing policy (original note: SymmetricTree is wide with uniform depth, Lossguide is deep)
                               task_type='GPU'                # train on GPU
                               )
    model.fit(train_X, train_y, eval_set=(valid_X, valid_y))

    # Average over 2 model types (Depthwise + Lossguide) x folds x repeats.
    test_pred3 += model.predict_proba(test_X)[:,1] /(2*n_splits*n)

Catboost - Lossguide

In [0]:
# Train CatBoost (Lossguide) with 10-fold cross-validation, repeated n times
# (seeds random_seed, random_seed+1, ...), and add each fold model's averaged
# test prediction to test_pred3.
n = 2
n_splits = 10
for i in range(n):

  # shuffle=True is needed for random_state to have any effect: without it
  # scikit-learn ignores the seed (every repeat would reuse the same folds)
  # and, from version 0.24, raises ValueError.
  # NOTE(review): X stacks every game and its player-swapped copy, so a game
  # and its copy can land in different folds -- confirm this is acceptable.
  kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed+i)
  for train_index, valid_index in kf.split(X):
    train_X, train_y = X.iloc[train_index], y.iloc[train_index]
    valid_X, valid_y = X.iloc[valid_index], y.iloc[valid_index]

    # CatBoost
    model = CatBoostClassifier(iterations=25000,              # at most 25000 boosting iterations
                               learning_rate=0.01063,           # learning rate (fixed value)
                               depth=8,                       # tree depth (original note: up to 16, default 6)
                               l2_leaf_reg= 5.127,               # L2 regularization
                               random_seed=random_seed+i,       # fixed random seed
                               eval_metric='AUC',             # evaluate with AUC
                               metric_period=25000,            # interval for intermediate metric output
                               early_stopping_rounds=1000,    # stop when AUC has not improved for 1000 iterations
                               grow_policy='Lossguide',   # tree growing policy (original note: SymmetricTree is wide with uniform depth, Lossguide is deep)
                               task_type='GPU'                # train on GPU
                               )
    model.fit(train_X, train_y, eval_set=(valid_X, valid_y))
    # Average over 2 model types (Depthwise + Lossguide) x folds x repeats.
    test_pred3 += model.predict_proba(test_X)[:,1] /(2*n_splits*n)

In [0]:
# Arrange the accumulated predictions in the sample submission's layout
# (its index and its column labels).
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)
submission_layout = dict(index=sample_submission.index, columns=sample_submission.columns)
data3 = pd.DataFrame(data=test_pred3, **submission_layout)

### 최종 모델 앙상블 

In [0]:
# Final ensemble: unweighted mean of the three prediction frames, written to
# CSV together with its index; the bare name on the last line displays the
# frame in the notebook.
ensemble_total = data1 + data2 + data3
data_final = ensemble_total / 3
data_final.to_csv("최종제출.csv", index=True)
data_final