In [None]:
# Mount Google Drive so the Colab runtime can work with it.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# 라이브러리 불러오기
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

In [None]:
# Load the training data.
train_csv_path = '/data/round2_train.csv'
train = pd.read_csv(train_csv_path)

In [None]:
# Build the frame used to predict P5 from P1-P4.
price_cols = ['P1', 'P2', 'P3', 'P4', 'P5']
temp = train[price_cols]

# Rows where P5 is 0 are bids that were not won, so keep only the rows with a non-zero P5.
has_p5 = temp['P5'] != 0
new_temp = temp[has_p5]

In [None]:
# Fix the TensorFlow random seed.
tf.random.set_seed(2022)

# Build the model that predicts P5: a 4-feature input, three hidden swish layers
# (64 -> 128 -> 64 units) and a single output unit with no activation.
hidden_units = (64, 128, 64)
p5_layers = [keras.layers.Input(shape=(4,))]
p5_layers.extend(keras.layers.Dense(units, activation='swish') for units in hidden_units)
p5_layers.append(keras.layers.Dense(1))
model = keras.models.Sequential(p5_layers)

# Optimise mean squared error with Adam at a learning rate of 0.0001.
adam = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=adam, loss='mse')

In [None]:
# Early stopping: halt training when val_loss has not improved for 10 epochs,
# and restore the best weights seen so far.
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

es = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

# Checkpoint: write the model to disk only when val_loss improves.
best_model_path = '/result/220929best_model.h5'
cp = ModelCheckpoint(filepath=best_model_path, monitor='val_loss', save_best_only=True, verbose=1)

In [None]:
# Train the DNN-based P5 prediction model on the rows with a non-zero P5.
# 10% of the data is used for validation; both callbacks monitor val_loss.
p5_feature_cols = ['P1', 'P2', 'P3', 'P4']
model.fit(
    x=new_temp[p5_feature_cols],
    y=new_temp[['P5']],
    validation_split=0.1,
    epochs=100,
    callbacks=[es, cp],
    verbose=1,
)

Epoch 1/100
Epoch 1: val_loss improved from inf to 232.55473, saving model to /content/drive/MyDrive/공모전_본선/220929best_model.h5
Epoch 2/100
Epoch 2: val_loss improved from 232.55473 to 149.41519, saving model to /content/drive/MyDrive/공모전_본선/220929best_model.h5
Epoch 3/100
Epoch 3: val_loss did not improve from 149.41519
Epoch 4/100
Epoch 4: val_loss did not improve from 149.41519
Epoch 5/100
Epoch 5: val_loss did not improve from 149.41519
Epoch 6/100
Epoch 6: val_loss did not improve from 149.41519
Epoch 7/100
Epoch 7: val_loss did not improve from 149.41519
Epoch 8/100
Epoch 8: val_loss did not improve from 149.41519
Epoch 9/100
Epoch 9: val_loss did not improve from 149.41519
Epoch 10/100
Epoch 10: val_loss did not improve from 149.41519
Epoch 11/100
Epoch 11: val_loss did not improve from 149.41519
Epoch 12/100

Epoch 12: val_loss did not improve from 149.41519
Epoch 12: early stopping


<keras.callbacks.History at 0x7f229649cd10>

In [None]:
# Load the weights from the best-model checkpoint file, then save the model
# so it can be reused when preprocessing the test data.
best_weights_path = '/result/220929best_model.h5'
final_model_path = '/result/p5_predict_model_final.h5'
model.load_weights(best_weights_path)
model.save(final_model_path)

In [None]:
# Reload the saved P5 prediction model from disk.
# Import from `tensorflow.keras` rather than the standalone `keras` package so the model is
# loaded through the same Keras namespace that built and saved it in this notebook
# (`from tensorflow import keras`, `tensorflow.keras.callbacks`).
from tensorflow.keras.models import Sequential, load_model
model = load_model('/result/p5_predict_model_final.h5')

In [None]:
# Predict P5 from P1-P4 for every row of train (including rows whose P5 is 0).
p5_inputs = train[['P1', 'P2', 'P3', 'P4']]
p5_pred = model.predict(p5_inputs)

In [None]:
# Add the predicted P5 as a new column.
# NOTE(review): `p5_pred` is the raw `model.predict` output, presumably a 2-D (n_rows, 1)
# array; confirm that the pandas version in use accepts it as a single column.
train['pred_p5'] = p5_pred

In [None]:
# Predict P5 with a Linear Regression model.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Target is P5; features are every other column of new_temp.
y = new_temp['P5']
x = new_temp.drop(columns=['P5'])

# Hold out 10% of the rows with a fixed random_state.
(x_train_temp, x_test_temp,
 y_train_temp, y_test_temp) = train_test_split(x, y, random_state=2022, test_size=0.1)

# Train the Linear Regression model that predicts P5.
model_lr = LinearRegression()
model_lr.fit(x_train_temp, y_train_temp)

LinearRegression()

In [None]:
# Save the Linear Regression model so it can be reused when preprocessing the test data.
from joblib import dump, load

lr_model_path = '/result/model_lr_final.joblib'
dump(model_lr, lr_model_path)

['/content/drive/MyDrive/공모전_본선/model_lr_final.joblib']

In [None]:
# Load the saved Linear Regression model.
from joblib import dump, load

lr_model_file = '/result/model_lr_final.joblib'
model_lr = load(lr_model_file)

In [None]:
# Predict P5 from P1-P4 with the Linear Regression model and show the result.
lr_inputs = train[['P1', 'P2', 'P3', 'P4']]
new_feat = model_lr.predict(lr_inputs)
new_feat

array([ 40.22388712, 603.33614488, 268.32827486, ..., 168.57262526,
        43.92051556,  55.81634806])

In [None]:
# New feature 1: the P5 value predicted by the Linear Regression model.
train['new_feature'] = new_feat

# New feature 2: a binary flag.
# Working assumption: P4 is the AX bid price and P2, P3 are bid floors, so a bid can only
# be won when the AX bid price is higher than both bid floors.
# The flag is 1 when P4 is strictly greater than both P2 and P3, and 0 otherwise
# (i.e. when either bid floor is greater than or equal to the bid price).
bid_above_floors = (train['P2'] < train['P4']) & (train['P3'] < train['P4'])
train['new_feature2'] = bid_above_floors.astype('int64')

# New features 3-7: new feature 2 is categorical, so numeric counterparts built on the
# same idea are added as pairwise differences between the price columns.
difference_pairs = [('P4', 'P3'), ('P4', 'P2'), ('P1', 'P2'), ('P1', 'P3'), ('P2', 'P3')]
for minuend, subtrahend in difference_pairs:
    train[f'{minuend} - {subtrahend}'] = train[minuend] - train[subtrahend]

In [None]:
# Display the frame to inspect the newly added feature columns.
train

Unnamed: 0,시각,ADID,ADID 타입,DSP ID,매체 ID,애드유닛 ID,플랫폼,OS 종류,사이즈 ID,노출 ID,...,P5,winning,pred_p5,new_feature,new_feature2,P4 - P3,P4 - P2,P1 - P2,P1 - P3,P2 - P3
0,20220403000000,45503fc5d8d3a06832a087581ff6b6b4937b4d41d2d2cd...,1,d22127d7ef82d6f222558447b65f1fb63c36a55d71d0c3...,03eef1b13012d013345f17def773eceda258d8d9829484...,4a3406328504b21257aa66fc138ba78668bef318b361f9...,1,1,3,eb2cb8788b9462b49c35c1ca9a4b5828b9153274041a3a...,...,0.000000,0,39.553089,40.223887,0,9.121389,-3.667611,15.901556,28.690556,12.78900
1,20220403000000,59ae8bfa45031b28d30dc1ad6486d702148b1fbd461fd5...,1,becd0e5686fae9c4a1885a950817a70f9ae3c437d32a60...,d64dbe2039165cce22fae7c7f16259596a196f149f08e7...,a2f05b9f2662f44757f69656801c4c0bd83e95caa10ddb...,1,1,2,c215404374628df7ea645a3cbc8a40df8972ca95b46f30...,...,590.973600,1,611.452454,603.336145,1,686.952000,674.163000,1156.491000,1169.280000,12.78900
2,20220403000000,c57351c77526224145cc7a5f48de890754943c71363d80...,5,becd0e5686fae9c4a1885a950817a70f9ae3c437d32a60...,75cb5f369879cd6fa900bea129c1037432a00d6ae94128...,40f3cb27c781d1130b7565836803f952a2e327e6db72ae...,3,1,3,ed2e5996d4f0f524177eb84050fb10680899d93dc8d1fe...,...,255.780012,1,263.828583,268.328275,1,45.796800,39.396210,232.327410,238.728000,6.40059
3,20220403000000,93aca0b9a68e60ff423feeeae48d02007fab3920bc5d4f...,1,d22127d7ef82d6f222558447b65f1fb63c36a55d71d0c3...,03eef1b13012d013345f17def773eceda258d8d9829484...,4a3406328504b21257aa66fc138ba78668bef318b361f9...,1,1,3,71d399d29493273a7b33fd13344c6ba7764b5e336df44a...,...,0.000000,0,39.553089,40.223887,0,9.121389,-3.667611,15.901556,28.690556,12.78900
4,20220403000000,92a710e711b4d911c9c20ce04c6ce0381a3fbd9dcb3ba3...,1,d22127d7ef82d6f222558447b65f1fb63c36a55d71d0c3...,03eef1b13012d013345f17def773eceda258d8d9829484...,4a3406328504b21257aa66fc138ba78668bef318b361f9...,1,1,3,003d0bfc2d6181540c0343eb1b5b242b7219c1878d581e...,...,0.000000,0,39.553089,40.223887,0,9.121389,-3.667611,15.901556,28.690556,12.78900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9149649,20220410132810,02dd628573a4c42a339c20a317ac7c7ae8774b50345465...,1,becd0e5686fae9c4a1885a950817a70f9ae3c437d32a60...,554b9474090ae6e25f9c6b93294eb85a2ef720ee49476d...,dc83186485f48c299c25f841bc8b866f4b80d901d825fe...,1,1,1,5f70dcdb57e5a3d9870cf1f73123ce3915ac79eaddb0ab...,...,0.000000,0,39.542450,42.314951,0,6.925929,-5.968071,23.209215,36.103215,12.89400
9149650,20220410132810,be828aa948d49f5a36a0df9730d7603b2df8876d3a706d...,1,becd0e5686fae9c4a1885a950817a70f9ae3c437d32a60...,554b9474090ae6e25f9c6b93294eb85a2ef720ee49476d...,dc83186485f48c299c25f841bc8b866f4b80d901d825fe...,1,1,1,74548a86ca5b5c68e198e054a05b4eb355ae4739ee13f8...,...,0.000000,0,63.752773,62.384599,1,31.240320,18.346320,63.733200,76.627200,12.89400
9149651,20220410132810,bd520bad60995fd2c5f82d385d0168a2b2ad4268c82a95...,1,95369047233b18ea02be6d42399d7f6a2262834ae5d066...,f50daedc59d7a0d361e9bbc543ed3f0b9276578b2aa43e...,de36c9fee3be8d1c87d6d4223689334ada3bb37ddef18e...,3,1,1,ff0f0cf934dc283da81964368a3040d03ade3a82eddf12...,...,0.000000,0,165.502457,168.572625,1,63.080952,65.880792,189.801427,187.001587,-2.79984
9149652,20220410132810,507c42b839168430b334a9e20d02e781f88ba4642b7d59...,1,6b8a464a76ce5580d84bc6a87ca1179b4088d3c46d3952...,8b604a9f98507d296e404198d5aea33a11e4d4e0146a83...,aaebc0f56ed3262863bc83c0b24458856b41f5c654274c...,1,1,1,fd191cc4babca4971e871759a0c47850252cb6bbade481...,...,0.000000,0,40.985077,43.920516,0,8.871072,-4.022928,26.451120,39.345120,12.89400


In [None]:
# Drop P5 and most of the categorical variables (modifies `train` in place).
drop_cols = [
    '시각',
    '광고 응답 소재 카테고리',
    '매체 ID',
    'ADID',
    'DSP ID',
    '애드유닛 ID',
    '노출 ID',
    'SSP 입찰ID',
    'DSP 입찰ID',
    'AX 낙찰ID',
    'WUID (웹 유저 ID)',
    'OS 버전 ID',
    '국가코드 ID',
    'P5',
    '광고 응답 광고주 도메인',
]
train.drop(columns=drop_cols, inplace=True)

In [None]:
# One-hot encode the remaining categorical variables, dropping the first level of each.
categorical_cols = ['ADID 타입', '플랫폼', 'OS 종류', '사이즈 ID']
train_d = pd.get_dummies(train, columns=categorical_cols, drop_first=True)

In [None]:
# Separate the target (`winning`) from the features.
y = train_d['winning']
x = train_d.drop(columns='winning')

# Remove the intermediate frame to free Colab RAM.
del train_d

In [None]:
# Check the shapes of the feature matrix and the target.
x.shape, y.shape

((9149654, 21), (9149654,))

In [None]:
# Split into a training set and a validation set (10% held out, fixed random_state).
from sklearn.model_selection import train_test_split

(x_train, x_val,
 y_train, y_val) = train_test_split(x, y, random_state=2022, test_size=0.1)

In [None]:
# Check the shapes of the training and validation splits.
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((8234688, 21), (914966, 21), (8234688,), (914966,))

In [None]:
# Import the decision tree classifier from scikit-learn and fit it on the training split.
from sklearn.tree import DecisionTreeClassifier

model_dt = DecisionTreeClassifier(random_state=2022).fit(x_train, y_train)
# Leave the fitted estimator as the last expression so the cell still displays it.
model_dt

DecisionTreeClassifier(random_state=2022)

In [None]:
# Evaluate the decision tree on the validation split.
# NOTE(review): the wildcard import hides where names come from; this cell itself only
# uses confusion_matrix and classification_report.
from sklearn.metrics import *

pred_dt = model_dt.predict(x_val)

val_confusion = confusion_matrix(y_val, pred_dt)
print(val_confusion)
val_report = classification_report(y_val, pred_dt)
print(val_report)

[[600289  42864]
 [ 84975 186838]]
              precision    recall  f1-score   support

           0       0.88      0.93      0.90    643153
           1       0.81      0.69      0.75    271813

    accuracy                           0.86    914966
   macro avg       0.84      0.81      0.82    914966
weighted avg       0.86      0.86      0.86    914966



In [None]:
# Save the fitted decision tree model to disk.
from joblib import dump, load

dt_model_path = '/result/model_dt_final_final.joblib'
dump(model_dt, dt_model_path)

['/content/drive/MyDrive/공모전_본선/model_dt_final_final.joblib']