In [23]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Load the training and test splits of the movies dataset from the local data directory.
train, test = map(pd.read_csv, ('data/movies_train.csv', 'data/movies_test.csv'))

In [24]:
# Remove columns that are not used for modelling, from both splits:
#   dir_prev_bfnum - too many missing values
#   director       - too many distinct names
#   title          - carries no useful meaning for the model
train, test = (
    frame.drop(columns=['dir_prev_bfnum', 'director', 'title'])
    for frame in (train, test)
)

In [25]:
# Count how many training rows belong to each distributor (most frequent first).
train.distributor.value_counts()

CJ 엔터테인먼트        54
롯데엔터테인먼트         52
(주)NEW           30
(주)마운틴픽쳐스        29
(주)쇼박스           26
                 ..
OAL(올)            1
(주)에이원 엔터테인먼트     1
(주)콘텐츠 윙          1
위더스필름             1
퍼스트런              1
Name: distributor, Length: 169, dtype: int64

In [26]:
# Keep the five most frequent distributors of the training set; every other
# distributor is collapsed into the catch-all label '기타' ("other").
distributor_list = train.distributor.value_counts()[:5]


def func(distributor):
    """Return `distributor` unchanged if it is one of the top-5 names, else '기타'."""
    # distributor_list is a Series keyed by distributor name, so membership is
    # checked against its index (which is what `in` on a Series does as well).
    return distributor if distributor in distributor_list.index else '기타'


# The same training-derived top-5 list is applied to both splits.
train['distributor'] = train['distributor'].map(func)
test['distributor'] = test['distributor'].map(func)

def _split_release_time(frame):
    """Return `frame` with integer '년' (year) and '월' (month) columns derived
    from 'release_time', and with 'release_time' itself removed.

    The year is read from characters 0-3 and the month from characters 5-6 of
    each release_time string (presumably 'YYYY-MM-...' formatted - confirm).
    """
    released = frame['release_time']
    with_parts = frame.assign(**{
        '년': released.map(lambda stamp: int(stamp[:4])),
        '월': released.map(lambda stamp: int(stamp[5:7])),
    })
    return with_parts.drop(columns=['release_time'])


# Derive year/month features from the release date for both splits.
train = _split_release_time(train)
test = _split_release_time(test)


In [27]:
# Show the (rows, columns) of both frames after the feature engineering above.
train.shape, test.shape

((600, 10), (243, 9))

In [28]:
# List the columns that remain in the training frame.
train.columns

Index(['distributor', 'genre', 'time', 'screening_rat', 'dir_prev_num',
       'num_staff', 'num_actor', 'box_off_num', '년', '월'],
      dtype='object')

In [29]:
# One-hot encode the non-numeric columns of the training frame.
train = pd.get_dummies(train)

# Encoding train and test independently can produce different dummy columns
# when a category occurs in only one split. Re-align the encoded test frame to
# the training feature columns (every train column except the target):
# dummies absent from test are added as all-zero columns, dummies that exist
# only in test are dropped, and the column order follows train.
feature_columns = train.columns.drop('box_off_num')
test = pd.get_dummies(test).reindex(columns=feature_columns, fill_value=0)

### 모델 정의 및 학습

In [30]:
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [40]:
# Separate the target (box_off_num) from the feature columns.
y = train['box_off_num']
X = train.drop(columns=['box_off_num'])

# Configure the boosted-tree regressor, then fit it on the full training set.
xgb = XGBRegressor(
    random_state=100,
    n_estimators=50,          # number of base estimators (trees)
    max_depth=5,              # maximum depth of each individual tree
    gamma=0,
    reg_lambda=1,             # tuning parameter of the L2 penalty
    importance_type='gain',   # one of: gain, weight, cover, total_gain, total_cover
)
xgb.fit(X, y)

# model=RandomForestRegressor(n_estimators=100)
# model.fit(train_x,train_y)

In [41]:
# R^2 of the fitted model on the data it was trained on. The features/target
# are named X and y in this notebook; `train_x`/`train_y` are never defined
# and would raise NameError on a fresh kernel.
xgb.score(X, y)

0.9926873688700143

In [42]:
# 5-fold cross-validation of the regressor. The scorer reports *negated* MSE,
# so every entry of `scores` is <= 0.
scores = cross_val_score(
    estimator=xgb,
    X=X,
    y=y,
    cv=5,
    scoring='neg_mean_squared_error',
)

In [43]:
# `scores` holds negated MSE per fold: flip the sign and take the square root
# to obtain the per-fold RMSE. Requires numpy to be imported as `np`.
rmse = np.sqrt(-scores)
print('RMSE:', np.round(rmse, 3))
print(f'RMSE average: {rmse.mean():.3f}')

RMSE: [1285976.849 1555339.49  1097762.498 1880503.741 1799421.345]
RMSE average: 1523800.785


In [35]:
# Predictions for the first three training rows.
print(xgb.predict(X)[:3])
print()
# R^2 on the training data the model was fit on (this is neither a test-set
# score nor an accuracy).
print('R2 : ', xgb.score(X, y))
print()
# Feature importances, paired with the columns the model was actually trained
# on (X), rather than indexing by the test frame's column order.
for feature, importance in zip(X.columns, xgb.feature_importances_):
    print(f'{feature} : {importance}')

[ 250470.72 6746486.   6842887.  ]

R2 :  0.9926873688700143

time : 0.053216077387332916
dir_prev_num : 0.01993298903107643
num_staff : 0.08557137101888657
num_actor : 0.04364875704050064
년 : 0.05215051397681236
월 : 0.03509257733821869
distributor_(주)NEW : 0.02228069305419922
distributor_(주)마운틴픽쳐스 : 0.002397670643404126
distributor_(주)쇼박스 : 0.017099568620324135
distributor_CJ 엔터테인먼트 : 0.02042986825108528
distributor_기타 : 0.1295885294675827
distributor_롯데엔터테인먼트 : 0.06871483474969864
genre_SF : 0.11321604996919632
genre_공포 : 0.004631992895156145
genre_느와르 : 0.025184890255331993
genre_다큐멘터리 : 0.0
genre_드라마 : 0.05169403553009033
genre_멜로/로맨스 : 0.006850250996649265
genre_뮤지컬 : 0.0
genre_미스터리 : 0.027246885001659393
genre_서스펜스 : 0.0
genre_애니메이션 : 0.0018154374556615949
genre_액션 : 0.03886017948389053
genre_코미디 : 0.014455589465796947
screening_rat_12세 관람가 : 0.020762668922543526
screening_rat_15세 관람가 : 0.1273520588874817
screening_rat_전체 관람가 : 0.002236901083961129
screening_rat_청소년 관람불가 : 0.0155

In [11]:
# Number of target values available for training.
y.shape

(600,)

In [12]:
# (rows, columns) of the encoded test frame that is passed to the model.
test.shape

(243, 28)

### 학습된 모델로 예측 데이터 생성


In [13]:
# Predict box_off_num for every row of the encoded test frame.
# NOTE(review): this relies on `test` having the same feature columns as the
# X the model was fit on - confirm after any change to the encoding step.
pred = xgb.predict(test)

### 제출 파일 생성

In [14]:
# Load the submission template (title plus a placeholder box_off_num column)
# and display it.
submission = pd.read_csv('data/submission.csv')
submission

Unnamed: 0,title,box_off_num
0,용서는 없다,0
1,아빠가 여자를 좋아해,0
2,하모니,0
3,의형제,0
4,평행 이론,0
...,...,...
238,해에게서 소년에게,0
239,울보 권투부,0
240,어떤살인,0
241,말하지 못한 비밀,0


In [15]:
# Overwrite the placeholder target column with the model's predictions
# (assigned by position) and display the result.
# NOTE(review): the recorded output contains a negative prediction
# (-6.321826e+04); consider clipping predictions at 0 before submitting - confirm.
submission['box_off_num'] = pred
submission

Unnamed: 0,title,box_off_num
0,용서는 없다,3.896304e+05
1,아빠가 여자를 좋아해,1.943162e+06
2,하모니,6.579076e+05
3,의형제,1.296595e+06
4,평행 이론,1.844405e+06
...,...,...
238,해에게서 소년에게,4.263783e+04
239,울보 권투부,2.193333e+05
240,어떤살인,2.135978e+05
241,말하지 못한 비밀,-6.321826e+04


In [16]:
# Write the filled-in submission to disk without the DataFrame index.
submission.to_csv('baseline_xgb.csv',index = False)