In [19]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.model_selection import KFold
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
import itertools
import pyprind
import pickle

%matplotlib inline

In [36]:
# Feature columns, in the order used to build the model input (movie = dummy[col_names]).
# NOTE(review): presumably this must match the column order the pickled models
# were fitted with -- confirm against the training notebook.
col_names = ['백만배우', '백만감독', '백만배급사', '개봉_6월', '액션', '개봉_4월', '멜로/로맨스', '전체관람가', '판타지', '개봉_2월', '15세이상관람가']

# Reference lists saved earlier as a dict keyed 'A'..'E'.
# NOTE(security): pickle.load can execute arbitrary code -- only load lst.pkl
# from a trusted source.
# The context manager closes the file even if unpickling raises.
with open('lst.pkl', 'rb') as file:
    lst = pickle.load(file)

A_lst = lst['A']  # matched against the distributor column (배급사)
B_lst = lst['B']  # matched against the director column (감독)
C_lst = lst['C']  # matched against the actor column (배우)
D_lst = lst['D']  # matched against the nationality column (대표국적)
E_lst = lst['E']  # matched against the genre column (장르)

In [56]:
# Load the films to score (the CSV is cp949-encoded).
movie = pd.read_csv('test.csv', encoding = 'cp949')

# Composite key: title and director joined by '_'.
movie['고유코드'] = [str(x) + '_' + str(y) for (x, y) in zip(movie['영화명'], movie['감독'])]

col_nm = ['개봉일', '종료일', '영화명','대표국적', '제작사', '배급사', '등급', '장르', '감독', '배우', '고유코드']
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the raw CSV frame (avoids SettingWithCopyWarning).
movie = movie[col_nm].copy()

# Number of days on screen, counting both the opening and the closing date.
# An end date earlier than the opening date produces an empty range, i.e. 0.
movie['진행일'] = [len(pd.date_range(opened, closed))
                for (opened, closed) in zip(movie['개봉일'], movie['종료일'])]

# Films with a zero-day run cannot be scored. Previously their rows were blanked
# to None and kept, which made the next cell fail on v[5:7] / .split();
# drop them instead, then renumber the rows from 0.
movie = movie[movie['진행일'] != 0]
movie = movie.drop(['종료일'], axis = 1).reset_index(drop = True)

# Kept as plain lists because `movie` is later replaced by the feature matrix.
day = list(movie['진행일'])
movie_name = list(movie['영화명'])

In [57]:
def _add_dummies(frame, cells, categories, split = False, prefix = ''):
    """Add one 0/1 indicator column per category to ``frame`` (in place).

    frame      -- DataFrame that receives the new columns
    cells      -- Series of raw values sharing ``frame``'s index
    categories -- category labels; each one becomes a column
    split      -- False: a row is flagged when its cell equals the category.
                  True: the cell is split on ',' and the row is flagged when
                  the category is one of the resulting pieces.
    prefix     -- optional text prepended to every column name
    """
    categories = list(categories)
    if split and categories:
        cells = cells.map(lambda cell: cell.split(','))
    for cat in categories:
        column = prefix + cat if prefix else cat
        if split:
            frame[column] = cells.map(lambda pieces: int(cat in pieces))
        else:
            frame[column] = (cells == cat).astype(int)


def _count_listed(cells, names):
    """For every comma-separated cell, count how many entries of ``names`` it contains."""
    names = list(names)

    def count(cell):
        pieces = cell.split(',')
        return sum(name in pieces for name in names)

    return cells.map(count)


# Release-month label per film, e.g. '9월'. Characters 5-6 of the opening date
# are read as the month (assumes 'YYYY-MM-DD'-style strings -- TODO confirm).
release_month = pd.Series([str(int(v[5:7])) + '월' for v in movie['개봉일']],
                          index = movie.index)

dummy = pd.DataFrame(index = movie.index)

# Release-month dummies: one '개봉_<month>' column per month present in the data.
_add_dummies(dummy, release_month, pd.unique(release_month), prefix = '개봉_')

# Rating dummies (the rating cell may list several ratings separated by ',').
_add_dummies(dummy, movie['등급'],
             ['전체관람가', '12세이상관람가', '15세이상관람가', '청소년관람불가'],
             split = True)

# Nationality dummies (exact match against D_lst).
_add_dummies(dummy, movie['대표국적'], D_lst)

# Genre dummies (comma-separated cell matched against E_lst).
_add_dummies(dummy, movie['장르'], E_lst, split = True)

# Per film: how many of its distributors / directors / actors appear in
# A_lst / B_lst / C_lst. Each count is taken from its own source column, so a
# name that appears in two lists (e.g. a director who also acts) no longer
# overwrites the other list's indicator.
dummy_ex = pd.DataFrame({
    '백만배급사': _count_listed(movie['배급사'], A_lst),
    '백만감독': _count_listed(movie['감독'], B_lst),
    '백만배우': _count_listed(movie['배우'], C_lst),
}, index = movie.index, columns = ['백만배급사', '백만감독', '백만배우'])

dummy = dummy.join(dummy_ex)

# col_names needs these month columns even when no film in this file opened in
# that month. Add them only when missing: assigning 0 unconditionally would
# erase the real flag of a film released in February, April or June.
for month_col in ('개봉_2월', '개봉_4월', '개봉_6월'):
    if month_col not in dummy.columns:
        dummy[month_col] = 0

# From here on `movie` is the feature matrix, with columns ordered as col_names.
movie = dummy[col_names]

In [58]:
# Display the engineered feature matrix (columns ordered as col_names) that is passed to the models.
movie

Unnamed: 0,백만배우,백만감독,백만배급사,개봉_6월,액션,개봉_4월,멜로/로맨스,전체관람가,판타지,개봉_2월,15세이상관람가
0,1,1,1,0,1,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,1,0,0,0


# Model loading

In [59]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened through a context manager, so it is closed even when
    unpickling raises.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# pickle is already imported in the notebook's import cell.
# NOTE(security): pickle.load can execute arbitrary code -- only load model
# files that come from a trusted source.

# model1..model3 each predict from the feature matrix; model_agg predicts from
# their three stacked outputs.
model1 = _load_pickle('model1.pkl')
model2 = _load_pickle('model2.pkl')
model3 = _load_pickle('model3.pkl')
model_agg = _load_pickle('model_agg.pkl')

In [60]:
def first_stage(model, data, k = 5, seed = 0):
    """Return out-of-fold predictions of ``model`` for every row of ``data``.

    The rows are split into ``k`` folds with KFold; for each fold the model is
    re-fitted on the remaining rows and used to predict the held-out rows.

    Parameters
    ----------
    model : object with ``fit(X=..., y=...)`` and ``predict(X)``
        Re-fitted in place once per fold.
    data : pandas.DataFrame
        Feature columns plus the target column '평균관객수'.
    k : int
        Number of folds.
    seed : int
        Seed for NumPy's global random generator, set before any fitting.

    Returns
    -------
    numpy.ndarray of shape (len(data), 1)
        Held-out prediction for each row, in the row order of ``data``.
    """
    target = '평균관객수'
    np.random.seed(seed)
    pred = np.zeros((len(data), 1))
    kf = KFold(n_splits = k)
    for train_idx, test_idx in kf.split(data):
        train_data = data.iloc[train_idx]
        test_data = data.iloc[test_idx]
        # .values instead of .as_matrix(): as_matrix was removed in pandas 1.0.
        tr_X = train_data.drop(target, axis = 1).values
        tr_y = train_data[target].values
        te_X = test_data.drop(target, axis = 1).values

        model.fit(X = tr_X, y = tr_y)
        pred[test_idx] = model.predict(te_X).reshape(-1, 1)

    return pred

In [61]:
# Stage 1: each base model predicts from the feature matrix; every result is
# reshaped into a single column.
pred1, pred2, pred3 = [
    base_model.predict(movie).reshape(-1, 1)
    for base_model in (model1, model2, model3)
]

# Stage 2: the three prediction columns, side by side, are the input of the
# aggregating model.
new_X = np.concatenate([pred1, pred2, pred3], axis = 1)
pred = model_agg.predict(new_X)

print(pred)

[ 23358.32248446   8714.3952077     121.20072442]


In [63]:
# Scale each film's prediction by its number of days on screen
# (presumably a per-day average times days gives the total audience -- confirm).
predict = np.multiply(pred, day)

# One line per film: title and rounded total.
for (title, total) in zip(movie_name, predict):
    print('%s: %.0f' % (title, total))

킹스맨: 골든 서클: 327017
남한산성: 95858
넛잡2: 970
