# 모듈 임포트

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 구글 드라이브 mount

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# 첫번째 시도

## 데이터 읽기

In [None]:
data_dir = './drive/MyDrive/빅데이터 분석가 양성 과정 자료실/딥러닝/dataset/'

In [None]:
train_df = pd.read_csv(data_dir + "영화 관객수/movies_train.csv")
train_df.head()

In [None]:
submission_df = pd.read_csv(data_dir + "영화 관객수/movies_test.csv")
submission_df.head()

## train + submission 데이터 합치기

In [None]:
# Stack the training rows on top of the submission rows so that both go
# through exactly the same preprocessing.
# ignore_index=True gives the combined frame a fresh 0..N-1 index; without it
# each half keeps its own 0-based labels from read_csv, so every label in the
# submission part collides with a training label.
all_df = pd.concat([train_df, submission_df], ignore_index=True)
all_df.head()

In [None]:
all_df.info()

## 안쓸 컬럼 삭제

In [None]:
all_df.drop(columns=['title',
                     'distributor',
                     'genre',
                     'release_time',
                     'screening_rat',
                     'director'], inplace=True)
all_df.info()

## 결측치 처리

In [None]:
# Replace every NaN in the combined frame with 0.
# NOTE(review): this presumably also fills box_off_num for the submission rows
# (movies_test.csv likely has no target column) - confirm; those zeros would
# then take part in the y_min / y_max computed further down.
all_df = all_df.fillna(0)

In [None]:
all_df.info()

## 전처리 전의 데이터 분포 보기

### time

In [None]:
plt.hist(all_df.time, bins=100)
plt.show()

### dir_prev_bfnum

In [None]:
plt.hist(all_df.dir_prev_bfnum, bins=100)
plt.show()

### dir_prev_num

In [None]:
plt.hist(all_df.dir_prev_num, bins=100)
plt.show()

### num_staff

In [None]:
plt.hist(all_df.num_staff, bins=100)
plt.show()

### num_actor

In [None]:
plt.hist(all_df.num_actor, bins=100)
plt.show()

### box_off_num

In [None]:
plt.hist(all_df.box_off_num, bins=100)
plt.show()

## 컬럼별 전처리

### time
200이 넘는 이상치는 200으로 바꾼다.

In [None]:
all_df.loc[all_df.time>200, 'time'] = 200

### dir_prev_bfnum
1을 더한 로그처리 한다.

In [None]:
all_df['dir_prev_bfnum'] = np.log1p(all_df.dir_prev_bfnum)

### num_staff
1을 더한 로그처리 한다.

In [None]:
all_df['num_staff'] = np.log1p(all_df.num_staff)

### num_actor
10보다 큰 이상치를 10으로 변환

In [None]:
all_df.loc[all_df.num_actor>10, 'num_actor'] = 10

### box_off_num
1을 더한 로그처리 한다.

In [None]:
all_df['box_off_num'] = np.log1p(all_df.box_off_num)

## Normalization

In [None]:
# all_df['time'] = (all_df.time - all_df.time.min()) / (all_df.time.max() - all_df.time.min())
# all_df['dir_prev_bfnum'] = (all_df.dir_prev_bfnum - all_df.dir_prev_bfnum.min()) / (all_df.dir_prev_bfnum.max() - all_df.dir_prev_bfnum.min())
# all_df['dir_prev_num'] = (all_df.dir_prev_num - all_df.dir_prev_num.min()) / (all_df.dir_prev_num.max() - all_df.dir_prev_num.min())
# all_df['num_staff'] = (all_df.num_staff - all_df.num_staff.min()) / (all_df.num_staff.max() - all_df.num_staff.min())
# all_df['num_actor'] = (all_df.num_actor - all_df.num_actor.min()) / (all_df.num_actor.max() - all_df.num_actor.min())

In [None]:
# Min-max scale the numeric feature columns, column by column.
col_list = ['time', 'dir_prev_bfnum', 'dir_prev_num', 'num_staff', 'num_actor']
col_min = all_df[col_list].min()
col_range = all_df[col_list].max() - col_min
all_df[col_list] = (all_df[col_list] - col_min) / col_range

In [None]:
all_df.head()

### box_off_num

In [None]:
# Remember the target's range before scaling: the same y_min / y_max are needed
# later to map model outputs back to the log scale.
y_min = all_df['box_off_num'].min()
y_max = all_df['box_off_num'].max()

all_df['box_off_num'] = (all_df['box_off_num'] - y_min) / (y_max - y_min)

## 전처리 후의 데이터 분포 보기

### time

In [None]:
plt.hist(all_df.time, bins=100)
plt.show()

### dir_prev_bfnum  

In [None]:
plt.hist(all_df.dir_prev_bfnum, bins=100)
plt.show()

### dir_prev_num    

In [None]:
plt.hist(all_df.dir_prev_num, bins=100)
plt.show()

### num_staff       

In [None]:
plt.hist(all_df.num_staff, bins=100)
plt.show()

### num_actor       

In [None]:
plt.hist(all_df.num_actor, bins=100)
plt.show()

### box_off_num     

In [None]:
plt.hist(all_df.box_off_num, bins=100)
plt.show()

## train, test, submission 데이터 분리

In [None]:
# Positional split of the combined frame: rows 0-499 for training, rows 500-599
# as a held-out test slice, everything after that for the submission.
train_df = all_df.iloc[:500]
test_df = all_df.iloc[500:600]
submission_df = all_df.iloc[600:]

# Sanity check: print the size of each part.
for part in (train_df, test_df, submission_df):
    print(len(part))

## x, y로 분리

In [None]:
# Training split: feature matrix and target vector as NumPy arrays.
train_x_df = train_df.drop(columns=["box_off_num"])
train_x = train_x_df.to_numpy()
train_y = train_df["box_off_num"].to_numpy()
print(train_x.shape)
print(train_y.shape)

# Test split: same separation of features and target.
test_x_df = test_df.drop(columns=["box_off_num"])
test_x = test_x_df.to_numpy()
test_y = test_df["box_off_num"].to_numpy()
print(test_x.shape)
print(test_y.shape)

# Submission split: only the features are used.
submission_x_df = submission_df.drop(columns=["box_off_num"])
submission_x = submission_x_df.to_numpy()
print(submission_x.shape)


## 딥러닝 학습

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Dense

# Small fully connected regressor: two 10-unit ReLU hidden layers and a single
# linear output.
# The input width is read from train_x instead of being hard-coded to 5, so the
# cell keeps working if the set of feature columns changes.
model = keras.Sequential()
model.add(Dense(10, activation='relu', input_shape=(train_x.shape[1],)))
model.add(Dense(10, activation='relu'))
model.add(Dense(1))

model.compile(optimizer="SGD", loss="mse", metrics=["mape"])
model.summary()

# Train silently; validation_split=0.1 holds out 10% of the training arrays.
history = model.fit(train_x, train_y, epochs=1000, verbose=0, batch_size=128, validation_split=0.1)

# Learning curves: training loss and validation loss per epoch.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.show()



# Score on the test slice (values are on the normalised target scale).
loss, mape = model.evaluate(test_x, test_y)
print("loss=", loss)
print("mape=", mape)

# Scatter of actual vs. predicted values for the test slice.
y_ = model.predict(test_x)

plt.scatter(test_y, y_)
plt.show()

In [None]:
plt.plot(history.history['loss'][50:])
plt.plot(history.history['val_loss'][50:])
plt.show()

## RMSE 계산하기

In [None]:
# Map predictions and ground truth back to the original box_off_num scale so the
# error can be measured there: first undo the min-max scaling, then undo log1p.
# np.expm1 is the exact inverse of np.log1p and stays accurate for values near
# zero, unlike exp(x) - 1.
predicted = np.expm1(y_ * (y_max - y_min) + y_min)

org_test_y = np.expm1(test_y * (y_max - y_min) + y_min)

In [None]:
from tensorflow.keras.metrics import RootMeanSquaredError

rmse = RootMeanSquaredError()(org_test_y, predicted)
print("rmse = ", rmse)

plt.hist(predicted, bins=100)
plt.show()
plt.hist(org_test_y, bins=100)
plt.show()

## 예측하기

In [None]:
predicted = model.predict(submission_x)

In [None]:
plt.hist(predicted, bins=100)
plt.show()

normalized = ( t - min )  / (max - min)

t - min = normalized * (max - min)

t = normalized * (max - min) + min

In [None]:
predicted = predicted * ( y_max - y_min) + y_min

In [None]:
plt.hist(predicted, bins=100)
plt.show()

In [None]:
# Undo the log1p applied to the target; expm1 is its exact inverse and avoids
# the precision loss of exp(x) - 1 for small x.
predicted = np.expm1(predicted)

In [None]:
plt.hist(predicted, bins=100)
plt.show()

## submission 파일 만들기

In [None]:
!head ./drive/MyDrive/'빅데이터 분석가 양성 과정 자료실'/딥러닝/dataset/'영화 관객수'/submission.csv

In [None]:
submission_df = pd.read_csv(data_dir + "영화 관객수/movies_test.csv")
submission_df.head()

In [None]:
final_df = pd.DataFrame(submission_df.title)
final_df.head()

In [None]:
# np.int was removed in NumPy 1.24 (AttributeError); the builtin int is the
# documented replacement.
# predicted comes from model.predict on a Dense(1) output, presumably shape
# (n, 1), so flatten it to one value per row before storing it as a column.
final_df['box_off_num'] = predicted.astype(int).ravel()
final_df.head()

In [None]:
final_df.to_csv('submission.csv', index=False)

In [None]:
!head submission.csv

# 두번째 시도
카테고리 데이터도 사용하자.

## 데이터 읽기

In [None]:
data_dir = './drive/MyDrive/빅데이터 분석가 양성 과정 자료실/딥러닝/dataset/'

In [None]:
train_df = pd.read_csv(data_dir + "영화 관객수/movies_train.csv")
train_df.head()

In [None]:
submission_df = pd.read_csv(data_dir + "영화 관객수/movies_test.csv")
submission_df.head()

## train + submission 데이터 합치기

In [None]:
# Stack the training rows on top of the submission rows so that both go
# through exactly the same preprocessing.
# ignore_index=True gives the combined frame a fresh 0..N-1 index; without it
# each half keeps its own 0-based labels from read_csv, so every label in the
# submission part collides with a training label.
all_df = pd.concat([train_df, submission_df], ignore_index=True)
all_df.head()

In [None]:
all_df.info()

## 카테고리 데이터 인코딩

In [None]:
# Convert each categorical text column to the pandas 'category' dtype and keep
# its integer codes in a companion '<column>_code' column.
for cat_col in ('distributor', 'genre', 'screening_rat'):
    all_df[cat_col] = all_df[cat_col].astype('category')
    all_df[cat_col + '_code'] = all_df[cat_col].cat.codes

## 카테고리 데이터 one-hot encoding

In [None]:
# One-hot encode the integer category codes.
# dtype=float keeps the indicator columns numeric: pandas >= 2.0 otherwise
# emits bool columns, and a frame mixing bool and float columns becomes an
# object-dtype array in to_numpy(), which Keras cannot convert to a tensor.
all_df = pd.get_dummies(
    all_df,
    columns=['distributor_code', 'genre_code', 'screening_rat_code'],
    dtype=float,
)

In [None]:
all_df.info()

In [None]:
all_df.head()

## 안쓸 컬럼 삭제

In [None]:
all_df.drop(columns=['title',
                     'distributor',
                     'genre',
                     'release_time',
                     'screening_rat',
                     'director'], inplace=True)
all_df.info()

## 결측치 처리

In [None]:
# Replace every NaN in the combined frame with 0.
# NOTE(review): this presumably also fills box_off_num for the submission rows
# (movies_test.csv likely has no target column) - confirm; those zeros would
# then take part in the y_min / y_max computed further down.
all_df = all_df.fillna(0)

In [None]:
all_df.info()

## 전처리 전의 데이터 분포 보기

### time

In [None]:
plt.hist(all_df.time, bins=100)
plt.show()

### dir_prev_bfnum

In [None]:
plt.hist(all_df.dir_prev_bfnum, bins=100)
plt.show()

### dir_prev_num

In [None]:
plt.hist(all_df.dir_prev_num, bins=100)
plt.show()

### num_staff

In [None]:
plt.hist(all_df.num_staff, bins=100)
plt.show()

### num_actor

In [None]:
plt.hist(all_df.num_actor, bins=100)
plt.show()

### box_off_num

In [None]:
plt.hist(all_df.box_off_num, bins=100)
plt.show()

## 컬럼별 전처리

### time
200이 넘는 이상치는 200으로 바꾼다.

In [None]:
all_df.loc[all_df.time>200, 'time'] = 200

### dir_prev_bfnum
1을 더한 로그처리 한다.

In [None]:
all_df['dir_prev_bfnum'] = np.log1p(all_df.dir_prev_bfnum)

### num_staff
1을 더한 로그처리 한다.

In [None]:
all_df['num_staff'] = np.log1p(all_df.num_staff)

### num_actor
10보다 큰 이상치를 10으로 변환

In [None]:
all_df.loc[all_df.num_actor>10, 'num_actor'] = 10

### box_off_num
1을 더한 로그처리 한다.

In [None]:
all_df['box_off_num'] = np.log1p(all_df.box_off_num)

## Normalization

In [None]:
# all_df['time'] = (all_df.time - all_df.time.min()) / (all_df.time.max() - all_df.time.min())
# all_df['dir_prev_bfnum'] = (all_df.dir_prev_bfnum - all_df.dir_prev_bfnum.min()) / (all_df.dir_prev_bfnum.max() - all_df.dir_prev_bfnum.min())
# all_df['dir_prev_num'] = (all_df.dir_prev_num - all_df.dir_prev_num.min()) / (all_df.dir_prev_num.max() - all_df.dir_prev_num.min())
# all_df['num_staff'] = (all_df.num_staff - all_df.num_staff.min()) / (all_df.num_staff.max() - all_df.num_staff.min())
# all_df['num_actor'] = (all_df.num_actor - all_df.num_actor.min()) / (all_df.num_actor.max() - all_df.num_actor.min())

In [None]:
# Min-max scale the numeric feature columns, column by column.
col_list = ['time', 'dir_prev_bfnum', 'dir_prev_num', 'num_staff', 'num_actor']
col_min = all_df[col_list].min()
col_range = all_df[col_list].max() - col_min
all_df[col_list] = (all_df[col_list] - col_min) / col_range

In [None]:
all_df.head()

### box_off_num

In [None]:
# Remember the target's range before scaling: the same y_min / y_max are needed
# later to map model outputs back to the log scale.
y_min = all_df['box_off_num'].min()
y_max = all_df['box_off_num'].max()

all_df['box_off_num'] = (all_df['box_off_num'] - y_min) / (y_max - y_min)

## 전처리 후의 데이터 분포 보기

### time

In [None]:
plt.hist(all_df.time, bins=100)
plt.show()

### dir_prev_bfnum  

In [None]:
plt.hist(all_df.dir_prev_bfnum, bins=100)
plt.show()

### dir_prev_num    

In [None]:
plt.hist(all_df.dir_prev_num, bins=100)
plt.show()

### num_staff       

In [None]:
plt.hist(all_df.num_staff, bins=100)
plt.show()

### num_actor       

In [None]:
plt.hist(all_df.num_actor, bins=100)
plt.show()

### box_off_num     

In [None]:
plt.hist(all_df.box_off_num, bins=100)
plt.show()

## train, test, submission 데이터 분리

In [None]:
# Positional split of the combined frame: rows 0-499 for training, rows 500-599
# as a held-out test slice, everything after that for the submission.
train_df = all_df.iloc[:500]
test_df = all_df.iloc[500:600]
submission_df = all_df.iloc[600:]

# Sanity check: print the size of each part.
for part in (train_df, test_df, submission_df):
    print(len(part))

## x, y로 분리

In [None]:
# Training split: feature matrix and target vector as NumPy arrays.
train_x_df = train_df.drop(columns=["box_off_num"])
train_x = train_x_df.to_numpy()
train_y = train_df["box_off_num"].to_numpy()
print(train_x.shape)
print(train_y.shape)

# Test split: same separation of features and target.
test_x_df = test_df.drop(columns=["box_off_num"])
test_x = test_x_df.to_numpy()
test_y = test_df["box_off_num"].to_numpy()
print(test_x.shape)
print(test_y.shape)

# Submission split: only the features are used.
submission_x_df = submission_df.drop(columns=["box_off_num"])
submission_x = submission_x_df.to_numpy()
print(submission_x.shape)


## 딥러닝 학습

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Dense

# Regression MLP: two 10-unit ReLU hidden layers feeding a single linear output.
# The input width follows the number of feature columns in train_x.
model = keras.Sequential([
    Dense(10, activation='relu', input_shape=(train_x.shape[1],)),
    Dense(10, activation='relu'),
    Dense(1),
])
model.compile(optimizer="SGD", loss="mse", metrics=["mape"])
model.summary()

# Train silently; validation_split=0.1 holds out 10% of the training arrays.
history = model.fit(
    train_x,
    train_y,
    batch_size=128,
    epochs=4000,
    validation_split=0.1,
    verbose=0,
)

# Learning curves: training loss and validation loss per epoch.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.show()

# Score on the test slice (values are on the normalised target scale).
loss, mape = model.evaluate(test_x, test_y)
print("loss=", loss)
print("mape=", mape)

# Scatter of actual vs. predicted values for the test slice.
y_ = model.predict(test_x)
plt.scatter(test_y, y_)
plt.show()

In [None]:
plt.plot(history.history['loss'][50:])
plt.plot(history.history['val_loss'][50:])
plt.show()

## RMSE 계산하기

In [None]:
# Map predictions and ground truth back to the original box_off_num scale so the
# error can be measured there: first undo the min-max scaling, then undo log1p.
# np.expm1 is the exact inverse of np.log1p and stays accurate for values near
# zero, unlike exp(x) - 1.
predicted = np.expm1(y_ * (y_max - y_min) + y_min)

org_test_y = np.expm1(test_y * (y_max - y_min) + y_min)

In [None]:
from tensorflow.keras.metrics import RootMeanSquaredError

rmse = RootMeanSquaredError()(org_test_y, predicted)
print("rmse = ", rmse)

plt.hist(predicted, bins=100)
plt.show()
plt.hist(org_test_y, bins=100)
plt.show()

## 예측하기

In [None]:
predicted = model.predict(submission_x)

In [None]:
plt.hist(predicted, bins=100)
plt.show()

normalized = ( t - min )  / (max - min)

t - min = normalized * (max - min)

t = normalized * (max - min) + min

In [None]:
predicted = predicted * ( y_max - y_min) + y_min

In [None]:
plt.hist(predicted, bins=100)
plt.show()

In [None]:
# Undo the log1p applied to the target; expm1 is its exact inverse and avoids
# the precision loss of exp(x) - 1 for small x.
predicted = np.expm1(predicted)

In [None]:
plt.hist(predicted, bins=100)
plt.show()

## submission 파일 만들기

In [None]:
!head ./drive/MyDrive/'빅데이터 분석가 양성 과정 자료실'/딥러닝/dataset/'영화 관객수'/submission.csv

In [None]:
submission_df = pd.read_csv(data_dir + "영화 관객수/movies_test.csv")
submission_df.head()

In [None]:
final_df = pd.DataFrame(submission_df.title)
final_df.head()

In [None]:
# np.int was removed in NumPy 1.24 (AttributeError); the builtin int is the
# documented replacement.
# predicted comes from model.predict on a Dense(1) output, presumably shape
# (n, 1), so flatten it to one value per row before storing it as a column.
final_df['box_off_num'] = predicted.astype(int).ravel()
final_df.head()

In [None]:
final_df.to_csv('submission.csv', index=False)

In [None]:
!head submission.csv

## batch_size 찾기

In [None]:
import time

# Rough wall-clock timing of 4 epochs at each batch size.
# NOTE: this keeps training the already-fitted `model`, so run it only after the
# predictions you care about have been produced.
for batch_size in [16, 32, 64, 128, 256, 512, 512*2, 512*4, 512*8, 512*16]:
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    # The return value is deliberately not stored, so the `history` of the real
    # training run is not overwritten by these throw-away fits.
    model.fit(train_x, train_y, epochs=4, verbose=0, batch_size=batch_size, validation_split=0.1)
    print(batch_size, ":", time.perf_counter() - start)

# 세번째 시도
release_time, director 컬럼 사용

## 데이터 읽기

In [None]:
data_dir = './drive/MyDrive/빅데이터 분석가 양성 과정 자료실/딥러닝/dataset/'

In [None]:
train_df = pd.read_csv(data_dir + "영화 관객수/movies_train.csv")
train_df.head()

In [None]:
submission_df = pd.read_csv(data_dir + "영화 관객수/movies_test.csv")
submission_df.head()

## train + submission 데이터 합치기

In [None]:
# Stack the training rows on top of the submission rows so that both go
# through exactly the same preprocessing.
# ignore_index=True gives the combined frame a fresh 0..N-1 index; without it
# each half keeps its own 0-based labels from read_csv, so every label in the
# submission part collides with a training label.
all_df = pd.concat([train_df, submission_df], ignore_index=True)
all_df.head()

In [None]:
all_df.info()

## release_time 값 조정
'2021-12-31'같은 값을 '2021-12'로 변경

In [None]:
print(all_df.release_time[:5])
all_df['release_time'] = all_df.release_time.str[:-3]
print(all_df.release_time[:5])

## 카테고리 데이터 인코딩

In [None]:
# Convert each categorical text column to the pandas 'category' dtype and keep
# its integer codes in a companion '<column>_code' column.
for cat_col in ('distributor', 'genre', 'screening_rat', 'release_time'):
    all_df[cat_col] = all_df[cat_col].astype('category')
    all_df[cat_col + '_code'] = all_df[cat_col].cat.codes

## 카테고리 데이터 one-hot encoding

In [None]:
# One-hot encode the integer category codes.
# dtype=float keeps the indicator columns numeric: pandas >= 2.0 otherwise
# emits bool columns, and a frame mixing bool and float columns becomes an
# object-dtype array in to_numpy(), which Keras cannot convert to a tensor.
all_df = pd.get_dummies(
    all_df,
    columns=['distributor_code', 'genre_code', 'screening_rat_code', 'release_time_code'],
    dtype=float,
)

In [None]:
all_df.info()

In [None]:
all_df.head()

## 안쓸 컬럼 삭제

In [None]:
all_df.drop(columns=['title',
                     'distributor',
                     'genre',
                     'release_time',
                     'screening_rat',
                     'director'], inplace=True)
all_df.info()

## 결측치 처리

In [None]:
# Replace every NaN in the combined frame with 0.
# NOTE(review): this presumably also fills box_off_num for the submission rows
# (movies_test.csv likely has no target column) - confirm; those zeros would
# then take part in the y_min / y_max computed further down.
all_df = all_df.fillna(0)

In [None]:
all_df.info()

## 전처리 전의 데이터 분포 보기

### time

In [None]:
plt.hist(all_df.time, bins=100)
plt.show()

### dir_prev_bfnum

In [None]:
plt.hist(all_df.dir_prev_bfnum, bins=100)
plt.show()

### dir_prev_num

In [None]:
plt.hist(all_df.dir_prev_num, bins=100)
plt.show()

### num_staff

In [None]:
plt.hist(all_df.num_staff, bins=100)
plt.show()

### num_actor

In [None]:
plt.hist(all_df.num_actor, bins=100)
plt.show()

### box_off_num

In [None]:
plt.hist(all_df.box_off_num, bins=100)
plt.show()

## 컬럼별 전처리

### time
200이 넘는 이상치는 200으로 바꾼다.

In [None]:
all_df.loc[all_df.time>200, 'time'] = 200

### dir_prev_bfnum
1을 더한 로그처리 한다.

In [None]:
all_df['dir_prev_bfnum'] = np.log1p(all_df.dir_prev_bfnum)

### num_staff
1을 더한 로그처리 한다.

In [None]:
all_df['num_staff'] = np.log1p(all_df.num_staff)

### num_actor
10보다 큰 이상치를 10으로 변환

In [None]:
all_df.loc[all_df.num_actor>10, 'num_actor'] = 10

### box_off_num
1을 더한 로그처리 한다.

In [None]:
all_df['box_off_num'] = np.log1p(all_df.box_off_num)

## Normalization

In [None]:
# all_df['time'] = (all_df.time - all_df.time.min()) / (all_df.time.max() - all_df.time.min())
# all_df['dir_prev_bfnum'] = (all_df.dir_prev_bfnum - all_df.dir_prev_bfnum.min()) / (all_df.dir_prev_bfnum.max() - all_df.dir_prev_bfnum.min())
# all_df['dir_prev_num'] = (all_df.dir_prev_num - all_df.dir_prev_num.min()) / (all_df.dir_prev_num.max() - all_df.dir_prev_num.min())
# all_df['num_staff'] = (all_df.num_staff - all_df.num_staff.min()) / (all_df.num_staff.max() - all_df.num_staff.min())
# all_df['num_actor'] = (all_df.num_actor - all_df.num_actor.min()) / (all_df.num_actor.max() - all_df.num_actor.min())

In [None]:
# Min-max scale the numeric feature columns, column by column.
col_list = ['time', 'dir_prev_bfnum', 'dir_prev_num', 'num_staff', 'num_actor']
col_min = all_df[col_list].min()
col_range = all_df[col_list].max() - col_min
all_df[col_list] = (all_df[col_list] - col_min) / col_range

In [None]:
all_df.head()

### box_off_num

In [None]:
# Remember the target's range before scaling: the same y_min / y_max are needed
# later to map model outputs back to the log scale.
y_min = all_df['box_off_num'].min()
y_max = all_df['box_off_num'].max()

all_df['box_off_num'] = (all_df['box_off_num'] - y_min) / (y_max - y_min)

## 전처리 후의 데이터 분포 보기

### time

In [None]:
plt.hist(all_df.time, bins=100)
plt.show()

### dir_prev_bfnum  

In [None]:
plt.hist(all_df.dir_prev_bfnum, bins=100)
plt.show()

### dir_prev_num    

In [None]:
plt.hist(all_df.dir_prev_num, bins=100)
plt.show()

### num_staff       

In [None]:
plt.hist(all_df.num_staff, bins=100)
plt.show()

### num_actor       

In [None]:
plt.hist(all_df.num_actor, bins=100)
plt.show()

### box_off_num     

In [None]:
plt.hist(all_df.box_off_num, bins=100)
plt.show()

## train, test, submission 데이터 분리

In [None]:
# Positional split of the combined frame: rows 0-499 for training, rows 500-599
# as a held-out test slice, everything after that for the submission.
train_df = all_df.iloc[:500]
test_df = all_df.iloc[500:600]
submission_df = all_df.iloc[600:]

# Sanity check: print the size of each part.
for part in (train_df, test_df, submission_df):
    print(len(part))

## x, y로 분리

In [None]:
# Training split: feature matrix and target vector as NumPy arrays.
train_x_df = train_df.drop(columns=["box_off_num"])
train_x = train_x_df.to_numpy()
train_y = train_df["box_off_num"].to_numpy()
print(train_x.shape)
print(train_y.shape)

# Test split: same separation of features and target.
test_x_df = test_df.drop(columns=["box_off_num"])
test_x = test_x_df.to_numpy()
test_y = test_df["box_off_num"].to_numpy()
print(test_x.shape)
print(test_y.shape)

# Submission split: only the features are used.
submission_x_df = submission_df.drop(columns=["box_off_num"])
submission_x = submission_x_df.to_numpy()
print(submission_x.shape)


## 딥러닝 학습

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Dense

# Regression MLP: two 10-unit ReLU hidden layers feeding a single linear output.
# The input width follows the number of feature columns in train_x.
model = keras.Sequential([
    Dense(10, activation='relu', input_shape=(train_x.shape[1],)),
    Dense(10, activation='relu'),
    Dense(1),
])
model.compile(optimizer="SGD", loss="mse", metrics=["mape"])
model.summary()

# Train silently; validation_split=0.1 holds out 10% of the training arrays.
history = model.fit(
    train_x,
    train_y,
    batch_size=128,
    epochs=4000,
    validation_split=0.1,
    verbose=0,
)

# Learning curves: training loss and validation loss per epoch.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.show()

# Score on the test slice (values are on the normalised target scale).
loss, mape = model.evaluate(test_x, test_y)
print("loss=", loss)
print("mape=", mape)

# Scatter of actual vs. predicted values for the test slice.
y_ = model.predict(test_x)
plt.scatter(test_y, y_)
plt.show()

In [None]:
plt.plot(history.history['loss'][50:])
plt.plot(history.history['val_loss'][50:])
plt.show()

## RMSE 계산하기

In [None]:
# Map predictions and ground truth back to the original box_off_num scale so the
# error can be measured there: first undo the min-max scaling, then undo log1p.
# np.expm1 is the exact inverse of np.log1p and stays accurate for values near
# zero, unlike exp(x) - 1.
predicted = np.expm1(y_ * (y_max - y_min) + y_min)

org_test_y = np.expm1(test_y * (y_max - y_min) + y_min)

In [None]:
from tensorflow.keras.metrics import RootMeanSquaredError

rmse = RootMeanSquaredError()(org_test_y, predicted)
print("rmse = ", rmse)

plt.hist(predicted, bins=100)
plt.show()
plt.hist(org_test_y, bins=100)
plt.show()

## 예측하기

In [None]:
predicted = model.predict(submission_x)

In [None]:
plt.hist(predicted, bins=100)
plt.show()

normalized = ( t - min )  / (max - min)

t - min = normalized * (max - min)

t = normalized * (max - min) + min

In [None]:
predicted = predicted * ( y_max - y_min) + y_min

In [None]:
plt.hist(predicted, bins=100)
plt.show()

In [None]:
# Undo the log1p applied to the target; expm1 is its exact inverse and avoids
# the precision loss of exp(x) - 1 for small x.
predicted = np.expm1(predicted)

In [None]:
plt.hist(predicted, bins=100)
plt.show()

## submission 파일 만들기

In [None]:
!head ./drive/MyDrive/'빅데이터 분석가 양성 과정 자료실'/딥러닝/dataset/'영화 관객수'/submission.csv

In [None]:
submission_df = pd.read_csv(data_dir + "영화 관객수/movies_test.csv")
submission_df.head()

In [None]:
final_df = pd.DataFrame(submission_df.title)
final_df.head()

In [None]:
# np.int was removed in NumPy 1.24 (AttributeError); the builtin int is the
# documented replacement.
# predicted comes from model.predict on a Dense(1) output, presumably shape
# (n, 1), so flatten it to one value per row before storing it as a column.
final_df['box_off_num'] = predicted.astype(int).ravel()
final_df.head()

In [None]:
final_df.to_csv('submission.csv', index=False)

In [None]:
!head submission.csv

## batch_size 찾기

In [None]:
import time

# Rough wall-clock timing of 4 epochs at each batch size.
# NOTE: this keeps training the already-fitted `model`, so run it only after the
# predictions you care about have been produced.
for batch_size in [16, 32, 64, 128, 256, 512, 512*2, 512*4, 512*8, 512*16]:
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    # The return value is deliberately not stored, so the `history` of the real
    # training run is not overwritten by these throw-away fits.
    model.fit(train_x, train_y, epochs=4, verbose=0, batch_size=batch_size, validation_split=0.1)
    print(batch_size, ":", time.perf_counter() - start)

# 네번째 시도
director 컬럼 사용

## 데이터 읽기

In [None]:
data_dir = './drive/MyDrive/빅데이터 분석가 양성 과정 자료실/딥러닝/dataset/'

In [None]:
train_df = pd.read_csv(data_dir + "영화 관객수/movies_train.csv")
train_df.head()

In [None]:
train_df.distributor.unique()

In [None]:
submission_df = pd.read_csv(data_dir + "영화 관객수/movies_test.csv")
submission_df.head()

## train + submission 데이터 합치기

In [None]:
# Stack the training rows on top of the submission rows so that both go
# through exactly the same preprocessing.
# ignore_index=True gives the combined frame a fresh 0..N-1 index; without it
# each half keeps its own 0-based labels from read_csv, so every label in the
# submission part collides with a training label.
all_df = pd.concat([train_df, submission_df], ignore_index=True)
all_df.head()

In [None]:
all_df.info()

## release_time 값 조정
'2021-12-31'같은 값을 '2021-12'로 변경

In [None]:
print(all_df.release_time[:5])
all_df['release_time'] = all_df.release_time.str[:-3]
print(all_df.release_time[:5])

## director 컬럼 카테고리

In [None]:
print(len(all_df.director.unique()))

all_df.loc[all_df.dir_prev_num==0, 'director'] = '무명감독'
print(len(all_df.director.unique()))

## 카테고리 데이터 인코딩

In [None]:
# Convert each categorical text column to the pandas 'category' dtype and keep
# its integer codes in a companion '<column>_code' column.
for cat_col in ('distributor', 'genre', 'screening_rat', 'release_time'):
    all_df[cat_col] = all_df[cat_col].astype('category')
    all_df[cat_col + '_code'] = all_df[cat_col].cat.codes

# director is handled differently: the column itself is overwritten with its
# category codes instead of getting a separate '_code' column.
all_df['director'] = all_df['director'].astype('category').cat.codes

## 카테고리 데이터 one-hot encoding

In [None]:
# One-hot encode the integer category codes (director holds codes as well).
# dtype=float keeps the indicator columns numeric: pandas >= 2.0 otherwise
# emits bool columns, and a frame mixing bool and float columns becomes an
# object-dtype array in to_numpy(), which Keras cannot convert to a tensor.
all_df = pd.get_dummies(
    all_df,
    columns=['distributor_code', 'genre_code', 'screening_rat_code', 'release_time_code', 'director'],
    dtype=float,
)

In [None]:
all_df.info()

In [None]:
all_df.head()

## 안쓸 컬럼 삭제

In [None]:
all_df.drop(columns=['title',
                     'distributor',
                     'genre',
                     'release_time',
                     'screening_rat',
                    #  'director'
                     ], inplace=True)
all_df.info()

## 결측치 처리

In [None]:
# Replace every NaN in the combined frame with 0.
# NOTE(review): this presumably also fills box_off_num for the submission rows
# (movies_test.csv likely has no target column) - confirm; those zeros would
# then take part in the y_min / y_max computed further down.
all_df = all_df.fillna(0)

In [None]:
all_df.info()

## 전처리 전의 데이터 분포 보기

### time

In [None]:
plt.hist(all_df.time, bins=100)
plt.show()

### dir_prev_bfnum

In [None]:
plt.hist(all_df.dir_prev_bfnum, bins=100)
plt.show()

### dir_prev_num

In [None]:
plt.hist(all_df.dir_prev_num, bins=100)
plt.show()

### num_staff

In [None]:
plt.hist(all_df.num_staff, bins=100)
plt.show()

### num_actor

In [None]:
plt.hist(all_df.num_actor, bins=100)
plt.show()

### box_off_num

In [None]:
plt.hist(all_df.box_off_num, bins=100)
plt.show()

## 컬럼별 전처리

### time
200이 넘는 이상치는 200으로 바꾼다.

In [None]:
all_df.loc[all_df.time>200, 'time'] = 200

### dir_prev_bfnum
1을 더한 로그처리 한다.

In [None]:
all_df['dir_prev_bfnum'] = np.log1p(all_df.dir_prev_bfnum)

### num_staff
1을 더한 로그처리 한다.

In [None]:
all_df['num_staff'] = np.log1p(all_df.num_staff)

### num_actor
10보다 큰 이상치를 10으로 변환

In [None]:
all_df.loc[all_df.num_actor>10, 'num_actor'] = 10

### box_off_num
1을 더한 로그처리 한다.

In [None]:
all_df['box_off_num'] = np.log1p(all_df.box_off_num)

## Normalization

In [None]:
# all_df['time'] = (all_df.time - all_df.time.min()) / (all_df.time.max() - all_df.time.min())
# all_df['dir_prev_bfnum'] = (all_df.dir_prev_bfnum - all_df.dir_prev_bfnum.min()) / (all_df.dir_prev_bfnum.max() - all_df.dir_prev_bfnum.min())
# all_df['dir_prev_num'] = (all_df.dir_prev_num - all_df.dir_prev_num.min()) / (all_df.dir_prev_num.max() - all_df.dir_prev_num.min())
# all_df['num_staff'] = (all_df.num_staff - all_df.num_staff.min()) / (all_df.num_staff.max() - all_df.num_staff.min())
# all_df['num_actor'] = (all_df.num_actor - all_df.num_actor.min()) / (all_df.num_actor.max() - all_df.num_actor.min())

In [None]:
# Min-max scale the numeric feature columns, column by column.
col_list = ['time', 'dir_prev_bfnum', 'dir_prev_num', 'num_staff', 'num_actor']
col_min = all_df[col_list].min()
col_range = all_df[col_list].max() - col_min
all_df[col_list] = (all_df[col_list] - col_min) / col_range

In [None]:
all_df.head()

### box_off_num

In [None]:
# Remember the target's range before scaling: the same y_min / y_max are needed
# later to map model outputs back to the log scale.
y_min = all_df['box_off_num'].min()
y_max = all_df['box_off_num'].max()

all_df['box_off_num'] = (all_df['box_off_num'] - y_min) / (y_max - y_min)

## 전처리 후의 데이터 분포 보기

### time

In [None]:
plt.hist(all_df.time, bins=100)
plt.show()

### dir_prev_bfnum  

In [None]:
plt.hist(all_df.dir_prev_bfnum, bins=100)
plt.show()

### dir_prev_num    

In [None]:
plt.hist(all_df.dir_prev_num, bins=100)
plt.show()

### num_staff       

In [None]:
plt.hist(all_df.num_staff, bins=100)
plt.show()

### num_actor       

In [None]:
plt.hist(all_df.num_actor, bins=100)
plt.show()

### box_off_num     

In [None]:
plt.hist(all_df.box_off_num, bins=100)
plt.show()

## train, test, submission 데이터 분리

In [None]:
# Positional split of the combined frame: rows 0-499 for training, rows 500-599
# as a held-out test slice, everything after that for the submission.
train_df = all_df.iloc[:500]
test_df = all_df.iloc[500:600]
submission_df = all_df.iloc[600:]

# Sanity check: print the size of each part.
for part in (train_df, test_df, submission_df):
    print(len(part))

## x, y로 분리

In [None]:
# Training split: feature matrix and target vector as NumPy arrays.
train_x_df = train_df.drop(columns=["box_off_num"])
train_x = train_x_df.to_numpy()
train_y = train_df["box_off_num"].to_numpy()
print(train_x.shape)
print(train_y.shape)

# Test split: same separation of features and target.
test_x_df = test_df.drop(columns=["box_off_num"])
test_x = test_x_df.to_numpy()
test_y = test_df["box_off_num"].to_numpy()
print(test_x.shape)
print(test_y.shape)

# Submission split: only the features are used.
submission_x_df = submission_df.drop(columns=["box_off_num"])
submission_x = submission_x_df.to_numpy()
print(submission_x.shape)


## 딥러닝 학습

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Dense

# Regression MLP: a 256-unit and a 10-unit ReLU hidden layer feeding a single
# linear output. The input width follows the number of feature columns.
model = keras.Sequential([
    Dense(256, activation='relu', input_shape=(train_x.shape[1],)),
    Dense(10, activation='relu'),
    Dense(1),
])
model.compile(optimizer="SGD", loss="mse", metrics=["mape"])
model.summary()

# Train silently; validation_split=0.1 holds out 10% of the training arrays.
history = model.fit(
    train_x,
    train_y,
    batch_size=512,
    epochs=1000*2,
    validation_split=0.1,
    verbose=0,
)

# Learning curves: training loss and validation loss per epoch.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.show()


In [None]:
plt.plot(history.history['loss'][-500:])
plt.plot(history.history['val_loss'][-500:])
plt.show()

In [None]:
loss, mape = model.evaluate(test_x, test_y)
print("loss=", loss)
print("mape=", mape)

y_ = model.predict(test_x)

plt.scatter(test_y, y_)
plt.show()

## RMSE 계산하기

In [None]:
# Map predictions and ground truth back to the original box_off_num scale:
# undo the min-max scaling first, then undo log1p with expm1.
predicted = np.expm1(y_ * (y_max - y_min) + y_min)

org_test_y = np.expm1(test_y * (y_max - y_min) + y_min)

In [None]:
from tensorflow.keras.metrics import RootMeanSquaredError

rmse = RootMeanSquaredError()(org_test_y, predicted)
print("rmse = ", rmse)

plt.hist(predicted, bins=100)
plt.show()
plt.hist(org_test_y, bins=100)
plt.show()

## 예측하기

In [None]:
predicted = model.predict(submission_x)

In [None]:
plt.hist(predicted, bins=100)
plt.show()

normalized = ( t - min )  / (max - min)

t - min = normalized * (max - min)

t = normalized * (max - min) + min

In [None]:
predicted = predicted * ( y_max - y_min) + y_min

In [None]:
plt.hist(predicted, bins=100)
plt.show()

In [None]:
predicted = np.expm1(predicted)

In [None]:
plt.hist(predicted, bins=100)
plt.show()

## submission 파일 만들기

In [None]:
!head ./drive/MyDrive/'빅데이터 분석가 양성 과정 자료실'/딥러닝/dataset/'영화 관객수'/submission.csv

In [None]:
submission_df = pd.read_csv(data_dir + "영화 관객수/movies_test.csv")
submission_df.head()

In [None]:
final_df = pd.DataFrame(submission_df.title)
final_df.head()

In [None]:
# np.int was removed in NumPy 1.24 (AttributeError); the builtin int is the
# documented replacement.
# predicted comes from model.predict on a Dense(1) output, presumably shape
# (n, 1), so flatten it to one value per row before storing it as a column.
final_df['box_off_num'] = predicted.astype(int).ravel()
final_df.head()

In [None]:
final_df.to_csv('submission.csv', index=False)

In [None]:
!head submission.csv

## batch_size 찾기

In [None]:
import time

# Rough wall-clock timing of 4 epochs at each batch size.
# NOTE: this keeps training the already-fitted `model`, so run it only after the
# predictions you care about have been produced.
for batch_size in [16, 32, 64, 128, 256, 512, 512*2, 512*4, 512*8, 512*16]:
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    # The return value is deliberately not stored, so the `history` of the real
    # training run is not overwritten by these throw-away fits.
    model.fit(train_x, train_y, epochs=4, verbose=0, batch_size=batch_size, validation_split=0.1)
    print(batch_size, ":", time.perf_counter() - start)