# Train 데이터 학습

## Train 데이터 불러오기

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Root folder of the datasets on the mounted Drive.
# NOTE(review): the path is relative ('./drive/...'), so it resolves only when
# the working directory is the parent of the mount point — confirm.
data_dir = './drive/MyDrive/빅데이터 분석가 양성 과정 자료실/딥러닝/dataset/'

## Train, Test 데이터 불러오기

In [None]:
# Read the movie-audience train and test CSV files from the dataset folder.
train_csv = data_dir + '영화 관객수/movies_train.csv'
test_csv = data_dir + '영화 관객수/movies_test.csv'
df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [None]:
# Number of distinct distributor values in the train file.
df_train['distributor'].unique().shape[0]

In [None]:
# Number of distinct distributor values in the test file.
df_test['distributor'].unique().shape[0]

In [None]:
# Preview the first rows of the train frame.
df_train.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

### test 데이터에 더미 컬럼 생성

In [None]:
# Give the test frame a 'box_off_num' column; its values are only a copy of
# dir_prev_num and serve as a placeholder, not as a real target.
df_test['box_off_num'] = df_test['dir_prev_num']

In [None]:
# Preview the test frame again, now including the placeholder 'box_off_num' column.
df_test.head()

### train, test concat

In [None]:
# Stack the train rows on top of the test rows so the preprocessing below is
# applied to both at once. ignore_index=True gives the combined frame a fresh
# 0..n-1 index instead of repeating each input's row labels.
df = pd.concat([df_train, df_test], ignore_index=True)

### train, test 데이터 길이 저장

In [None]:
# Remember how many rows came from each file.
len_train, len_test = len(df_train), len(df_test)

In [None]:
# df = pd.read_csv(data_dir + '영화 관객수/movies_train.csv')
# Preview the first rows of the combined frame.
df.head()

In [None]:
# Print the column dtypes and non-null counts of the combined frame.
df.info()

## 불필요 컬럼 삭제

In [None]:
# Remove, in place, the columns this notebook does not feed to the model.
df.drop(['title', 'time', 'release_time', 'director'], axis='columns', inplace=True)

In [None]:
# Re-check the columns and non-null counts after the drop.
df.info()

## 결측치 처리

In [None]:
# Replace every missing value in the combined frame with 0, in place.
df.fillna(value=0, inplace=True)

In [None]:
# Confirm that no column reports missing values any more.
df.info()

## 카테고리 컬럼 인코딩

### distributor

In [None]:
# Turn distributor into a pandas categorical and store its integer codes
# in a separate column.
df['distributor'] = df['distributor'].astype('category')
df['distributor_code'] = df['distributor'].cat.codes

### genre

In [None]:
# Turn genre into a pandas categorical and store its integer codes
# in a separate column.
df['genre'] = df['genre'].astype('category')
df['genre_code'] = df['genre'].cat.codes

### screening_rat

In [None]:
# Turn screening_rat into a pandas categorical and store its integer codes
# in a separate column.
df['screening_rat'] = df['screening_rat'].astype('category')
df['screening_rat_code'] = df['screening_rat'].cat.codes

## Shuffle(불필요)

In [None]:
# df = df.sample(frac=1).reset_index(drop=True)

In [None]:
# Preview the frame with the new *_code columns.
df.head()

## log 스케일 변환
```
dir_prev_bfnum
num_staff
box_off_num
```
값에 0이 포함된 컬럼은 log 변환 시 0이 -inf로 바뀌어(경고 발생) 학습에 사용할 수 없다.
numpy.log1p()를 사용하여 값에 1을 더한 후 log 변환해 준다.

In [None]:
# Add a log1p-transformed copy (suffix '_log') of each listed count column.
for col in ['dir_prev_bfnum', 'num_staff', 'box_off_num']:
    df[col + '_log'] = np.log1p(df[col])

In [None]:
# Preview the frame with the new *_log columns.
df.head()

## Standardization
```
 4   dir_prev_num        600 non-null    int64   
 6   num_actor           600 non-null    int64   
 11  dir_prev_bfnum_log  600 non-null    float64 
 12  num_staff_log       600 non-null    float64 
 13  box_off_num_log     600 non-null    float64 
 ```

In [None]:
# Cast the integer count columns to float before scaling.
# The `np.float` alias was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin `float` selects the same float64 dtype.
df['dir_prev_num'] = df.dir_prev_num.astype(float)
df['num_actor'] = df.num_actor.astype(float)

# Z-score each numeric feature: subtract the column mean, divide by the column
# standard deviation. The statistics are taken over the combined train+test frame.
for col in ['dir_prev_num', 'num_actor', 'dir_prev_bfnum_log', 'num_staff_log']:
    df[col] = (df[col] - df[col].mean()) / df[col].std()

# The target column box_off_num_log is deliberately left unscaled.

## 데이터 분포 보기

In [None]:
# Histogram (100 bins) of the distributor category codes.
plt.hist(df['distributor_code'].to_numpy(), bins=100)
plt.show()

In [None]:
# Histogram (100 bins) of the genre category codes.
plt.hist(df['genre_code'].to_numpy(), bins=100)
plt.show()

In [None]:
# Histogram (100 bins) of the screening-rating category codes.
plt.hist(df['screening_rat_code'].to_numpy(), bins=100)
plt.show()

In [None]:
# Histogram (100 bins) of the standardized dir_prev_bfnum_log feature.
plt.hist(df['dir_prev_bfnum_log'].to_numpy(), bins=100)
plt.show()

In [None]:
# Histogram (100 bins) of the standardized dir_prev_num feature.
plt.hist(df['dir_prev_num'].to_numpy(), bins=100)
plt.show()

In [None]:
# Histogram (100 bins) of the standardized num_staff_log feature.
plt.hist(df['num_staff_log'].to_numpy(), bins=100)
plt.show()

In [None]:
# Histogram (100 bins) of the standardized num_actor feature.
plt.hist(df['num_actor'].to_numpy(), bins=100)
plt.show()

In [None]:
# Histogram (100 bins) of the log1p-transformed box_off_num column.
plt.hist(df['box_off_num_log'].to_numpy(), bins=100)
plt.show()

In [None]:
# Repeats the dir_prev_bfnum_log histogram (100 bins) drawn in an earlier cell.
plt.hist(df['dir_prev_bfnum_log'].to_numpy(), bins=100)
plt.show()

In [None]:
# Repeats the dir_prev_bfnum_log histogram (100 bins) once more.
plt.hist(df['dir_prev_bfnum_log'].to_numpy(), bins=100)
plt.show()

## one-hot Encoding
```
distributor_code
genre_code
screening_rat_code
```

In [None]:
# Expand the three integer code columns into one-hot indicator columns.
# dtype is pinned to np.uint8 (the default before pandas 2.0): pandas 2.x
# defaults to bool, and bool columns mixed with float columns make
# df.to_numpy() return an object array that Keras cannot convert to a tensor.
# Listing the columns in one call keeps the same output column order as
# encoding them one after another.
df = pd.get_dummies(
    df,
    columns=['distributor_code', 'genre_code', 'screening_rat_code'],
    dtype=np.uint8,
)

In [None]:
# Preview the frame with the one-hot indicator columns.
df.head()

In [None]:
# List all columns and dtypes after one-hot encoding.
df.info()

In [None]:
# Preview the one-hot encoded frame again.
df.head()

## 불필요한 컬럼 삭제

In [None]:
# Drop, in place, the raw columns that now have encoded or log-transformed replacements.
df.drop(
    ['distributor', 'genre', 'screening_rat', 'dir_prev_bfnum', 'num_staff'],
    axis='columns',
    inplace=True,
)

## X, Y 나누기

In [None]:
# Target vector: the log1p-transformed audience count.
y = df.box_off_num_log.to_numpy()
# Remove both the raw and the log target so only feature columns remain.
df.drop(columns=['box_off_num', 'box_off_num_log'], inplace=True)
# Request a float matrix explicitly: if the indicator columns are bool
# (the pandas 2.x get_dummies default), a plain to_numpy() on the mixed
# bool/float frame would return an object array instead of numbers.
x = df.to_numpy(dtype=float)

In [None]:
# Print the shapes of the feature matrix and the target vector.
for arr in (x, y):
    print(arr.shape)

## train/test 나누기

### Train 데이터 파일과 Test 데이터 파일의 데이터 나누기

In [None]:
# The first len_train rows came from the train file; the remaining rows
# came from the test file.
train_data_x, test_data_x = x[:len_train], x[len_train:]
train_data_y, test_data_y = y[:len_train], y[len_train:]

### Train 데이터에서 나누기

In [None]:
# Keep the first 80% of the labelled rows for fitting and hold out the
# last 20% (no shuffling) for evaluation.
i = int(len(train_data_x) * 0.8)
train_x, train_y = train_data_x[:i], train_data_y[:i]
test_x, test_y = train_data_x[i:], train_data_y[i:]

## 딥러닝 학습

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.metrics import RootMeanSquaredError

# Regression network: two hidden ReLU layers of 10 units each, then a single
# linear output unit. The input width is taken from the feature matrix.
model = keras.Sequential([
    Dense(10, activation='relu', input_shape=(x.shape[1],)),
    Dense(10, activation='relu'),
    Dense(1),
])

# Optimise mean squared error with SGD; also track MAPE and RMSE.
model.compile(
    optimizer="SGD",
    loss="mse",
    metrics=["mape", RootMeanSquaredError()],
)
model.summary()

# Fit silently for 1000 epochs, holding out 10% of train_x for validation.
history = model.fit(
    train_x,
    train_y,
    validation_split=0.1,
    batch_size=128,
    epochs=1000,
    verbose=0,
)

# Training vs. validation loss per epoch.
for key in ('loss', 'val_loss'):
    plt.plot(history.history[key])
plt.show()

# Score the model on the held-out rows.
loss, mape, rmse = model.evaluate(test_x, test_y)
for label, value in (("loss=", loss), ("mape=", mape), ("rmse=", rmse)):
    print(label, value)

# Scatter of true vs. predicted (log-scale) targets on the held-out rows.
y_ = model.predict(test_x)

plt.scatter(test_y, y_)
plt.show()

In [None]:
# Same loss curves with the first 30 epochs cut off.
for key in ('loss', 'val_loss'):
    plt.plot(history.history[key][30:])
plt.show()

### RMSE

In [None]:
# Map predictions and targets back from log1p space with expm1.
# The model ends in Dense(1), so y_ has a trailing axis of length 1; flatten
# it so both arrays are 1-D instead of relying on implicit squeezing.
predicted = np.expm1(y_).reshape(-1)
org_test_y = np.expm1(test_y)

# Keras metrics are called as metric(y_true, y_pred).
rmse = RootMeanSquaredError()
r = rmse(org_test_y, predicted)

print(r)

# 예측

In [None]:
# From here on test_x refers to the rows of the test file, replacing the
# held-out slice of the train file used above.
test_x = test_data_x

In [None]:
# Predict on the test-file rows; the output is in log1p space.
y_ = model.predict(test_x)
# Invert log1p.
predicted = np.expm1(y_)

# 제출 Submission 만들기

In [None]:
# Movie titles of the test file as a single-column 2-D array.
title = df_test['title'].to_numpy().reshape(-1, 1)

In [None]:
# Predictions as a single-column 2-D array.
box_off_num = np.array(predicted).reshape(-1,1)

In [None]:
# Place the title column and the prediction column side by side.
arr_sub = np.concatenate((title, box_off_num), axis=1)

In [None]:
# Wrap the two-column array in a DataFrame with the column names 'title' and 'box_off_num'.
df_sub = pd.DataFrame(arr_sub, columns=['title', 'box_off_num'])

In [None]:
# Preview the first rows of the submission frame.
df_sub.head()

In [None]:
# Write the submission to the working directory without the index column.
df_sub.to_csv('submission.csv', index=False)