## Multiple_Linear_Regression

- 이번에는 multiple linear regression을 실제 데이터로 적용

## 1. Import Required Libraries

In [146]:
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.init as init

## 2. Generate Data

- 영화관람객 예측 모델링을 위한 전처리 완료된 데이터
- 영화장르, 스크린수 등을 변수로 활용하여 first_week_audience 예측
- train, test set을 7:3의 비율로 맞췄다

In [147]:
train = pd.read_csv("movie_data.csv",encoding = 'euc-kr')
test = pd.read_csv("test_data.csv", encoding='euc-kr')

In [262]:
train.head(1) # 첫번쨰 변수인 first_week_audience가 target변수 
              # shape (566, 26)

Unnamed: 0,title,first_week_audience,first_day_screen,drama,action,comedy,fantasy.SF,advanture,criminal.thrill,traditional,...,etc,series,holiday,actors,director,distributor_value,aud_18,korea,usa,first_day_aud
0,"무현,두도시이야기",25506,87,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,6374


In [148]:
train_X = train.iloc[:,2:]
train_y = train.iloc[:,1]
test_X = test.iloc[:,2:]
test_y = test.iloc[:,1]

## 3. Model & Optimizer

In [242]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class makeData(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return (self.X_data[index], self.y_data[index])
    
    def __len__(self):
        return len(self.y_data)
    
train_data = makeData(np.array(normalize(train_X)), np.array(train_y/10000)) # X랑 y값이 차이가 너무 많이 나서 y를 10000으로 나눴다. X도 Normalize해줬다
test_data = makeData(np.array(normalize(test_X)), np.array(test_y/10000))

In [243]:
class MultipleLinearRegression(nn.Module):
    def __init__(self, feature_size):
        super(MultipleLinearRegression, self).__init__()
        self.Layer = nn.Linear(feature_size, 1)
        
        # 이 부분에서 x feature가 여러개 들어가기 때문에
        # simple linear regression과 달리 1을 feature_size로 바꿔준다.
        
    def forward(self, inputs):
        x = self.Layer(inputs)
        return x.squeeze(1)
    
    def predict(self,test_input):
        x = self.Layer(test_input)
        return x

In [245]:
X_train.shape[1]

24

## 3. Train

In [259]:
EPOCHS = 5000
BATCH_SIZE = 50
FEATURE_SIZE = X_train.shape[1]

model = MultipleLinearRegression(FEATURE_SIZE)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr = 0.1)

train_batch = DataLoader(dataset = train_data, batch_size = BATCH_SIZE, shuffle=True)
for epoch in range(EPOCHS):
    for X_batch, y_batch in train_batch:
        inputs = torch.Tensor(X_batch.float())
        targets = torch.Tensor(y_batch.float())
        model.zero_grad()
        y_pred = model(inputs)
        loss = criterion(y_pred, targets)
        loss.backward()
        optimizer.step()

    if epoch % 100 == 0:
        print(loss)        

tensor(3440.0232, grad_fn=<MseLossBackward>)
tensor(16388.7129, grad_fn=<MseLossBackward>)
tensor(3674.8027, grad_fn=<MseLossBackward>)
tensor(1760.2246, grad_fn=<MseLossBackward>)
tensor(1754.8363, grad_fn=<MseLossBackward>)
tensor(2506.0491, grad_fn=<MseLossBackward>)
tensor(2805.9155, grad_fn=<MseLossBackward>)
tensor(4058.4412, grad_fn=<MseLossBackward>)
tensor(4626.1694, grad_fn=<MseLossBackward>)
tensor(3399.8247, grad_fn=<MseLossBackward>)
tensor(24177.1230, grad_fn=<MseLossBackward>)
tensor(1377.2793, grad_fn=<MseLossBackward>)
tensor(2698.2913, grad_fn=<MseLossBackward>)
tensor(1742.1505, grad_fn=<MseLossBackward>)
tensor(16667.0996, grad_fn=<MseLossBackward>)
tensor(7199.3086, grad_fn=<MseLossBackward>)
tensor(2969.4246, grad_fn=<MseLossBackward>)
tensor(4496.7173, grad_fn=<MseLossBackward>)
tensor(3011.6709, grad_fn=<MseLossBackward>)
tensor(16270.9160, grad_fn=<MseLossBackward>)
tensor(23901.3828, grad_fn=<MseLossBackward>)
tensor(3179.5247, grad_fn=<MseLossBackward>)
tenso

## 4. Check Trained Parameters

In [260]:
y_pred = model.predict(torch.Tensor(normalize(test_X)))

In [261]:
y_pred

tensor([[84.9523],
        [87.3396],
        [70.6623],
        [ 9.3569],
        [54.0811]], grad_fn=<AddmmBackward>)

In [280]:
final = test.iloc[:,:2]
final['first_week_audience'] = final['first_week_audience']/10000
final['predicted'] = [x[0] for x in y_pred.detach().numpy()]
final  # 좋은 모델은 아닌듯 하다

Unnamed: 0,title,first_week_audience,predicted
0,대호,87.0751,84.952286
1,공조,142.8021,87.339554
2,플립,17.3194,70.662315
3,더보이,5.3313,9.356888
4,갤로우즈,7.9294,54.081051
