## Multiple_Linear_Regression

- 이번에는 multiple linear regression을 실제 데이터로 적용

## 1. Import Required Libraries

In [1]:
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.init as init

## 2. Generate Data

- 영화관람객 예측 모델링을 위한 전처리 완료된 데이터
- 영화장르, 스크린수 등을 변수로 활용하여 first_week_audience 예측
- train, test set을 7:3의 비율로 맞췄다

In [2]:
train = pd.read_csv("movie_data.csv",encoding = 'euc-kr')
test = pd.read_csv("test_data.csv", encoding='euc-kr')

In [3]:
train.head(1) # 첫번쨰 변수인 first_week_audience가 target변수 
              # shape (566, 26)

Unnamed: 0,title,first_week_audience,first_day_screen,drama,action,comedy,fantasy.SF,advanture,criminal.thrill,traditional,...,etc,series,holiday,actors,director,distributor_value,aud_18,korea,usa,first_day_aud
0,"무현,두도시이야기",25506,87,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,6374


In [4]:
train_X = train.iloc[:,2:]
train_y = train.iloc[:,1]
test_X = test.iloc[:,2:]
test_y = test.iloc[:,1]

## 3. Model & Optimizer

In [5]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class makeData(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return (self.X_data[index], self.y_data[index])
    
    def __len__(self):
        return len(self.y_data)
    
train_data = makeData(np.array(normalize(train_X)), np.array(train_y/10000)) # X랑 y값이 차이가 너무 많이 나서 y를 10000으로 나눴다. X도 Normalize해줬다
test_data = makeData(np.array(normalize(test_X)), np.array(test_y/10000))

In [6]:
list(train_data)

[(array([1.36479283e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 1.56872739e-04, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 1.56872739e-04, 0.00000000e+00, 9.99906838e-01]),
  2.5506),
 (array([1.24695390e-02, 1.68507284e-04, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 1.68507284e-04, 9.99922224e-01]),
  2.5749),
 (array([1.47800936e-02, 1.14574369e-04, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 

In [7]:
class MultipleLinearRegression(nn.Module):
    def __init__(self, feature_size):
        super(MultipleLinearRegression, self).__init__()
        self.Layer = nn.Linear(feature_size, 1)
        
        # 이 부분에서 x feature가 여러개 들어가기 때문에
        # simple linear regression과 달리 1을 feature_size로 바꿔준다.
        
    def forward(self, inputs):
        x = self.Layer(inputs)
        return x.squeeze(1)
    
    def predict(self,test_input):
        x = self.Layer(test_input)
        return x

In [8]:
train_X.shape[1]

24

## 3. Train

In [9]:
EPOCHS = 5000
BATCH_SIZE = 50
FEATURE_SIZE = train_X.shape[1]

train_batch = DataLoader(dataset = train_data, batch_size = BATCH_SIZE, shuffle=True)

In [10]:
list(train_batch)[0][0][0]

tensor([1.4641e-02, 2.0251e-05, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        2.0251e-05, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 4.0502e-05,
        0.0000e+00, 4.0502e-05, 0.0000e+00, 2.0251e-05, 0.0000e+00, 9.9989e-01],
       dtype=torch.float64)

In [11]:
len(list(train_batch)[0][0]) ## 저절로 squeeze되는가보다

50

In [12]:
for X_batch, y_batch in train_batch:
    print(X_batch)

tensor([[6.5792e-03, 0.0000e+00, 0.0000e+00,  ..., 7.5106e-06, 0.0000e+00,
         9.9998e-01],
        [7.6823e-02, 0.0000e+00, 0.0000e+00,  ..., 2.0651e-04, 0.0000e+00,
         9.9704e-01],
        [1.4487e-02, 2.5416e-05, 2.5416e-05,  ..., 0.0000e+00, 2.5416e-05,
         9.9990e-01],
        ...,
        [9.3166e-03, 0.0000e+00, 8.3110e-06,  ..., 0.0000e+00, 8.3110e-06,
         9.9996e-01],
        [1.4173e-02, 0.0000e+00, 1.9024e-05,  ..., 0.0000e+00, 1.9024e-05,
         9.9990e-01],
        [2.0475e-02, 0.0000e+00, 3.0743e-05,  ..., 0.0000e+00, 3.0743e-05,
         9.9979e-01]], dtype=torch.float64)
tensor([[4.4014e-03, 0.0000e+00, 2.7578e-06,  ..., 0.0000e+00, 2.7578e-06,
         9.9999e-01],
        [3.8400e-03, 0.0000e+00, 2.0210e-05,  ..., 0.0000e+00, 2.0210e-05,
         9.9999e-01],
        [9.3687e-03, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 1.0041e-05,
         9.9996e-01],
        ...,
        [3.0623e-02, 4.8608e-05, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
    

In [13]:
EPOCHS = 5000
BATCH_SIZE = 50
FEATURE_SIZE = train_X.shape[1]

model = MultipleLinearRegression(FEATURE_SIZE)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr = 0.1)

train_batch = DataLoader(dataset = train_data, batch_size = BATCH_SIZE, shuffle=True)
for epoch in range(EPOCHS):
    for X_batch, y_batch in train_batch:
        inputs = torch.Tensor(X_batch.float())
        targets = torch.Tensor(y_batch.float())
        model.zero_grad()
        y_pred = model(inputs)
        loss = criterion(y_pred, targets)
        loss.backward()
        optimizer.step()

    if epoch % 100 == 0:
        print(loss)        

tensor(5280.6592, grad_fn=<MseLossBackward>)
tensor(5200.2847, grad_fn=<MseLossBackward>)
tensor(26910.3398, grad_fn=<MseLossBackward>)
tensor(1918.9197, grad_fn=<MseLossBackward>)
tensor(4687.9399, grad_fn=<MseLossBackward>)
tensor(9021.4893, grad_fn=<MseLossBackward>)
tensor(10268.7354, grad_fn=<MseLossBackward>)
tensor(7918.2725, grad_fn=<MseLossBackward>)
tensor(12290.7881, grad_fn=<MseLossBackward>)
tensor(2482.4067, grad_fn=<MseLossBackward>)
tensor(12312.6943, grad_fn=<MseLossBackward>)
tensor(1852.4521, grad_fn=<MseLossBackward>)
tensor(6241.1782, grad_fn=<MseLossBackward>)
tensor(9007.9199, grad_fn=<MseLossBackward>)
tensor(7093.0835, grad_fn=<MseLossBackward>)
tensor(2793.9065, grad_fn=<MseLossBackward>)
tensor(10493.8105, grad_fn=<MseLossBackward>)
tensor(3613.3054, grad_fn=<MseLossBackward>)
tensor(16230.4727, grad_fn=<MseLossBackward>)
tensor(4164.7490, grad_fn=<MseLossBackward>)
tensor(19859.1797, grad_fn=<MseLossBackward>)
tensor(13403.1934, grad_fn=<MseLossBackward>)
te

## 4. Check Trained Parameters

In [14]:
y_pred = model.predict(torch.Tensor(normalize(test_X)))

In [15]:
y_pred

tensor([[84.8889],
        [87.2722],
        [70.6221],
        [ 9.4126],
        [54.0678]], grad_fn=<AddmmBackward>)

In [16]:
final = test.iloc[:,:2]
final['first_week_audience'] = final['first_week_audience']/10000
final['predicted'] = [x[0] for x in y_pred.detach().numpy()]
final  # 좋은 모델은 아닌듯 하다

Unnamed: 0,title,first_week_audience,predicted
0,대호,87.0751,84.888885
1,공조,142.8021,87.272217
2,플립,17.3194,70.622086
3,더보이,5.3313,9.412636
4,갤로우즈,7.9294,54.06776
