토양 오염의 수치를 예측

In [None]:
import pandas as pd 
import numpy as np
import torch 
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Seed the CPU generator (and every CUDA generator when training on GPU)
# with the same fixed value so runs are repeatable.
torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

In [None]:
# Load the training table from the competition input directory.
train = pd.read_csv('../input/2020soil/2020AI_soil_train.csv')

# Quick look at the data: the first ten rows, then the column summary.
print(train.head(n=10))
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() also emits a trailing "None" line.
print(train.info())


   Area    Cd    Cu    As    Hg    Pb     Zn    Ni  pollution level
0   1.0  2.13   9.4  2.82  0.01  14.6   58.0  10.9             1.60
1   2.0  1.83   9.3  7.29  0.06  17.2   36.8   9.6             3.03
2   3.0  0.90   8.3  6.90  0.02  13.4   31.4   9.8             1.69
3   4.0  0.00   1.5  2.03  0.00   4.2    7.9   2.7             0.00
4   5.0  1.57  14.6  3.39  0.00  30.6  127.4  11.8            11.04
5   6.0  0.40   0.0  0.00  0.00   9.3   30.3   3.2             0.00
6   7.0  1.87   9.8  1.89  0.01  21.6   59.8  12.9             1.34
7   8.0  2.40  10.2  4.44  0.00  20.2   61.5  10.8             1.87
8   9.0  1.40   4.3  0.00  0.00  19.6   50.2   3.1             0.87
9  10.0  0.60   5.2  0.00  0.00  17.8   35.4   5.1             0.07
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1269 entries, 0 to 1268
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             1269 non-null   float64
 1   Cd 

In [None]:
# Training hyperparameters shared by the optimizer, the epoch loop and the
# DataLoader defined in later cells.
learning_rate = 1e-3    # step size handed to the optimizer
training_epoch = 1_000  # number of full passes over the training loader
batch_size = 50         # samples per mini-batch

In [None]:
# Inputs: every column except the first (Area) and the last (the target).
# Target: the last column, selected with a list so it stays 2-D (N, 1).
# Each slice goes DataFrame -> ndarray -> float32 tensor.
x_train = torch.FloatTensor(np.array(train.iloc[:, 1:-1]))
y_train = torch.FloatTensor(np.array(train.iloc[:, [-1]]))

# Sanity-check the resulting shapes.
print(x_train.shape)
print(y_train.shape)

torch.Size([1269, 7])
torch.Size([1269, 1])


In [None]:
# Pair each feature row with its target so they are batched together.
train_dataset = torch.utils.data.TensorDataset(x_train, y_train)

# Mini-batch iterator over the training pairs: reshuffled every epoch, and the
# final incomplete batch is dropped so every batch has exactly batch_size rows.
data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)



layer 5개 , 가중치 초기화: xavier uniform (linear1, linear3, linear5) 과 kaiming normal (linear2, linear4) 을 번갈아 적용

In [None]:
# Five fully connected layers: 7 input features -> 4 -> 4 -> 4 -> 4 -> 1 output.
# Bias terms are enabled (the nn.Linear default).
linear1 = nn.Linear(in_features=7, out_features=4)
linear2 = nn.Linear(in_features=4, out_features=4)
linear3 = nn.Linear(in_features=4, out_features=4)
linear4 = nn.Linear(in_features=4, out_features=4)
linear5 = nn.Linear(in_features=4, out_features=1)

# Re-initialise the weights in place, alternating schemes layer by layer:
# Xavier-uniform on layers 1, 3 and 5, Kaiming-normal on layers 2 and 4.
# The calls are kept in layer order so the random stream is consumed the same way.
nn.init.xavier_uniform_(linear1.weight)
nn.init.kaiming_normal_(linear2.weight)
nn.init.xavier_uniform_(linear3.weight)
nn.init.kaiming_normal_(linear4.weight)
nn.init.xavier_uniform_(linear5.weight)

# One ReLU module, reused between the layers when the model is assembled.
relu = nn.ReLU()

activation function - relu

In [None]:
# Chain the five linear layers with a ReLU after each of the first four;
# the last layer is left linear so the network emits a raw regression value.
model = nn.Sequential(
    linear1, relu,
    linear2, relu,
    linear3, relu,
    linear4, relu,
    linear5,
)
# Move the parameters to the selected device.
model = model.to(device)

In [None]:
# Mean-squared-error criterion, placed on the same device as the model.
loss = nn.MSELoss()
loss = loss.to(device)

# Adam over all model parameters, using the learning rate configured above.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# Number of mini-batches the loader yields per epoch; used to turn the
# per-batch losses into an epoch average.
total_batch = len(data_loader)

for epoch in range(training_epoch):
    # Running mean of the batch losses for this epoch, kept as a plain float.
    avg_cost = 0.0
    for X, Y in data_loader:
        # Move the batch to the same device as the model.
        X = X.to(device)
        Y = Y.to(device)

        # Standard step: clear old gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        hypothesis = model(X)
        cost = loss(hypothesis, Y)
        cost.backward()
        optimizer.step()

        # .item() extracts the scalar as a Python number. Accumulating the
        # tensor itself would make avg_cost a tensor that still carries
        # autograd history for every batch of the epoch.
        avg_cost += cost.item() / total_batch

    # NOTE: the epoch index is formatted with a float spec, so it prints as
    # e.g. "0.0000"; left unchanged to keep the log format stable.
    print('epoch {:.4f} , cost = {:.6f}'.format(epoch, avg_cost))
print('learning finished!')

epoch 0.0000 , cost = 442.114960
epoch 1.0000 , cost = 422.272552
epoch 2.0000 , cost = 408.077881
epoch 3.0000 , cost = 389.538116
epoch 4.0000 , cost = 374.082031
epoch 5.0000 , cost = 343.762390
epoch 6.0000 , cost = 324.111633
epoch 7.0000 , cost = 305.528076
epoch 8.0000 , cost = 296.805603
epoch 9.0000 , cost = 285.578766
epoch 10.0000 , cost = 273.811554
epoch 11.0000 , cost = 261.229034
epoch 12.0000 , cost = 246.260071
epoch 13.0000 , cost = 237.924454
epoch 14.0000 , cost = 221.782715
epoch 15.0000 , cost = 214.984161
epoch 16.0000 , cost = 203.158798
epoch 17.0000 , cost = 194.067825
epoch 18.0000 , cost = 184.702423
epoch 19.0000 , cost = 181.770020
epoch 20.0000 , cost = 175.274231
epoch 21.0000 , cost = 172.692993
epoch 22.0000 , cost = 168.782562
epoch 23.0000 , cost = 163.804184
epoch 24.0000 , cost = 163.716248
epoch 25.0000 , cost = 160.436783
epoch 26.0000 , cost = 158.095871
epoch 27.0000 , cost = 154.154251
epoch 28.0000 , cost = 152.937973
epoch 29.0000 , cost = 1

In [None]:
# Load the held-out table and show its column summary.
# info() prints the report itself and returns None, so print() also emits "None".
test = pd.read_csv('../input/2020soil/2020_soil_test.csv')
print(test.info())

# Drop the first column (Area) so the remaining columns line up with the
# training inputs, then convert DataFrame -> ndarray -> float32 tensor on the
# model's device.
test = torch.FloatTensor(np.array(test.iloc[:, 1:])).to(device)

# Inference only: no gradients need to be tracked.
with torch.no_grad():
    predict = model(test)
# Bare expression so the notebook displays the prediction tensor.
predict

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227 entries, 0 to 226
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Area    227 non-null    int64  
 1   Cd      227 non-null    float64
 2   Cu      227 non-null    float64
 3   As      227 non-null    float64
 4   Hg      227 non-null    float64
 5   Pb      227 non-null    float64
 6   Zn      227 non-null    float64
 7   Ni      227 non-null    float64
dtypes: float64(7), int64(1)
memory usage: 14.3 KB
None


tensor([[5.3503e+01],
        [4.0738e+01],
        [6.9775e-01],
        [4.9623e+00],
        [3.8435e+01],
        [7.5614e-01],
        [3.8577e+00],
        [1.1075e+00],
        [1.0077e+00],
        [1.1640e+00],
        [7.1375e-01],
        [9.6755e-01],
        [8.5854e-01],
        [1.3490e+00],
        [4.2716e+01],
        [6.5515e+01],
        [1.1407e+01],
        [5.2279e-01],
        [1.3994e+01],
        [4.8799e+01],
        [1.0024e+01],
        [5.9912e+01],
        [1.1533e+01],
        [7.7240e-01],
        [1.4459e+01],
        [2.4995e+00],
        [1.9215e+00],
        [2.6157e+01],
        [1.8412e+01],
        [3.7541e+00],
        [2.5015e+00],
        [2.8431e+01],
        [1.9298e+01],
        [7.7034e-01],
        [1.2270e+01],
        [9.4709e+00],
        [2.9753e+00],
        [8.9713e-01],
        [8.5968e-01],
        [1.6677e+00],
        [2.0715e+00],
        [1.4501e+00],
        [8.5859e-01],
        [6.8813e-01],
        [1.6005e+00],
        [1