## biggest lesson I learned
## today i try to create data with a linear function, but in the end when doing pytorch model training, it didn't do well and very quickly the loss becomes NA
## this is due to my Xs are not scaled, the issue disappear after I use a StandardScaler()

In [20]:
from torch.utils.data import Dataset
import pandas as pd
import random

In [21]:
## y=5x1 + 17*x2 + 98*x3 + 33 + error
def create_data(n_records, coefficients=[33, 5, 17, 98], column_names=["x1","x2","x3","y"], error_sigma=1):
    
    df= pd.DataFrame(columns=column_names)
    for i in range(n_records):
        y=coefficients[0]
        values=[]
        for c in coefficients[1:]:
            x=random.uniform(10,100)
            values.append(x)
            y+=c*x
        y+=random.gauss(0, error_sigma)
        values.append(y)
        df.loc[len(df)]= values
    
    return df

In [22]:
# df= create_data(2000)
df = create_data(2000, coefficients=[1, 10], column_names=["x1","target"], error_sigma=0)

In [23]:
df.head()

Unnamed: 0,x1,target
0,28.567085,286.670852
1,40.261511,403.615111
2,68.690047,687.900466
3,92.98271,930.827098
4,54.192242,542.922425


## simple test with sklearn LinearRegression

In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[[ col for col in df.columns if col!="target" ]], df["target"], test_size=0.4, random_state=7251)
print(X_train.shape)
print(y_train.shape)
lrg= LinearRegression()
# lrg.fit(df[["x1","x2","x3"]], df["y"])
lrg.fit(X_train, y_train)

(1200, 1)
(1200,)


In [25]:
lrg.coef_, lrg.intercept_

(array([10.]), 1.0000000000001137)

In [26]:
lrg.score(X_train, y_train)

1.0

## use Pytorch

In [27]:
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch


In [28]:
class MyLinearDataset(Dataset):
    def __init__(self, xs, ys):
        self.xs=xs
        self.ys=ys
    def __len__(self):
        return len(self.ys)
    
    def __getitem__(self, i):
        return self.xs[i], self.ys[i]

In [29]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1200 entries, 409 to 1719
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x1      1200 non-null   float64
dtypes: float64(1)
memory usage: 18.8 KB


In [30]:
type(X_train), type(y_train)

(pandas.core.frame.DataFrame, pandas.core.series.Series)

In [31]:
X_train.shape, y_train.shape

((1200, 1), (1200,))

# you must scale, scale, scale

In [32]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X_train)
X_train_scaled= scaler.transform(X_train)
X_test_scaled= scaler.transform(X_test)

# 3 big mistakes i made
### 1. change to numpy arrays with(.values)
### 2. need to convert from float64 to float32
### 3. reshape y to have same dimention as x  !!!! (if you don't, your loss will not converge well)

In [33]:
## NOTE: after scaling,  x_train_scaled is not a dataframe but a numpy array, so no need .values
## NOTE: try remove the reshape(-1,1) from y, you'll see no error but model training result bad
data_train=MyLinearDataset(X_train_scaled.astype("float32"), y_train.values.astype("float32").reshape(-1,1))
data_test=MyLinearDataset(X_test_scaled.astype("float32"), y_test.values.astype("float32").reshape(-1,1))


In [39]:
train_dataloader= DataLoader(data_train, batch_size=64, shuffle=True )
test_dataloader= DataLoader(data_test,  batch_size=128 )

In [35]:
class LinRegModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LinRegModel, self).__init__()
        self.li = nn.Linear(input_dim, output_dim)
        
    def forward(self, x):
        y = self.li(x)
        return y

In [36]:
model = LinRegModel(1,1)
loss_func = nn.MSELoss()
optimizer= torch.optim.SGD(model.parameters(), lr=0.1)

In [37]:
def train(n_epoch, data_loader):
    model.train()
    for i_epoch in range(n_epoch):
        for i_batch, (x,y) in enumerate(data_loader):
            y_pred = model(x)
            loss = loss_func(y_pred, y)
            # print(f"y_pred:{y_pred}, x:{x}, y:{y}")
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            if i_batch%10==0:
                print(f"batch {i_batch} : {loss.item()}")

In [38]:
train(5, train_dataloader)

batch 0 : 378396.03125
batch 10 : 5443.78125
batch 0 : 70.58527374267578
batch 10 : 0.7217512726783752
batch 0 : 0.015694081783294678
batch 10 : 0.0002060800907202065
batch 0 : 2.7992555260425434e-06
batch 10 : 3.9614860725123435e-08
batch 0 : 1.2923010217491537e-08
batch 10 : 1.3403223420027643e-08
batch 0 : 1.4425495464820415e-08
batch 10 : 1.433090801583603e-08
batch 0 : 1.359785528620705e-08
batch 10 : 1.3345015759114176e-08
batch 0 : 1.3788849173579365e-08
batch 10 : 1.2158125173300505e-08
batch 0 : 1.2231794244144112e-08
batch 10 : 1.5723344404250383e-08
batch 0 : 1.464741217205301e-08
batch 10 : 1.4111719792708755e-08


In [141]:
model

LinRegModel(
  (li): Linear(in_features=1, out_features=1, bias=True)
)

In [140]:
for para in model.parameters():
    print(para)

Parameter containing:
tensor([[263.4336]], requires_grad=True)
Parameter containing:
tensor([547.9730], requires_grad=True)


In [48]:
from sklearn.metrics import mean_squared_error
def evaluate(data_loader):
    n_total=len(data_loader)
    model.eval()
    mse=0
    with torch.no_grad():
        for ibatch, (X,y) in enumerate(data_loader):
            y_pred= model(X).numpy()
            n = len(y_pred)
            mse += mean_squared_error(y, y_pred)*n
    mse/=n_total
    print(f"mse:{mse}")
            
            

In [49]:
evaluate(test_dataloader)

mse:1.6200771954442774e-06


## example from https://towardsdatascience.com/linear-regression-with-pytorch-eb6dedead817

In [None]:
import numpy as np
import  matplotlib.pyplot as plt
# create dummy data for training
x_values = [i for i in range(11)]
x_train = np.array(x_values, dtype=np.float32)
x_train = x_train.reshape(-1, 1)

y_values = [2*i + 1 for i in x_values]
y_train = np.array(y_values, dtype=np.float32)
y_train = y_train.reshape(-1, 1)

from torch.autograd import Variable
class linearRegression(torch.nn.Module):
    def __init__(self, inputSize, outputSize):
        super(linearRegression, self).__init__()
        self.linear = torch.nn.Linear(inputSize, outputSize)

    def forward(self, x):
        out = self.linear(x)
        return out
inputDim = 1        # takes variable 'x' 
outputDim = 1       # takes variable 'y'
learningRate = 0.01 
epochs = 10

# model = linearRegression(inputDim, outputDim)
model = LinRegModel(inputDim, outputDim)
# if torch.cuda.is_available():
#     model.cuda()
criterion = torch.nn.MSELoss() 
optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)

for epoch in range(epochs):
    # Converting inputs and labels to Variable
    # if torch.cuda.is_available() and False:
    #     inputs = Variable(torch.from_numpy(x_train).cuda())
    #     labels = Variable(torch.from_numpy(y_train).cuda())
    # else:
    #     inputs = Variable(torch.from_numpy(x_train))
    #     labels = Variable(torch.from_numpy(y_train))
    inputs = torch.from_numpy(x_train)
    labels = torch.from_numpy(y_train)

    # Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients
    optimizer.zero_grad()

    # get output from the model, given the inputs
    outputs = model(inputs)

    # get loss for the predicted output
    loss = criterion(outputs, labels)
    print(loss)
    # get gradients w.r.t to parameters
    loss.backward()

    # update parameters
    optimizer.step()

    print('epoch {}, loss {}'.format(epoch, loss.item()))

In [None]:
with torch.no_grad(): # we don't need gradients in the testing phase
    if torch.cuda.is_available():
        predicted = model(Variable(torch.from_numpy(x_train).cuda())).cpu().data.numpy()
    else:
        predicted = model(Variable(torch.from_numpy(x_train))).data.numpy()
    print(predicted)

plt.clf()
plt.plot(x_train, y_train, 'go', label='True data', alpha=0.5)
plt.plot(x_train, predicted, '--', label='Predictions', alpha=0.5)
plt.legend(loc='best')
plt.show()