In [2]:
import numpy as np
import torch as T
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
device = 'cpu'

In [3]:
import sklearn

In [4]:
seed = 0xEDA
T.manual_seed(seed)
np.random.seed(seed)

In [7]:
data = pd.read_csv('Shanghai_HMT_2010.csv')

In [8]:
print("data.shape = ", data.shape)
data.head(10)

data.shape =  (52584, 17)


Unnamed: 0,No,year,month,day,hour,season,PM_Jingan,PM_US Post,PM_Xuhui,DEWP,HUMI,PRES,TEMP,cbwd,Iws,precipitation,Iprec
0,1,2010,1,1,0,4,,,,-6.0,59.48,1026.1,1.0,cv,1.0,0.0,0.0
1,2,2010,1,1,1,4,,,,-6.0,59.48,1025.1,1.0,SE,2.0,0.0,0.0
2,3,2010,1,1,2,4,,,,-7.0,59.21,1025.1,0.0,SE,4.0,0.0,0.0
3,4,2010,1,1,3,4,,,,-6.0,63.94,1024.0,0.0,SE,5.0,0.0,0.0
4,5,2010,1,1,4,4,,,,-6.0,63.94,1023.0,0.0,SE,8.0,0.0,0.0
5,6,2010,1,1,5,4,,,,-7.0,59.21,1023.0,0.0,SE,11.0,0.0,0.0
6,7,2010,1,1,6,4,,,,-6.0,59.48,1023.0,1.0,SE,14.0,0.0,0.0
7,8,2010,1,1,7,4,,,,-5.0,64.18,1023.0,1.0,SE,17.0,0.0,0.0
8,9,2010,1,1,8,4,,,,-3.0,69.43,1023.0,2.0,SE,20.0,0.0,0.0
9,10,2010,1,1,9,4,,,,-2.0,64.9,1023.0,4.0,SE,23.0,0.0,0.0


In [9]:
data = data.drop(['PM_Jingan','PM_US Post','PM_Xuhui'],axis=1).dropna()

In [10]:
data_num = data[['DEWP','HUMI','TEMP','Iws','precipitation']]
#нормировка
data_num = ((data_num - data_num.mean())/data_num.std()).dropna(axis=1)
data_num = np.array(data_num)

In [11]:
enc = OneHotEncoder()
cbwd = enc.fit_transform(data[['cbwd']]).toarray()
hour = enc.fit_transform(data[['hour']]).toarray()
month = enc.fit_transform(data[['month']]).toarray()

In [12]:
X = np.hstack([data_num,cbwd,hour,month])
median = np.median(np.array(data['PRES']))
Y = (np.array(data['PRES'])>=median).astype('int')

In [13]:
def split(X,Y,test_percent=0.2):
    test_size = int(test_percent*len(Y))
    indices = np.random.permutation(np.arange(len(Y)))
    train_indices = indices[test_size:]
    test_indices = indices[:test_size]
    train_x = X[train_indices]
    train_y = Y[train_indices]
    test_x = X[test_indices]
    test_y = Y[test_indices]
    return train_x,test_x,train_y,test_y
    

In [18]:
class LogisticRegression:
    def __init__(self,n_features,reg='none',lr=1e-2,lamb=0.01,n_epochs=100):
        self.n_features = n_features
        lo = -0.01; hi = 0.01
        w = T.rand((n_features), dtype=T.float32, requires_grad=True).to(device)
        w = (hi - lo) * w + lo
        w.grad = T.zeros(n_features)
        w.retain_grad()
        self.w = w
        b = T.zeros((1), dtype=T.float32, requires_grad=True).to(device)
        b.grad = T.zeros(1)
        b.retain_grad()
        self.b = b
        self.reg = reg
        self.lr = lr
        self.lamda = lamb
        self.n_epochs = 100
    def fit(self,X,y):
        indices = np.arange(len(y))
        min_delta = 2e-5
        tol_counter = 0
        max_tol = 3
        prev_loss = T.tensor(float('inf'))
        loss = T.tensor(0, dtype=T.float32, requires_grad=True).to(device)
        loss.grad = T.tensor(0,dtype=T.float32)
        loss.retain_grad()
        #print(loss.size())
        for epoch in range(0, self.n_epochs):
            np.random.shuffle(indices)
            batches = self.create_batches(indices)
            tot_loss = 0
            for batch in batches:
                x = train_x[batch]
                target = train_y[batch]
                oupt = self.forward(x)
                loss = (oupt - target).pow(2).mean()
                if self.reg=='l2':
                    loss+= self.lamda*T.norm(self.w, p=2) # l2 reg
                elif self.reg=='l1':
                    loss+= self.lamda*T.norm(self.w, p=1) # l1 reg
                tot_loss+=loss
                loss.backward(retain_graph=True)  # compute gradients
                self.w.data += -1 * self.lr * self.w.grad.data
                self.b.data += -1 * self.lr * self.b.grad.data
                self.w.grad = T.zeros(self.n_features)
                self.b.grad = T.zeros(1)
                loss.grad = T.tensor(0,dtype=T.float32)
            mean_loss = (tot_loss / len(batches)).item()
            if prev_loss - mean_loss < min_delta:
                tol_counter+=1
            else:
                tol_counter=0
            if tol_counter > max_tol:
                break
            prev_loss = mean_loss
            if epoch % 10 == 0:
                print("epoch = %4d " % epoch, end="")
                print("   loss = %6.4f" % (mean_loss))

        
    def forward(self,x):
        z = x@self.w
        z += self.b
        p = 1 / (1 + T.exp(-z))
        return p
    
    def create_batches(self,X,batch_size=128):
        batches = []
        for i in range(0,len(X)//batch_size):
            batches.append(X[i*batch_size:(i+1)*batch_size])
        if len(X)%batch_size>0:
            batches.append(X[len(X)//batch_size*batch_size:])
        return batches        
    
    def predict(self,X):
        return (self.predict_proba(X)>0.5).type(T.uint8) 
    
    def predict_proba(self,X):
        batches = self.create_batches(X)
        preds = []
        for b in batches:
            preds.append(self.forward(b))
        return T.cat(preds)


In [15]:
train_x, test_x, train_y, test_y = split(X,Y)


In [16]:
from sklearn.linear_model import LogisticRegression as SL_LogReg
sklearn_logreg = SL_LogReg()
sklearn_logreg.fit(train_x,train_y)
print('Accuracy: %.4f' %(np.sum(np.equal(sklearn_logreg.predict(test_x),test_y))/len(test_y)))

Accuracy: 0.9225


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
print("Превратим в тензоры:")
print(train_x)
print(train_y)

train_x = T.tensor(train_x, dtype=T.float32).to(device)
train_y = T.tensor(train_y, dtype=T.long).to(device)

Превратим в тензоры:
[[ 1.42382658  0.83140845  1.157129   ...  0.          0.
   0.        ]
 [-0.08645215  0.72921794 -0.43931429 ...  0.          0.
   0.        ]
 [ 0.41697409  0.48249277  0.19926303 ...  0.          0.
   0.        ]
 ...
 [ 0.2156036   1.05837302 -0.22645519 ...  0.          0.
   1.        ]
 [ 1.22245608  1.41519294  0.73141079 ...  0.          0.
   0.        ]
 [ 1.42382658  0.56548727  1.26355855 ...  0.          0.
   0.        ]]
[0 0 0 ... 1 0 0]


In [19]:
logreg = LogisticRegression(n_features=train_x.shape[1],reg='l2')
logreg.fit(train_x,train_y)

epoch =    0    loss = 0.1848
epoch =   10    loss = 0.1041
epoch =   20    loss = 0.0997
epoch =   30    loss = 0.0978
epoch =   40    loss = 0.0968
epoch =   50    loss = 0.0962
epoch =   60    loss = 0.0958
epoch =   70    loss = 0.0957
epoch =   80    loss = 0.0956


In [20]:
test_x = T.tensor(test_x, dtype=T.float32).to(device)
test_y = T.tensor(test_y, dtype=T.long).to(device)

In [21]:
print('Accuracy: %.4f'%(sum(logreg.predict(test_x)==test_y)/len(test_y)))

Accuracy: 0.9115
