In [1]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 读取数据

In [2]:
data = pd.read_csv('./HR.csv')

In [3]:
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,part,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   part                   14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [5]:
data.part.unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [6]:
data.salary.unique()

array(['low', 'medium', 'high'], dtype=object)

# 数据预处理

In [7]:
# 对于离散的字符串, 有两种处理方式, 1. 转化成数字. 2. 进行one-hot编码.
data = data.join(pd.get_dummies(data.part)).join(pd.get_dummies(data.salary))

In [8]:
# 把part和salary删掉. 
data.drop(columns=['part', 'salary'], inplace=True)

In [11]:
data.head(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,IT,RandD,...,hr,management,marketing,product_mng,sales,support,technical,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [12]:
data.left.value_counts()

0    11428
1     3571
Name: left, dtype: int64

In [13]:
11428 / (11428 + 3571)

0.7619174611640777

In [14]:
# SMOTE 
Y_data = data.left.values.reshape(-1, 1)

In [15]:
Y_data

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]], dtype=int64)

In [16]:
Y = torch.from_numpy(Y_data).type(torch.FloatTensor)

In [17]:
Y

tensor([[1.],
        [1.],
        [1.],
        ...,
        [1.],
        [1.],
        [1.]])

In [18]:
[c for c in data.columns if c != 'left']

['satisfaction_level',
 'last_evaluation',
 'number_project',
 'average_montly_hours',
 'time_spend_company',
 'Work_accident',
 'promotion_last_5years',
 'IT',
 'RandD',
 'accounting',
 'hr',
 'management',
 'marketing',
 'product_mng',
 'sales',
 'support',
 'technical',
 'high',
 'low',
 'medium']

In [19]:
X_data =  data[[c for c in data.columns if c != 'left']].values

In [20]:
X = torch.from_numpy(X_data).type(torch.FloatTensor)

In [21]:
X.shape

torch.Size([14999, 20])

# 常用的创建模型的方法

In [22]:
# pytorch中最常用的一种创建模型的方式
# 子类的写法. 
from torch import nn

In [23]:
class HRModel(nn.Module):
    def __init__(self):
        # 先调用父类的方法
        super().__init__()
        # 定义你这个网络中会用到的东西. 
        self.lin_1 = nn.Linear(20, 64)
        self.lin_2 = nn.Linear(64, 64)
        self.lin_3 = nn.Linear(64, 1)
        self.activate = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, input):
        # 定义前向传播
        x = self.lin_1(input)
        x = self.activate(x)
        x = self.lin_2(x)
        x = self.activate(x)
        x = self.lin_3(x)
        x = self.sigmoid(x)
        return x

In [24]:
lr = 0.001

In [25]:
# 定义获取模型的函数和优化器
def get_model():
    model = HRModel()
    return model, torch.optim.Adam(model.parameters(), lr=lr)

In [27]:
# 定义损失函数
loss_fn = nn.BCELoss()

In [28]:
model, opt = get_model()

In [29]:
batch_size = 64
steps = len(data) // batch_size
epochs = 100

In [30]:
# 训练过程
for epoch in range(epochs):
    for i in range(steps):
        start = i * batch_size
        end = start + batch_size
        x = X[start: end]
        y = Y[start: end]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        opt.zero_grad()
        loss.backward()
        opt.step()
print('epoch:', epoch, '     ', 'loss: ', loss_fn(model(X), Y))

epoch: 99       loss:  tensor(0.5491, grad_fn=<BinaryCrossEntropyBackward0>)


In [31]:
((model(X).data.numpy() > 0.5) == Y.numpy()).mean()

0.7619174611640777

# 使用dataset重构

len(data) data.__len__()
__getitem__() 对应根据索引取数据. data[0]  = data.__getitem__(0)

pytorch中有一个Dataset类, 可以把任意的具有__len__和__getitem__的对象包装成Dataset对象. 

Dataset自动取数据

In [32]:
from torch.utils.data import TensorDataset

In [33]:
HRdataset = TensorDataset(X, Y)

In [34]:
HRdataset

<torch.utils.data.dataset.TensorDataset at 0x27365baa550>

In [35]:
model, opt = get_model()

In [36]:
# 重写训练过程
for epoch in range(epochs):
    for step in range(steps):
        # 取数据不一样了
        x, y = HRdataset[step * batch_size: step * batch_size + batch_size]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        opt.zero_grad()
        loss.backward()
        opt.step()
print('epoch:', epoch, '     ', 'loss: ', loss_fn(model(X), Y))

epoch: 99       loss:  tensor(0.5490, grad_fn=<BinaryCrossEntropyBackward0>)


# 使用DataLoader重构

In [37]:
# dataloader可以自动分批取数据
# dataloader是由dataset创建出来的. 
# 有了dataloader就不需要按切片取数据
from torch.utils.data import DataLoader

In [38]:
HR_ds = TensorDataset(X, Y)
HR_dl = DataLoader(HR_ds, batch_size=batch_size)

In [39]:
# 现在取数据就方便了. 
for x, y in HR_dl:
    print(x, y)

tensor([[0.3800, 0.5300, 2.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.8000, 0.8600, 5.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.1100, 0.8800, 7.0000,  ..., 0.0000, 0.0000, 1.0000],
        ...,
        [0.1100, 0.9300, 7.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.1000, 0.9500, 6.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.3600, 0.5600, 2.0000,  ..., 0.0000, 0.0000, 1.0000]]) tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],


        [0.5200, 0.7300, 4.0000,  ..., 0.0000, 0.0000, 1.0000]]) tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]])
tensor([[0.9600, 0.7700, 3.0000,  ...,

tensor([[0.8100, 0.9100, 4.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.4200, 0.5600, 2.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.1100, 0.8700, 6.0000,  ..., 0.0000, 0.0000, 1.0000],
        ...,
        [0.9000, 0.9300, 4.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.7500, 0.8300, 3.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.8100, 0.6400, 4.0000,  ..., 0.0000, 0.0000, 1.0000]]) tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],


In [40]:
model, opt = get_model()


In [41]:
for epoch in range(epochs):
    for x, y in HR_dl:
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        opt.zero_grad()
        loss.backward()
        opt.step()
print('epoch:', epoch, '     ', 'loss: ', loss_fn(model(X), Y))

epoch: 99       loss:  tensor(0.5489, grad_fn=<BinaryCrossEntropyBackward0>)


# 添加验证

In [42]:
# 需要分割出训练数据和测试数据. 
# 我们刚才是把所有数据作为训练数据.
from sklearn.model_selection import train_test_split

切割数据--> 分别创建训练数据和测试数据的dataloader--> 训练过程 --> 校验过程

In [43]:
train_x, test_x, train_y, test_y = train_test_split(X_data, Y_data, random_state=5)

In [44]:
train_x.shape
test_x.shape

(3750, 20)

In [45]:
display(train_x.shape,test_x.shape)

(11249, 20)

(3750, 20)

In [46]:
# 转化成tensor
train_x = torch.from_numpy(train_x).type(torch.FloatTensor)
test_x = torch.from_numpy(test_x).type(torch.FloatTensor)

train_y = torch.from_numpy(train_y).type(torch.FloatTensor)
test_y = torch.from_numpy(test_y).type(torch.FloatTensor)

In [47]:
# 变成dataset和dataloader
train_ds = TensorDataset(train_x, train_y)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

test_ds = TensorDataset(test_x, test_y)
test_dl = DataLoader(test_ds, batch_size=batch_size * 2, shuffle=True)

In [48]:
# 定义计算准确率的函数
def accuracy(out, yb):
    return ((out.data.numpy() > 0.5) == yb.numpy()).mean()

In [49]:
# pytorch中有训练模式, 和测试/推理模式. model.train(), model.eval()
# 训练模式和测试模型对一些特殊层会有不同的表现. 比如, dropout, bn等. 
epochs = 1000
model , opt = get_model()

for epoch in range(epochs + 1):
    # 训练的时候, 调到训练模式
    model.train()
    for xb, yb in train_dl:
        pred = model(xb)
        loss = loss_fn(pred, yb)
        
        loss.backward()
        opt.zero_grad()
        opt.step()
        
    # 每训练100次输出一次测试结果
    if epoch % 100 == 0:
        model.eval()
        with torch.no_grad():
            # 计算测试损失
            valid_loss = sum([loss_fn(model(x), y) for x,y in test_dl])
            acc_mean = np.mean([accuracy(model(x), y) for x, y in test_dl])
        print(epoch, valid_loss / len(test_dl), acc_mean)

0 tensor(0.6202) 0.7551260964912281
100 tensor(0.6187) 0.7538925438596491
200 tensor(0.6228) 0.7538925438596491
300 tensor(0.6266) 0.7551260964912281
400 tensor(0.6184) 0.7545093201754386
500 tensor(0.6190) 0.7557428728070176
600 tensor(0.6125) 0.7545093201754386
700 tensor(0.6186) 0.7520422149122807
800 tensor(0.6193) 0.7526589912280702
900 tensor(0.6178) 0.7538925438596491
1000 tensor(0.6242) 0.7575932017543859


# 封装各个模块

In [50]:
# 按批次计算损失
def loss_batch(model, loss_func, xb, yb, opt=None):
    loss = loss_func(model(xb), yb)
    
    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()
    return loss.item(), len(xb)

In [51]:
def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    for epoch in range(epochs):
        model.train()
        for xb, yb in train_dl:
            loss_batch(model, loss_func, xb, yb, opt)
            
        model.eval()
        with torch.no_grad():
            losses, nums = zip(*[loss_batch(model, loss_func, xb, yb) for xb, yb in valid_dl])
        val_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)
        acc_mean = np.mean([accuracy(model(x), y) for x, y in valid_dl])
        print(epoch, val_loss, acc_mean)

In [52]:
def get_data(train_ds, valid_ds, batch_size):
    return (DataLoader(train_ds, batch_size=batch_size, shuffle=True), DataLoader(valid_ds, batch_size=batch_size * 2))

In [53]:
# 定义计算准确率的函数
def accuracy(out, yb):
    return ((out.data.numpy() > 0.5) == yb.numpy()).mean()

In [55]:
loss_fn

BCELoss()

In [56]:
# 整个训练校验过程就可以用三行代码来执行
# 获取数据
train_dl, valid_dl = get_data(train_ds, test_ds, batch_size)
model, opt = get_model()
fit(epochs, model, loss_fn, opt, train_dl, valid_dl)

0 0.55042811683019 0.7545093201754386
1 0.561672063477834 0.7545093201754386
2 0.45167796998023985 0.7479029605263158
3 0.430368107620875 0.848999451754386
4 0.41532673211097715 0.7516447368421052
5 0.3531622886180878 0.8239994517543859
6 0.3524248302300771 0.8603892543859649
7 0.3797553614616394 0.8498766447368421
8 0.31472286097208657 0.8790433114035088
9 0.31343443698883056 0.8780975877192982
10 0.3030687873840332 0.8876370614035088
11 0.3348326402505239 0.8661184210526316
12 0.29838120489120484 0.8866913377192983
13 0.2922196108818054 0.8865953947368421
14 0.30615316615104676 0.8796600877192983
15 0.2931689449310303 0.8913788377192983
16 0.2784871667226156 0.894764254385965
17 0.3237423828125 0.8733141447368421
18 0.2721423300743103 0.893983004385965
19 0.3459079308986664 0.8592516447368421
20 0.272909268395106 0.8915433114035088
21 0.32976490023930866 0.8417077850877193
22 0.27872509123484296 0.8907620614035088
23 0.3045408615907033 0.8659265350877193
24 0.2787865794340769 0.89590

197 0.14039829874634743 0.9541666666666667
198 0.12665438042680421 0.9609375
199 0.12572016956806184 0.9600603070175439
200 0.11730374170740446 0.96484375
201 0.12004383370081584 0.9637061403508772
202 0.12118932812412579 0.9644873903508772
203 0.12695398279825845 0.9631853070175438
204 0.12056169945597649 0.9661458333333334
205 0.12445366378227869 0.9626644736842105
206 0.12459450637300809 0.96796875
207 0.1269550561308861 0.9651041666666667
208 0.1211838273247083 0.9642269736842105
209 0.13554833063085875 0.9625
210 0.13638122116327286 0.95859375
211 0.12173954322139421 0.9638020833333333
212 0.12408838707009952 0.9669270833333333
213 0.12073380325734616 0.9661458333333334
214 0.15317485822439195 0.9528645833333333
215 0.1262709829290708 0.965625
216 0.12949422912200292 0.9602247807017543
217 0.1325968513816595 0.9638020833333333
218 0.1231701291402181 0.9656935307017543
219 0.13603893097043038 0.9609375
220 0.12758221166133882 0.9653645833333333
221 0.1402527823448181 0.961458333333

396 0.14391569293340048 0.9567434210526317
397 0.14407290239334106 0.9629934210526316
398 0.14773776917854944 0.9585663377192983
399 0.12923558081388473 0.9662143640350876
400 0.1389783552110195 0.9591831140350877
401 0.14140126896401248 0.965625
402 0.13445125737984975 0.9644873903508772
403 0.13808887655735017 0.9627604166666667
404 0.129313410170873 0.9638706140350877
405 0.1308664032260577 0.9644873903508772
406 0.1383558899641037 0.9586622807017543
407 0.1362910778860251 0.9631853070175438
408 0.13385124916235605 0.9653645833333333
409 0.12929196996688844 0.963610197368421
410 0.14694942498604457 0.9621436403508772
411 0.1274766582429409 0.9663103070175438
412 0.14059188992480437 0.96328125
413 0.13698353854020437 0.9637061403508772
414 0.15046981631914774 0.9595394736842106
415 0.14073898046414057 0.9590186403508771
416 0.12974030163784822 0.9669270833333333
417 0.1539084055642287 0.9598958333333333
418 0.13424416488011678 0.9665707236842105
419 0.1323360955297947 0.9647478070175

590 0.14782039192120233 0.9647478070175438
591 0.16594257662296294 0.9609100877192983
592 0.1427078048070272 0.9699561403508772
593 0.1984866483529409 0.9497395833333333
594 0.1569402266105016 0.9686540570175438
595 0.137561039463679 0.9662143640350876
596 0.14084545129934947 0.9688185307017544
597 0.1529636874715487 0.9637061403508772
598 0.17656735631624856 0.955797697368421
599 0.15201142162879308 0.962828947368421
600 0.148793896373113 0.9641310307017543
601 0.1447433263460795 0.9641310307017543
602 0.15543180345694224 0.9629248903508771
603 0.13576263346473377 0.9717790570175439
604 0.1427620891114076 0.9678728070175439
605 0.14109166656335195 0.9668311403508771
606 0.15200356545845667 0.9702165570175438
607 0.15030983871221543 0.9663103070175438
608 0.14538488335609437 0.9686540570175438
609 0.14218594711621602 0.968297697368421
610 0.14447908845146498 0.9689144736842105
611 0.14191641664306323 0.9678728070175439
612 0.17553311175107955 0.9553728070175438
613 0.13882252570788065 

785 0.1652296475370725 0.967516447368421
786 0.1706675111413002 0.9663103070175438
787 0.2227864575544993 0.945641447368421
788 0.17947066909472148 0.9680372807017543
789 0.15490603690942129 0.9701206140350876
790 0.1660214937210083 0.9688185307017544
791 0.15966145817836125 0.9688185307017544
792 0.16622008969088395 0.9723958333333333
793 0.1752480032444 0.9673519736842106
794 0.19401158425013224 0.9599643640350877
795 0.200883497685194 0.9553728070175438
796 0.17340138792594273 0.9681332236842105
797 0.17142209731737773 0.9656935307017543
798 0.1778174268802007 0.965172697368421
799 0.17815693177382153 0.9657894736842105
800 0.20404161899288495 0.9626644736842105
801 0.16695843514998754 0.9672560307017544
802 0.17079145941138268 0.9694353070175439
803 0.17489934812386831 0.9646518640350876
804 0.2114803425083558 0.9645833333333333
805 0.2362808034102122 0.9618832236842105
806 0.16195295729438464 0.9712582236842106
807 0.15915301712354024 0.9688185307017544
808 0.16425078925291697 0.9

980 0.21776484440167745 0.9663103070175438
981 0.21004305464426676 0.9603207236842105
982 0.19035264031092325 0.9676123903508772
983 0.20813325552940368 0.9644873903508772
984 0.1960753565589587 0.960485197368421
985 0.17796099640528362 0.9683936403508772
986 0.1811627125263214 0.9656935307017543
987 0.2095604936917623 0.968297697368421
988 0.18614961694081625 0.9610060307017544
989 0.19887838115394116 0.9666666666666667
990 0.21431216468016306 0.9686540570175438
991 0.19897878243525824 0.9683936403508772
992 0.21399380248387653 0.9694353070175439
993 0.21055280084212621 0.9717790570175439
994 0.23770056153535843 0.9681332236842105
995 0.248064262231191 0.9694353070175439
996 0.2175922228574753 0.970641447368421
997 0.216632215154171 0.9704769736842105
998 0.22829337039987246 0.9629248903508771
999 0.2135890254497528 0.9696957236842105


In [57]:
import cv2