### 预测员工的离职率

在给定的样本数据中，列出了若干可能与员工离职相关的特征。我们希望基于这些特征，准确预测每一位员工是否会离职。

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as matplot
%matplotlib inline
from sklearn.model_selection import train_test_split

In [3]:
# Load the HR dataset into a pandas DataFrame.
# No CSV column is used as the index (the read_csv default), so rows get a 0..n-1 index.
df = pd.read_csv("./data/HR_comma_sep.csv")

In [4]:
# Report, per column, whether it contains any missing value
# (True means at least one missing entry in that column).
print(df.isna().any(), "\n\n")

satisfaction_level       False
last_evaluation          False
number_project           False
average_montly_hours     False
time_spend_company       False
Work_accident            False
left                     False
promotion_last_5years    False
sales                    False
salary                   False
dtype: bool 




In [5]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [6]:
# Rename the columns to shorter names.
# NOTE: the target names 'averageMontlyHours' and 'yearsAtCompony' are misspelled
# but are kept as-is so that the resulting column names do not change.
df = df.rename(columns=dict(
    satisfaction_level='satisfaction',
    last_evaluation='evaluation',
    number_project='projectCount',
    average_montly_hours='averageMontlyHours',
    time_spend_company='yearsAtCompony',
    Work_accident='workAccident',
    promotion_last_5years='promotion',
    sales='department',
    left='turnover',
))

In [7]:
# Move the prediction label (`turnover`) to the first column:
# pop() removes the column from df in place and returns it, insert() puts it back at position 0.
front = df.pop('turnover')
df.insert(0, 'turnover', front)
df.head()

Unnamed: 0,turnover,satisfaction,evaluation,projectCount,averageMontlyHours,yearsAtCompony,workAccident,promotion,department,salary
0,1,0.38,0.53,2,157,3,0,0,sales,low
1,1,0.8,0.86,5,262,6,0,0,sales,medium
2,1,0.11,0.88,7,272,4,0,0,sales,medium
3,1,0.72,0.87,5,223,5,0,0,sales,low
4,1,0.37,0.52,2,159,3,0,0,sales,low


In [13]:
# Fraction of rows in each turnover class; the entry labelled 1 is the share of employees who left.
turnover_ration = df["turnover"].value_counts().div(len(df))
print("样本数据中,离职率为: %.2f" % turnover_ration.loc[1])

样本数据中,离职率为: 0.24


In [14]:
# Show summary statistics (count, mean, std, min, quartiles, max) for the DataFrame
df.describe()

Unnamed: 0,turnover,satisfaction,evaluation,projectCount,averageMontlyHours,yearsAtCompony,workAccident,promotion
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.238083,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.021268
std,0.425924,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.144281
min,0.0,0.09,0.36,2.0,96.0,2.0,0.0,0.0
25%,0.0,0.44,0.56,3.0,156.0,3.0,0.0,0.0
50%,0.0,0.64,0.72,4.0,200.0,3.0,0.0,0.0
75%,0.0,0.82,0.87,5.0,245.0,4.0,0.0,0.0
max,1.0,1.0,1.0,7.0,310.0,10.0,1.0,1.0


In [15]:
# Convert the string columns to integer category codes.
# NOTE: pandas assigns codes in sorted category order, so the numbers carry no
# semantic ordering (e.g. salary is not encoded as low < medium < high).
for _col in ("department", "salary"):
    df[_col] = df[_col].astype('category').cat.codes

In [16]:
# Preview the first five rows after encoding the string columns
df.head(n=5)

Unnamed: 0,turnover,satisfaction,evaluation,projectCount,averageMontlyHours,yearsAtCompony,workAccident,promotion,department,salary
0,1,0.38,0.53,2,157,3,0,0,7,1
1,1,0.8,0.86,5,262,6,0,0,7,2
2,1,0.11,0.88,7,272,4,0,0,7,2
3,1,0.72,0.87,5,223,5,0,0,7,1
4,1,0.37,0.52,2,159,3,0,0,7,1


In [17]:
# 设置特征标签
target_name = 'turnover'
X = df.drop('turnover', axis=1)
y = df[target_name]

In [18]:
# Split the data into a training set and a test set (15% of the rows are held out for testing).
# stratify=y keeps the proportion of employees who left the same in both splits
# as in the full dataset; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=99, stratify=y
)
# Backward-compatible alias: this cell originally misspelled `y_test` as `y_text`,
# so the old name is kept for any code that still refers to it.
y_text = y_test

训练决策树模型，具体的参数可以参考官方文档，并输出最后的结果。结果里包含了精确率、召回率、F1 等指标。

In [19]:
# 导入使用模型
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

训练随机森林模型，具体的参数可以参考官方文档，并输出最后的结果。从结果上看，随机森林比决策树要好一些（离职类别的 F1 为 0.93，决策树为 0.91），但不妨多调一调参数试试吧！

In [23]:
# Train a decision tree and report precision / recall / F1 on the test set.
dtree = tree.DecisionTreeClassifier(
    criterion='gini',
    # max_depth is left at its default (unlimited); setting e.g. max_depth=3 is
    # another way to limit overfitting.
    # Each leaf must hold at least 1% of the total sample weight, to limit overfitting.
    min_weight_fraction_leaf=0.01,
    # Fixed seed so that repeated runs give the same tree and the same report.
    random_state=99,
)
dtree.fit(X_train, y_train)  # fit() trains the estimator in place
print("\n\n ---决策树---")
print(classification_report(y_text, dtree.predict(X_test)))



 ---决策树---
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1714
           1       0.93      0.89      0.91       536

    accuracy                           0.96      2250
   macro avg       0.95      0.93      0.94      2250
weighted avg       0.96      0.96      0.96      2250



In [25]:
# Train a random forest and report precision / recall / F1 on the test set.
rf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=1000,
    max_depth=None,  # no explicit depth limit; growth is bounded by the two settings below
    min_samples_split=10,  # a node needs at least 10 samples before it may be split further
    # Each leaf must hold at least 2% of the total sample weight, to limit overfitting.
    min_weight_fraction_leaf=0.02,
    # Fixed seed so that bootstrapping and feature sampling are reproducible across runs.
    random_state=99,
)
rf.fit(X_train, y_train)
print("\n\n---随机森林---")
print(classification_report(y_text, rf.predict(X_test)))



---随机森林---
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1714
           1       0.97      0.90      0.93       536

    accuracy                           0.97      2250
   macro avg       0.97      0.95      0.96      2250
weighted avg       0.97      0.97      0.97      2250

