In [None]:
import numpy as np 
import pandas as pd 
from patsy import dmatrices 
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn import metrics 
import matplotlib.pyplot as plt 

1. Read Data ../input/HR_comma_sep.csv

In [None]:
# Load the HR dataset from the path given in the section heading.
data = pd.read_csv("../input/HR_comma_sep.csv")
# Only the last expression of a cell is rendered, so the column index is
# printed explicitly and the preview is left as the final expression;
# previously the head() preview was computed and silently discarded.
print(data.columns)
data.head()

In [None]:
# Store the target column `left` as plain integers.
data['left'] = data['left'].astype(int)

2. Relations (left and salary)

In [None]:
# Stacked bar chart of row counts per salary level, split by the value of `left`.
salary_vs_left = pd.crosstab(data['salary'], data['left'])
salary_vs_left.plot.bar(stacked=True)

In [None]:
# Same cross-tabulation, but each salary row is divided by its own total so
# the stacked bars show proportions instead of raw counts.
tmp = pd.crosstab(index=data['salary'], columns=data['left'])
row_totals = tmp.sum(axis='columns')
tmp.div(row_totals, axis='index').plot.bar(stacked=True)

Left vs. satisfaction level

In [None]:
# Side-by-side histograms of satisfaction_level for rows with left == 0 and
# rows with left == 1. Explicit axes plus titles/labels make each panel
# identifiable on its own (the original panels were unlabeled).
fig, (ax_stayed, ax_left) = plt.subplots(1, 2, figsize=(10, 4))

data.loc[data['left'] == 0, 'satisfaction_level'].hist(bins=50, ax=ax_stayed)
ax_stayed.set(title='left = 0', xlabel='satisfaction_level', ylabel='count')

data.loc[data['left'] == 1, 'satisfaction_level'].hist(bins=50, ax=ax_left)
ax_left.set(title='left = 1', xlabel='satisfaction_level', ylabel='count')

fig.tight_layout()

Build the design matrices with patsy's dmatrices, treating sales and salary as categorical features

In [None]:
# Patsy formula: `left` is the response; `sales` and `salary` are wrapped in
# C(...) so they are treated as categorical terms.
# The adjacent string literals concatenate into the exact original formula.
formula = (
    'left ~ satisfaction_level + last_evaluation + number_project'
    ' + average_montly_hours + time_spend_company + Work_accident'
    ' + promotion_last_5years + C(sales) + C(salary)'
)
y, X = dmatrices(formula, data, return_type='dataframe')

In [None]:
# Replace patsy's generated dummy-column names with readable labels.
# Fix: the 'RandD' level is the R&D department; it was previously mislabeled
# as 'Random'.
X = X.rename(columns = {
    'C(sales)[T.RandD]': 'Department: R&D',
    'C(sales)[T.accounting]': 'Department: Accounting',
    'C(sales)[T.hr]': 'Department: HR',
    'C(sales)[T.management]': 'Department: Management',
    'C(sales)[T.marketing]': 'Department: Marketing',
    'C(sales)[T.product_mng]': 'Department: Product_Management',
    'C(sales)[T.sales]': 'Department: Sales',
    'C(sales)[T.support]': 'Department: Support',
    'C(sales)[T.technical]': 'Department: Technical',
    'C(salary)[T.low]': 'Salary: Low',
    'C(salary)[T.medium]': 'Salary: Medium'})
y = np.ravel(y) # flatten y into a one-dimensional numpy array

Fit a logistic regression model on the full dataset

In [None]:
# Fit a logistic regression on the full design matrix.
model = LogisticRegression()
model.fit(X, y)
# Pair each feature name with its coefficient. coef_ is 2-D (one row per
# fitted class boundary; a single row is assumed here, as in the original
# pairing with X.columns), so it is flattened to give one scalar per feature
# instead of a 1-element array in every cell, and the columns are named.
pd.DataFrame({'feature': X.columns, 'coefficient': model.coef_.ravel()})

Score (accuracy) of the model on the data it was trained on

In [None]:
# Score the fitted model on the same data it was trained on, then show the
# raw coefficient array.
train_accuracy = model.score(X, y)
print(train_accuracy)
model.coef_

Predict whether a given employee will leave (e.g. a high-salary HR employee with satisfaction level 0.5, last evaluation 0.7, 4 projects, 160 hours/month, 3 years at the company, no promotion in the last 5 years, and no work accident)

In [None]:
# Describe the example employee by feature name rather than by position, so
# the row does not silently depend on the column order patsy produced and
# keeps the feature names the model was fitted with.
# High salary and every other department are the all-zero dummy case, so only
# the non-zero entries (plus the explicit zeros from the description) appear.
employee_profile = {
    'Intercept': 1.0,
    'Department: HR': 1.0,
    'satisfaction_level': 0.5,
    'last_evaluation': 0.7,
    'number_project': 4.0,
    'average_montly_hours': 160.0,
    'time_spend_company': 3.0,
    'Work_accident': 0.0,
    'promotion_last_5years': 0.0,
}
# Fail loudly on a misspelled feature instead of dropping it during reindex.
unknown_features = set(employee_profile) - set(X.columns)
if unknown_features:
    raise KeyError(f"features not present in X: {sorted(unknown_features)}")
# Align to the training columns; anything not listed above is 0.
employee = pd.DataFrame([employee_profile]).reindex(columns=X.columns, fill_value=0.0)
model.predict_proba(employee)

In [None]:
# Fraction of rows where the predicted label differs from y, computed on the
# same data the model was fitted on.
# The earlier bare predict_proba(X) call was removed: its result was neither
# displayed nor stored.
pred = model.predict(X)
np.mean(pred != y)

Split dataset into training and testing data

In [None]:
# Hold out 30% of the rows for testing; random_state is fixed so the split
# is the same on every run.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Fit a second model (C=10000) on the training split only, then measure its
# accuracy on the held-out test split.
model2 = LogisticRegression(C=10000).fit(Xtrain, ytrain)
pred = model2.predict(Xtest)
metrics.accuracy_score(y_true=ytest, y_pred=pred)


Get the confusion matrix

In [None]:
# Confusion matrix of the held-out labels against the current predictions.
metrics.confusion_matrix(y_true=ytest, y_pred=pred)

In [None]:
# Text classification report for the held-out predictions.
report = metrics.classification_report(ytest, pred)
print(report)


10-fold cross-validation

In [None]:
# Accuracy of a fresh logistic regression on each of the 10 folds.
cv_scores = cross_val_score(
    estimator=LogisticRegression(),
    X=X,
    y=y,
    scoring='accuracy',
    cv=10,
)
print(cv_scores)

In [None]:
# Mean accuracy across the 10 cross-validation folds.
fold_accuracies = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=10)
fold_accuracies.mean()