In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Read the HR employee attrition CSV into a DataFrame and preview the first rows.
DATA_PATH = '../input/WA_Fn-UseC_-HR-Employee-Attrition.csv'
attrition = pd.read_csv(filepath_or_buffer=DATA_PATH)
attrition.head(5)

In [None]:
# Show the dtype pandas assigned to each column of the loaded frame.
attrition.dtypes

In [None]:
# Columns included in the pairwise correlation matrix plotted below.
numerical = ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeNumber', 'EnvironmentSatisfaction',
       'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager']

# plt.subplots already creates the figure together with its axes; keep that
# axes instead of calling plt.axes() afterwards, which can add a second,
# overlapping axes to the same figure.
fig, ax = plt.subplots(figsize=(20, 15))
ax.set_title("Employee Data Heatmap")

# Pairwise correlation (DataFrame.corr default method) of the selected columns.
corr = attrition[numerical].corr()

# Draw explicitly on the titled axes rather than relying on the "current axes".
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            ax=ax)

This correlation heatmap shows that most variables in the dataset are only weakly correlated with one another. Total working years is one of the few variables with a relatively high correlation with other variables, namely job level, monthly income, and the various variables showing years at the company.

Let's look at total working years against a few possible predictors of turnover (income, job satisfaction, performance rating, gender) to see if there's any possible correlation.

In [None]:
# Object-typed copies of the two rating columns, used below as the
# size and shape aesthetics.
attrition['Perf_Rating'] = attrition['PerformanceRating'].astype(object)
attrition['Job_Sat'] = attrition['JobSatisfaction'].astype(object)

from ggplot import *

# Monthly income against total working years; point size, shape and color are
# mapped to performance rating, job satisfaction and gender respectively.
income_by_experience = aes(
    x='TotalWorkingYears',
    y='MonthlyIncome',
    size='Perf_Rating',
    shape='Job_Sat',
    color='Gender',
)
p = ggplot(income_by_experience, data=attrition)

# Points plus a linear smoother (no confidence band), faceted by
# Attrition and Department.
(
    p
    + geom_point()
    + stat_smooth(size=1, method='lm', se=False)
    + facet_grid('Attrition', 'Department')
)

In [None]:
from ggplot import *

# Monthly income against total working years, with a red smoother,
# wrapped into panels by Gender and Attrition.
income_by_experience = aes(x='TotalWorkingYears', y='MonthlyIncome')
p = ggplot(income_by_experience, data=attrition)
(
    p
    + geom_point()
    + stat_smooth(color="red")
    + facet_wrap("Gender", "Attrition")
)

In [None]:
# Split the data into training and test sets.

# x = predictors (every column except the target), y = response
x = attrition.drop(['Attrition'], axis=1)
y = attrition['Attrition']

# sklearn.cross_validation was removed in scikit-learn 0.20. Keep using it
# where it still exists (same behaviour as before) and fall back to its
# replacement, sklearn.model_selection, on newer installs.
# NOTE(review): StratifiedShuffleSplit has a different constructor signature in
# the two modules -- check any later use of it against the installed version.
try:
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import StratifiedShuffleSplit
except ImportError:
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import StratifiedShuffleSplit

# train_test_split returns (x_train, x_test, y_train, y_test) in that order.
# The second target was previously named y_test, which left x_test undefined.
# 30% of the rows are held out, stratified on the response.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0, stratify=y)