### Introduction to Pandas

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [5]:
# Load the employee-attrition dataset from the project's relative data folder.
attrition = pd.read_csv(filepath_or_buffer='data/employee_attrition.csv')

In [7]:
# Preview the first five rows (the default for head) to sanity-check the load.
attrition.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


- Exploring data
- Working with missing data
- Exploring categories
- Exploring quantitative features
- Grouping with `groupby`

In [8]:
# Print a concise summary of the frame: index range, column names,
# non-null counts and dtypes.
attrition.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
Age                         1470 non-null int64
Attrition                   1470 non-null object
BusinessTravel              1470 non-null object
DailyRate                   1470 non-null int64
Department                  1470 non-null object
DistanceFromHome            1470 non-null int64
Education                   1470 non-null int64
EducationField              1470 non-null object
EmployeeCount               1470 non-null int64
EmployeeNumber              1470 non-null int64
EnvironmentSatisfaction     1470 non-null int64
Gender                      1470 non-null object
HourlyRate                  1470 non-null int64
JobInvolvement              1470 non-null int64
JobLevel                    1470 non-null int64
JobRole                     1470 non-null object
JobSatisfaction             1470 non-null int64
MaritalStatus               1470 non-null object
MonthlyIncome         

In [13]:
# Frequency of each education field, most common first.
attrition.loc[:, 'EducationField'].value_counts()

Life Sciences       606
Medical             464
Marketing           159
Technical Degree    132
Other                82
Human Resources      27
Name: EducationField, dtype: int64

In [14]:
# Distinct education fields, in order of first appearance.
attrition.loc[:, 'EducationField'].unique()

array(['Life Sciences', 'Other', 'Medical', 'Marketing',
       'Technical Degree', 'Human Resources'], dtype=object)

In [17]:
# Select two columns by name (a list of labels yields a DataFrame) and preview them.
attrition.loc[:, ['WorkLifeBalance', 'EducationField']].head(n=5)

Unnamed: 0,WorkLifeBalance,EducationField
0,1,Life Sciences
1,3,Life Sciences
2,3,Other
3,3,Life Sciences
4,3,Medical


In [30]:
# First ten rows of two columns. `.loc` slices by *label* and includes the
# end label, so `:9` (not `:10`) selects rows 0 through 9 on the default
# RangeIndex.
sub_df = attrition.loc[:9, ['Attrition', 'BusinessTravel']]

In [32]:
# Display the Attrition / BusinessTravel subset built in the previous cell.
sub_df

Unnamed: 0,Attrition,BusinessTravel
0,Yes,Travel_Rarely
1,No,Travel_Frequently
2,Yes,Travel_Rarely
3,No,Travel_Frequently
4,No,Travel_Rarely
5,No,Travel_Frequently
6,No,Travel_Rarely
7,No,Travel_Rarely
8,No,Travel_Frequently
9,No,Travel_Rarely


In [28]:
# Peek at the first two rows using positional slicing.
attrition.iloc[:2]

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7


In [29]:
# Positional selection: the first ten rows and the even-numbered column
# positions among the first ten columns (0, 2, 4, 6, 8).
attrition.iloc[:10, list(range(0, 10, 2))]

Unnamed: 0,Age,BusinessTravel,Department,Education,EmployeeCount
0,41,Travel_Rarely,Sales,2,1
1,49,Travel_Frequently,Research & Development,1,1
2,37,Travel_Rarely,Research & Development,2,1
3,33,Travel_Frequently,Research & Development,4,1
4,27,Travel_Rarely,Research & Development,1,1
5,32,Travel_Frequently,Research & Development,2,1
6,59,Travel_Rarely,Research & Development,3,1
7,30,Travel_Rarely,Research & Development,1,1
8,38,Travel_Frequently,Research & Development,3,1
9,36,Travel_Rarely,Research & Development,3,1


In [33]:
# Head-count per department, largest first.
attrition.loc[:, 'Department'].value_counts()

Research & Development    961
Sales                     446
Human Resources            63
Name: Department, dtype: int64

In [35]:
# Boolean-mask filter: keep only Research & Development employees whose
# education field is Medical. Each comparison is parenthesised because `&`
# binds more tightly than `==`.
attrition[
    (attrition['EducationField'] == 'Medical')
    & (attrition['Department'] == 'Research & Development')
]

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
6,59,No,Travel_Rarely,1324,Research & Development,3,3,Medical,1,10,...,1,80,3,12,3,2,1,0,0,0
9,36,No,Travel_Rarely,1299,Research & Development,27,3,Medical,1,13,...,2,80,2,17,3,2,7,7,7,7
10,35,No,Travel_Rarely,809,Research & Development,16,3,Medical,1,14,...,3,80,1,6,5,3,5,4,0,3
13,34,No,Travel_Rarely,1346,Research & Development,19,2,Medical,1,18,...,3,80,1,3,2,3,2,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1460,29,No,Travel_Rarely,468,Research & Development,28,4,Medical,1,2054,...,2,80,0,5,3,1,5,4,0,4
1463,31,No,Non-Travel,325,Research & Development,5,3,Medical,1,2057,...,2,80,0,10,2,3,9,4,1,7
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7


In [37]:
# Lazily group the rows by education field; nothing is computed until an
# aggregation is requested.
groups = attrition.groupby(by='EducationField')

In [46]:
# Per-group mean of every numeric column. `numeric_only=True` is required on
# pandas >= 2.0, where the default no longer skips the object (string)
# columns and averaging them raises TypeError.
groups.mean(numeric_only=True)

Unnamed: 0_level_0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
EducationField,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Human Resources,37.037037,675.259259,9.037037,3.111111,1.0,1180.37037,2.666667,60.888889,2.740741,2.148148,...,3.037037,80.0,0.925926,11.592593,2.444444,2.740741,6.777778,3.555556,1.62963,3.37037
Life Sciences,37.107261,804.425743,8.955446,2.929043,1.0,1024.427393,2.689769,66.831683,2.732673,2.052805,...,2.686469,80.0,0.775578,11.245875,2.739274,2.727723,6.993399,4.308581,2.178218,4.138614
Marketing,37.924528,727.836478,10.106918,3.125786,1.0,999.830189,2.72327,66.150943,2.691824,2.358491,...,2.691824,80.0,0.849057,11.855346,2.691824,2.798742,7.559748,4.490566,2.245283,4.465409
Medical,36.838362,822.799569,9.353448,2.803879,1.0,1017.165948,2.6875,65.280172,2.747845,2.040948,...,2.760776,80.0,0.836207,11.564655,2.93319,2.762931,7.105603,4.088362,2.295259,4.0625
Other,35.365854,796.02439,8.926829,3.073171,1.0,1050.695122,3.012195,62.365854,2.695122,1.987805,...,2.621951,80.0,0.646341,10.353659,2.756098,2.853659,6.243902,3.97561,1.658537,3.95122
Technical Degree,36.121212,842.128788,8.810606,2.825758,1.0,1036.242424,2.818182,66.621212,2.719697,1.871212,...,2.674242,80.0,0.727273,10.25,2.833333,2.810606,6.590909,4.340909,2.227273,4.113636


In [49]:
# Group by three categorical keys at once; the result is indexed by a
# three-level MultiIndex (EducationField, Department, BusinessTravel).
groups = attrition.groupby(
    by=['EducationField', 'Department', 'BusinessTravel'],
)

In [52]:
# Mean and sum of each numeric column per group. The aggregation is limited
# to numeric columns because, on pandas >= 2.0, applying 'mean' to the
# remaining object (string) columns raises TypeError instead of silently
# dropping them.
groups[attrition.select_dtypes(include='number').columns.tolist()].agg(['mean', 'sum'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Age,Age,DailyRate,DailyRate,DistanceFromHome,DistanceFromHome,Education,Education,EmployeeCount,EmployeeCount,...,WorkLifeBalance,WorkLifeBalance,YearsAtCompany,YearsAtCompany,YearsInCurrentRole,YearsInCurrentRole,YearsSinceLastPromotion,YearsSinceLastPromotion,YearsWithCurrManager,YearsWithCurrManager
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum,...,mean,sum,mean,sum,mean,sum,mean,sum,mean,sum
EducationField,Department,BusinessTravel,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
Human Resources,Human Resources,Non-Travel,48.0,192,862.5,3450,8.5,34,3.25,13,1,4,...,3.0,12,5.5,22,4.25,17,2.5,10,4.0,16
Human Resources,Human Resources,Travel_Frequently,34.833333,209,938.0,5628,11.0,66,3.166667,19,1,6,...,2.666667,16,5.666667,34,2.5,15,0.333333,2,2.666667,16
Human Resources,Human Resources,Travel_Rarely,35.235294,599,538.470588,9154,8.470588,144,3.058824,52,1,17,...,2.705882,46,7.470588,127,3.764706,64,1.882353,32,3.470588,59
Life Sciences,Human Resources,Non-Travel,28.0,28,280.0,280,1.0,1,2.0,2,1,1,...,3.0,3,3.0,3,2.0,2,2.0,2,2.0,2
Life Sciences,Human Resources,Travel_Frequently,30.0,60,537.0,1074,14.0,28,3.5,7,1,2,...,3.0,6,8.5,17,8.0,16,1.0,2,4.0,8
Life Sciences,Human Resources,Travel_Rarely,41.692308,542,822.692308,10695,8.461538,110,3.076923,40,1,13,...,3.307692,43,7.846154,102,3.769231,49,1.769231,23,4.846154,63
Life Sciences,Research & Development,Non-Travel,37.55814,1615,776.488372,33389,7.953488,342,2.883721,124,1,43,...,2.744186,118,7.186047,309,4.395349,189,1.837209,79,4.046512,174
Life Sciences,Research & Development,Travel_Frequently,35.967033,3273,797.142857,72540,8.406593,765,2.846154,259,1,91,...,2.604396,237,7.164835,652,4.241758,386,2.483516,226,4.340659,395
Life Sciences,Research & Development,Travel_Rarely,37.22549,11391,788.617647,241317,9.127451,2793,2.934641,898,1,306,...,2.699346,826,6.620915,2026,4.120915,1261,2.104575,644,3.986928,1220
Life Sciences,Sales,Non-Travel,38.578947,733,865.105263,16437,10.736842,204,2.947368,56,1,19,...,2.842105,54,8.947368,170,5.263158,100,2.789474,53,4.789474,91


In [54]:
# Summary statistics for every column: count/unique/top/freq for the
# categorical columns and mean/std/quantiles for the numeric ones.
attrition.describe(include='all')

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470,1470,1470.0,1470,1470.0,1470.0,1470,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
unique,,2,3,,3,,,6,,,...,,,,,,,,,,
top,,No,Travel_Rarely,,Research & Development,,,Life Sciences,,,...,,,,,,,,,,
freq,,1233,1043,,961,,,606,,,...,,,,,,,,,,
mean,36.92381,,,802.485714,,9.192517,2.912925,,1.0,1024.865306,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,,,403.5091,,8.106864,1.024165,,0.0,602.024335,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,,,102.0,,1.0,1.0,,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,,,465.0,,2.0,2.0,,1.0,491.25,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,,,802.0,,7.0,3.0,,1.0,1020.5,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,,,1157.0,,14.0,4.0,,1.0,1555.75,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
