# Series and DataFrames

In [1]:
%autosave 5

Autosaving every 5 seconds


In [2]:
# Import libraries
import pandas as pd
import numpy as np

## Series

In [3]:
# Define lists
temperature = [33, 19, 15, 89, 11, -5, 9]
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Create series
pd.Series(data=temperature, index=days)

Mon    33
Tue    19
Wed    15
Thu    89
Fri    11
Sat    -5
Sun     9
dtype: int64

In [4]:
# Define dictionary
my_dict = {
    'Mon': 33,
    'Tue': 19,
    'Wed': 15,
    'Thu': 89,
    'Fri': 11,
    'Sat': -5,
    'Sun': 9
}

# Create series
pd.Series(data=my_dict)

Mon    33
Tue    19
Wed    15
Thu    89
Fri    11
Sat    -5
Sun     9
dtype: int64

In [5]:
# Define array
my_array = np.linspace(0, 10, 15)

# Create series
pd.Series(data=my_array)

0      0.000000
1      0.714286
2      1.428571
3      2.142857
4      2.857143
5      3.571429
6      4.285714
7      5.000000
8      5.714286
9      6.428571
10     7.142857
11     7.857143
12     8.571429
13     9.285714
14    10.000000
dtype: float64

In [7]:
# TASK --- Using this series...
my_series = pd.Series(np.arange(0, 20, 2))
print(my_series)
my_series+1
# -------- Add 1 to each value


0     0
1     2
2     4
3     6
4     8
5    10
6    12
7    14
8    16
9    18
dtype: int32


0     1
1     3
2     5
3     7
4     9
5    11
6    13
7    15
8    17
9    19
dtype: int32

In [8]:
# TASK --- Using this series...
my_series = pd.Series(np.arange(0, 20, 2))
my_series*2
# -------- Multiply each value by 2


0     0
1     4
2     8
3    12
4    16
5    20
6    24
7    28
8    32
9    36
dtype: int32

In [10]:
# TASK --- Using this series...
my_series = pd.Series(np.arange(0, 20, 2))
np.exp(my_series)
# -------- Calculate the exponential of each value


0    1.000000e+00
1    7.389056e+00
2    5.459815e+01
3    4.034288e+02
4    2.980958e+03
5    2.202647e+04
6    1.627548e+05
7    1.202604e+06
8    8.886111e+06
9    6.565997e+07
dtype: float64

## DataFrames

In [12]:
# Read csv file as DataFrame
data = pd.read_csv("HR-Employee-Attrition.csv", index_col='EmployeeNumber')

In [13]:
# Access index of DataFrame
data.index

Int64Index([   1,    2,    4,    5,    7,    8,   10,   11,   12,   13,
            ...
            2054, 2055, 2056, 2057, 2060, 2061, 2062, 2064, 2065, 2068],
           dtype='int64', name='EmployeeNumber', length=1470)

In [14]:
# Access columns of DataFrame
data.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [15]:
# Access values of DataFrame
data.values

array([[41, 'Yes', 'Travel_Rarely', ..., 4, 0, 5],
       [49, 'No', 'Travel_Frequently', ..., 7, 1, 7],
       [37, 'Yes', 'Travel_Rarely', ..., 0, 0, 0],
       ...,
       [27, 'No', 'Travel_Rarely', ..., 2, 0, 3],
       [49, 'No', 'Travel_Frequently', ..., 6, 0, 8],
       [34, 'No', 'Travel_Rarely', ..., 3, 1, 2]], dtype=object)

# Operations and manipulations
## Inspection of data

In [16]:
# Access first five rows of DataFrame
data.head()

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
EmployeeNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,2,...,1,80,0,8,0,1,6,4,0,5
2,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,3,...,4,80,1,10,3,3,10,7,1,7
4,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
5,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,4,...,3,80,0,8,3,3,8,7,3,0
7,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,1,...,4,80,1,6,3,3,2,2,2,2


In [17]:
# Access last five rows of DataFrame
data.tail()

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
EmployeeNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2061,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,3,...,3,80,1,17,3,3,5,2,0,3
2062,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,4,...,1,80,1,9,5,3,7,7,1,7
2064,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2,...,2,80,1,6,0,3,6,2,0,3
2065,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,4,...,4,80,0,17,3,2,9,6,0,8
2068,34,No,Travel_Rarely,628,Research & Development,8,3,Medical,1,2,...,1,80,0,6,3,4,4,3,1,2


## Selection, addition, and deletion of data

In [20]:
# Selecting one column
data['Age'].tail()

EmployeeNumber
2061    36
2062    39
2064    27
2065    49
2068    34
Name: Age, dtype: int64

In [26]:
# Selecting multiple columns
data[['Age', 'EmployeeCount', 'YearsAtCompany']].head()

Unnamed: 0_level_0,Age,EmployeeCount,YearsAtCompany
EmployeeNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,41,1,6
2,49,1,10
4,37,1,0
5,33,1,8
7,27,1,2


In [24]:
# Add column to DataFrame
data['AgeInMonths'] = data['Age'] * 12
data['AgeInMonths'].head()

EmployeeNumber
1    492
2    588
4    444
5    396
7    324
Name: AgeInMonths, dtype: int64

In [36]:
# Drop column from DataFrame
data.drop('AgeInMonths', axis=1, inplace=True)

KeyError: "['AgeInMonths'] not found in axis"

In [31]:
# TASK --- Drop `EmployeeCount`
data.drop('EmployeeCount', axis = 1, inplace = False)

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
EmployeeNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,1,80,0,8,0,1,6,4,0,5
2,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,80,1,10,3,3,10,7,1,7
4,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,2,80,0,7,3,3,0,0,0,0
5,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,3,80,0,8,3,3,8,7,3,0
7,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2061,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,3,Male,...,3,80,1,17,3,3,5,2,0,3
2062,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,4,Male,...,1,80,1,9,5,3,7,7,1,7
2064,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2,Male,...,2,80,1,6,0,3,6,2,0,3
2065,49,No,Travel_Frequently,1023,Sales,2,3,Medical,4,Male,...,4,80,0,17,3,2,9,6,0,8


## Slicing DataFrames

In [37]:
# Slicing a series
data['BusinessTravel'][10:15]

EmployeeNumber
14    Travel_Rarely
15    Travel_Rarely
16    Travel_Rarely
18    Travel_Rarely
19    Travel_Rarely
Name: BusinessTravel, dtype: object

In [38]:
# Slicing a DataFrame
data[10:15]

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
EmployeeNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14,35,No,Travel_Rarely,809,Research & Development,16,3,Medical,1,1,...,3,80,1,6,5,3,5,4,0,3
15,29,No,Travel_Rarely,153,Research & Development,15,2,Life Sciences,1,4,...,4,80,0,10,3,3,9,5,0,8
16,31,No,Travel_Rarely,670,Research & Development,26,1,Life Sciences,1,1,...,4,80,1,5,1,2,5,2,4,3
18,34,No,Travel_Rarely,1346,Research & Development,19,2,Medical,1,2,...,3,80,1,3,2,3,2,2,1,2
19,28,Yes,Travel_Rarely,103,Research & Development,24,3,Life Sciences,1,3,...,2,80,0,6,4,3,4,2,0,3


In [46]:
# TASK --- Access the 5th to 8th rows of `Department` and `EducationField`
data[['Department' , 'EducationField']][5:8]


Unnamed: 0_level_0,Department,EducationField
EmployeeNumber,Unnamed: 1_level_1,Unnamed: 2_level_1
8,Research & Development,Life Sciences
10,Research & Development,Medical
11,Research & Development,Life Sciences


In [47]:
# Slicing using `loc`
data.loc[[15, 94, 337, 1120]]

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
EmployeeNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15,29,No,Travel_Rarely,153,Research & Development,15,2,Life Sciences,1,4,...,4,80,0,10,3,3,9,5,0,8
94,29,No,Travel_Rarely,1328,Research & Development,2,3,Life Sciences,1,3,...,4,80,1,6,3,3,5,4,0,4
337,31,No,Travel_Frequently,1327,Research & Development,3,4,Medical,1,2,...,1,80,1,9,3,3,2,2,2,2
1120,29,No,Travel_Rarely,1107,Research & Development,28,4,Life Sciences,1,3,...,1,80,1,11,1,3,7,5,1,7


In [48]:
# Slicing using `iloc`
data.iloc[0:5]

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
EmployeeNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,2,...,1,80,0,8,0,1,6,4,0,5
2,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,3,...,4,80,1,10,3,3,10,7,1,7
4,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
5,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,4,...,3,80,0,8,3,3,8,7,3,0
7,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,1,...,4,80,1,6,3,3,2,2,2,2


In [104]:
data['PerformanceRating'].head()

EmployeeNumber
1    NaN
2    NaN
4    NaN
5    NaN
7    NaN
Name: PerformanceRating, dtype: object

# Answering questions

In [51]:
# Total employees by department
data['Department'].value_counts(normalize=True)

Research & Development    0.653741
Sales                     0.303401
Human Resources           0.042857
Name: Department, dtype: float64

In [50]:
# Overall attrition rate
normalized_count = data['Attrition'].value_counts(normalize=True)
normalized_count

No     0.838776
Yes    0.161224
Name: Attrition, dtype: float64

In [52]:
normalized_count['Yes']

0.16122448979591836

In [53]:
# Average hourly rate
data['HourlyRate'].mean()

65.89115646258503

In [69]:
# Average number of years
data['YearsAtCompany'].describe()


count    1470.000000
mean        7.008163
std         6.126525
min         0.000000
25%         3.000000
50%         5.000000
75%         9.000000
max        40.000000
Name: YearsAtCompany, dtype: float64

In [70]:
# Employees with the most number of years
data['YearsAtCompany'].sort_values(ascending=True)[:5]

EmployeeNumber
1839    0
1935    0
618     0
1504    0
614     0
Name: YearsAtCompany, dtype: int64

In [59]:
# Overall employee satisfaction
job_satisfaction_dict = {
    1: 'Low',
    2: 'Medium',
    3: 'High',
    4: 'Very High'
}

In [71]:
data['JobSatisfaction'] = data['JobSatisfaction'].map(job_satisfaction_dict)
data['JobSatisfaction'].head()

EmployeeNumber
1    Very High
2       Medium
4         High
5         High
7       Medium
Name: JobSatisfaction, dtype: object

In [72]:
data['JobSatisfaction'].value_counts(normalize=True)

Very High    0.312245
High         0.300680
Low          0.196599
Medium       0.190476
Name: JobSatisfaction, dtype: float64

In [74]:
# TASK --- Employees per education field
data['EducationField'].value_counts()

Life Sciences       606
Medical             464
Marketing           159
Technical Degree    132
Other                82
Human Resources      27
Name: EducationField, dtype: int64

In [106]:
# TASK ---  Employees per performance rating
job_performance_dict = {
    1: 'Lo',
    2: 'Medum',
    3: 'Hig',
    4: 'Ver High'
}


In [107]:

data['PerformanceRating'] = data['PerformanceRating'].map(job_performance_dict)
data['PerformanceRating'].head()

EmployeeNumber
1    NaN
2    NaN
4    NaN
5    NaN
7    NaN
Name: PerformanceRating, dtype: object

# Answering complex questions

In [92]:
# Employees with Low Job Satisfaction
data['JobSatisfaction'] == 'Low'

EmployeeNumber
1       False
2       False
4       False
5       False
7       False
        ...  
2061    False
2062     True
2064    False
2065    False
2068    False
Name: JobSatisfaction, Length: 1470, dtype: bool

In [93]:
data.loc[data['JobSatisfaction'] == 'Low'].index

Int64Index([  10,   20,   27,   31,   33,   38,   51,   52,   54,   68,
            ...
            1975, 1980, 1998, 2021, 2023, 2038, 2054, 2055, 2057, 2062],
           dtype='int64', name='EmployeeNumber', length=289)

In [108]:
# Employees with both Low Job Satisfaction and Job Involvement
job_involvement_dict = {
    1: 'Low',
    2: 'Medium',
    3: 'High',
    4: 'Very High'
}

data['JobInvolvement'] = data['JobInvolvement'].map(job_involvement_dict)

In [109]:
data.loc[(data['JobSatisfaction'] == 'Low') & (data['JobInvolvement'] == 'Low')].index

Int64Index([33, 235, 454, 615, 1019, 1037, 1237, 1460, 1478, 1544, 1611, 1622,
            1905, 1956],
           dtype='int64', name='EmployeeNumber')

In [110]:
# Employee comparison
## Create new DataFrame with observations of interest
subset = data.loc[(data['JobSatisfaction'] == 'Low') | (data['JobSatisfaction'] == 'Very High')]
print('Shape: ', subset.shape)
print('\nJob Satisfaction Count')
print(subset['JobSatisfaction'].value_counts())

Shape:  (748, 34)

Job Satisfaction Count
Very High    459
Low          289
Name: JobSatisfaction, dtype: int64


In [111]:
## Split DataFrame by 'JobSatisfaction'
grouped = subset.groupby('JobSatisfaction')

## View groups in GroupBy object
grouped.groups

{'Low': Int64Index([  10,   20,   27,   31,   33,   38,   51,   52,   54,   68,
             ...
             1975, 1980, 1998, 2021, 2023, 2038, 2054, 2055, 2057, 2062],
            dtype='int64', name='EmployeeNumber', length=289),
 'Very High': Int64Index([   1,    8,   18,   22,   23,   24,   30,   36,   39,   40,
             ...
             2022, 2024, 2027, 2036, 2040, 2041, 2045, 2052, 2056, 2061],
            dtype='int64', name='EmployeeNumber', length=459)}

In [112]:
## View details of Low group
grouped.get_group('Low').head()

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
EmployeeNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,59,No,Travel_Rarely,1324,Research & Development,3,3,,1,3,...,1,80,3,12,3,,1,0,0,0
20,29,No,Travel_Rarely,1389,Research & Development,21,4,,1,2,...,3,80,1,10,1,,10,9,8,8
27,36,Yes,Travel_Rarely,1218,Sales,9,4,,1,3,...,2,80,0,10,4,,5,3,0,3
31,34,Yes,Travel_Rarely,699,Research & Development,6,1,,1,2,...,3,80,0,8,2,,4,2,1,3
33,32,Yes,Travel_Frequently,1125,Research & Development,16,1,,1,2,...,2,80,0,10,5,,10,2,6,7


In [113]:
## Get summary statistics for age for each group
grouped['Age'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
JobSatisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Low,289.0,36.916955,9.245496,19.0,30.0,36.0,42.0,60.0
Very High,459.0,36.795207,9.125609,18.0,30.0,35.0,43.0,60.0


In [114]:
## Get employee count per department for each group
grouped['Department'].value_counts(normalize=True) * 100

JobSatisfaction  Department            
Low              Research & Development    66.435986
                 Sales                     29.757785
                 Human Resources            3.806228
Very High        Research & Development    64.270153
                 Sales                     32.026144
                 Human Resources            3.703704
Name: Department, dtype: float64

In [115]:
grouped['Department'].value_counts(normalize=True).unstack() * 100

Department,Human Resources,Research & Development,Sales
JobSatisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Low,3.806228,66.435986,29.757785
Very High,3.703704,64.270153,32.026144


In [117]:
# TASK --- Get mean distance from home for each group
grouped['DistanceFromHome'].mean()

JobSatisfaction
Low          9.190311
Very High    9.030501
Name: DistanceFromHome, dtype: float64