# Series and DataFrames

In [1]:
%autosave 5

Autosaving every 5 seconds


In [2]:
# Import libraries
import pandas as pd
import numpy as np

## Series

In [None]:
# Weekday labels and the temperature recorded on each of those days
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
temperature = [33, 19, 15, 89, 11, -5, 9]

# Build a Series of temperatures labelled by weekday
pd.Series(temperature, index=days)

In [None]:
# Temperature per weekday, keyed by the weekday label
my_dict = {
    'Mon': 33, 'Tue': 19, 'Wed': 15, 'Thu': 89,
    'Fri': 11, 'Sat': -5, 'Sun': 9,
}

# Build a Series from the dictionary: keys become the index, values the data
pd.Series(my_dict)

In [None]:
# 15 evenly spaced values from 0 to 10 (both endpoints included)
my_array = np.linspace(0, 10, num=15)

# Build a Series from the array; no index is given, so pandas assigns 0..14
pd.Series(my_array)

In [None]:
# TASK --- Starting from the series below (even numbers 0 to 18)...
my_series = pd.Series(data=np.arange(0, 20, 2))
print(my_series)

# -------- Add 1 to every value


In [None]:
# TASK --- Starting from the series below (even numbers 0 to 18)...
my_series = pd.Series(data=np.arange(0, 20, 2))

# -------- Multiply every value by 2


In [None]:
# TASK --- Starting from the series below (even numbers 0 to 18)...
my_series = pd.Series(data=np.arange(0, 20, 2))

# -------- Compute the exponential of every value


## DataFrames

In [None]:
# Load the HR attrition CSV into a DataFrame, using the `EmployeeNumber`
# column as the row index
data = pd.read_csv(
    "HR-Employee-Attrition.csv",
    index_col='EmployeeNumber',
)

In [None]:
# Row labels of the DataFrame (the `EmployeeNumber` values set as the index on load)
data.index

In [None]:
# Column labels of the DataFrame
data.columns

In [None]:
# Cell values of the DataFrame as a NumPy array (row and column labels are not included)
data.values

# Operations and manipulations
## Inspection of data

In [None]:
# Preview the first five rows of the DataFrame
data.head(n=5)

In [None]:
# Preview the last five rows of the DataFrame
data.tail(n=5)

## Selection, addition, and deletion of data

In [None]:
# Select a single column by name (the result is a Series) and preview its first rows
data['Age'].head()

In [None]:
# Select several columns by passing a list of names (the result is a DataFrame)
data[['Age', 'Gender', 'YearsAtCompany']].head()

In [None]:
# Derive a new column from an existing one: each employee's age in months
data['AgeInMonths'] = data['Age'].mul(12)
data['AgeInMonths'].head()

In [None]:
# Remove the `AgeInMonths` column, modifying `data` in place
data.drop(columns='AgeInMonths', inplace=True)

In [None]:
# TASK --- Drop `EmployeeCount`


## Slicing DataFrames

In [None]:
# Slice a Series by position: rows at positions 10-14.
# The bounds are row positions, not `EmployeeNumber` labels.
data['BusinessTravel'].iloc[10:15]

In [None]:
# Slice a DataFrame by position: rows at positions 10-14, all columns.
# The bounds are row positions, not `EmployeeNumber` labels.
data.iloc[10:15]

In [None]:
# TASK --- Access the 5th to 8th rows of `Department` and `EducationField`


In [None]:
# Select rows by index label using `loc` (here, a list of `EmployeeNumber` values)
data.loc[[15, 94, 337, 1120]]

In [None]:
# Select rows by integer position using `iloc` (positions 0-4, i.e. the first five rows)
data.iloc[0:5]

# Answering questions

In [None]:
# Total employees by department: number of rows for each distinct `Department` value
data['Department'].value_counts()

In [None]:
# Overall attrition rate: share of rows for each `Attrition` value
# (normalize=True returns fractions instead of raw counts)
normalized_count = data['Attrition'].value_counts(normalize=True)
normalized_count

In [None]:
# Fraction of employees whose `Attrition` value is 'Yes'
normalized_count['Yes']

In [None]:
# Average hourly rate across all employees
data['HourlyRate'].mean()

In [None]:
# Summary statistics for years at the company; the `mean` entry is the average number of years
data['YearsAtCompany'].describe()

In [None]:
# Employees with the most years at the company: sort descending, keep the top five
data['YearsAtCompany'].sort_values(ascending=False).head(5)

In [None]:
# Overall employee satisfaction
# Lookup from the numeric satisfaction codes (1-4) to readable labels
job_satisfaction_dict = dict(
    enumerate(['Low', 'Medium', 'High', 'Very High'], start=1)
)

In [None]:
# Recode the numeric satisfaction codes as readable labels.
# `replace` leaves values that are not keys of the mapping untouched, so
# re-running this cell after the column has been recoded is harmless.
# With `map`, a second run would turn every value into NaN.
data['JobSatisfaction'] = data['JobSatisfaction'].replace(job_satisfaction_dict)
data['JobSatisfaction'].head()

In [None]:
# Share of employees at each job-satisfaction level (fractions, not raw counts)
data['JobSatisfaction'].value_counts(normalize=True)

In [None]:
# TASK --- Employees per education field


In [None]:
# TASK --- Employees per performance rating


# Answering complex questions

In [None]:
# Employees with Low Job Satisfaction
# The comparison yields a boolean Series: True where `JobSatisfaction` is 'Low'
data['JobSatisfaction'] == 'Low'

In [None]:
# Use the boolean mask to keep only the matching rows, then read off their index labels
data.loc[data['JobSatisfaction'] == 'Low'].index

In [None]:
# Employees with both Low Job Satisfaction and Low Job Involvement
# Lookup from the numeric involvement codes (1-4) to readable labels
job_involvement_dict = {
    1: 'Low',
    2: 'Medium',
    3: 'High',
    4: 'Very High'
}

# Recode the numeric involvement codes as readable labels.
# `replace` leaves values that are not keys of the mapping untouched, so
# re-running this cell after the column has been recoded is harmless.
# With `map`, a second run would turn every value into NaN.
data['JobInvolvement'] = data['JobInvolvement'].replace(job_involvement_dict)

In [None]:
# Index labels of employees who are 'Low' on both job satisfaction and job involvement
data.loc[
    data['JobSatisfaction'].eq('Low') & data['JobInvolvement'].eq('Low')
].index

In [None]:
# Employee comparison
## Keep only the employees whose job satisfaction is 'Low' or 'Very High'
subset = data.loc[data['JobSatisfaction'].isin(['Low', 'Very High'])]

## Report the size of the subset and how many employees fall in each level
print('Shape: ', subset.shape)
print('\nJob Satisfaction Count')
print(subset['JobSatisfaction'].value_counts())

In [None]:
## Split the subset into one group per distinct 'JobSatisfaction' value
grouped = subset.groupby('JobSatisfaction')

## View groups in the GroupBy object: each group key mapped to the index labels of its rows
grouped.groups

In [None]:
## Retrieve the rows belonging to the 'Low' group and preview the first few
grouped.get_group('Low').head()

In [None]:
## Summary statistics of 'Age', computed separately for each job-satisfaction group
grouped['Age'].describe()

In [None]:
## Percentage of employees per department within each group
## (normalize=True gives fractions; multiplying by 100 turns them into percentages)
grouped['Department'].value_counts(normalize=True) * 100

In [None]:
## Same percentages, with the department level unstacked into columns (one row per group)
grouped['Department'].value_counts(normalize=True).unstack() * 100

In [None]:
# TASK --- Get mean distance from home for each group
