## Part 1: Preprocessing

In [None]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers


# Load the employee attrition dataset from the hosted course CSV
attrition_url = 'https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv'
attrition_df = pd.read_csv(attrition_url)

# Preview the first five rows
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [None]:
# Determine the number of unique values in each column.
# Columns with only a handful of distinct values (e.g. Attrition, Department)
# are the categorical ones that will need encoding later.
attrition_df.nunique()

Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [None]:
# Build y_df from the two target columns (Attrition and Department).
# Selecting from attrition_df keeps its row index, so y_df stays aligned with it.
target_columns = ['Attrition', 'Department']
y_df = attrition_df[target_columns].copy()
y_df.head()

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [None]:
# Get a list of all column names, used to pick the feature columns for X
attrition_df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [None]:
# Feature columns used as X data (ten columns; neither target column is included)
columns = [
    'Age',
    'BusinessTravel',
    'DistanceFromHome',
    'Education',
    'HourlyRate',
    'JobLevel',
    'JobSatisfaction',
    'PerformanceRating',
    'WorkLifeBalance',
    'YearsAtCompany',
]

# Create X_df by selecting those columns from the full dataset
X_df = attrition_df.loc[:, columns]
X_df.head()

Unnamed: 0,Age,BusinessTravel,DistanceFromHome,Education,HourlyRate,JobLevel,JobSatisfaction,PerformanceRating,WorkLifeBalance,YearsAtCompany
0,41,Travel_Rarely,1,2,94,2,4,3,1,6
1,49,Travel_Frequently,8,1,61,2,2,4,3,10
2,37,Travel_Rarely,2,2,92,1,3,3,3,0
3,33,Travel_Frequently,3,4,56,1,3,3,3,8
4,27,Travel_Rarely,2,1,40,1,2,3,3,2


In [None]:
# Show the data types for X_df.
# Any non-numeric (object) column listed here must be encoded before
# the data can be fed to the network.
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Age                1470 non-null   int64 
 1   BusinessTravel     1470 non-null   object
 2   DistanceFromHome   1470 non-null   int64 
 3   Education          1470 non-null   int64 
 4   HourlyRate         1470 non-null   int64 
 5   JobLevel           1470 non-null   int64 
 6   JobSatisfaction    1470 non-null   int64 
 7   PerformanceRating  1470 non-null   int64 
 8   WorkLifeBalance    1470 non-null   int64 
 9   YearsAtCompany     1470 non-null   int64 
dtypes: int64(9), object(1)
memory usage: 115.0+ KB


In [None]:
# Split the data into X and two separate y targets.
# Double-bracket selection keeps each target as a single-column DataFrame
# (the encoders later index it by column name) and preserves the row index
# it shares with X_df.
X = X_df
y_dept = y_df[["Department"]]
y_att = y_df[["Attrition"]]

# Split the features and both targets in a single call so that all of them get
# the same row partition (80% train / 20% test). random_state fixes the shuffle.
# train_test_split is already imported in the notebook's first cell.
X_train, X_test, y_dept_train, y_dept_test, y_att_train, y_att_test = train_test_split(
    X, y_dept, y_att, test_size=0.2, random_state=1
)

In [None]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
# List the distinct BusinessTravel categories to decide how to encode
# this text column.
X_df["BusinessTravel"].unique()

array(['Travel_Rarely', 'Travel_Frequently', 'Non-Travel'], dtype=object)

In [None]:
# Explore the y data: class frequencies for each target
dept_class_counts = y_dept.value_counts()
att_class_counts = y_att.value_counts()

print(f"Department Classes: {dept_class_counts}")
print(f"Attrition Classes: {att_class_counts}")

Department Classes: Department            
Research & Development    961
Sales                     446
Human Resources            63
Name: count, dtype: int64
Attrition Classes: Attrition
No           1233
Yes           237
Name: count, dtype: int64


In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical "BusinessTravel" feature.
# sparse_output=False returns a dense array; handle_unknown='ignore' encodes a
# category not seen during fit as all zeros instead of raising at transform time.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Learn the categories from the training rows only, then apply them to both splits
travel_train = X_train[['BusinessTravel']]
travel_test = X_test[['BusinessTravel']]
X_train_encoded = encoder.fit_transform(travel_train)
X_test_encoded = encoder.transform(travel_test)

# Names of the generated indicator columns
encoded_feature_names = encoder.get_feature_names_out(['BusinessTravel'])

# Wrap the encoded arrays in DataFrames; a frame built from a plain array
# already gets a fresh 0..n-1 index
X_train_processed = pd.DataFrame(X_train_encoded, columns=encoded_feature_names)
X_test_processed = pd.DataFrame(X_test_encoded, columns=encoded_feature_names)

In [None]:
# Shapes of the one-hot encoded BusinessTravel frames (train, then test)
for encoded_frame in (X_train_processed, X_test_processed):
    print(encoded_frame.shape)

(1176, 3)
(294, 3)


In [None]:
# Remove the original text BusinessTravel column from both splits and reset the
# row index to 0..n-1 so the frames line up positionally with the encoded frames.
X_train_drop = X_train.drop(columns=['BusinessTravel']).reset_index(drop=True)
X_test_drop = X_test.drop(columns=['BusinessTravel']).reset_index(drop=True)

for dropped_frame in (X_train_drop, X_test_drop):
    print(dropped_frame.shape)

(1176, 9)
(294, 9)


In [None]:
# Count missing values per column in every frame that is about to be concatenated
null_checks = (
    ("Null values in X_train_drop:\n", X_train_drop),
    ("Null values in X_train_processed:\n", X_train_processed),
    ("Null values in X_test_drop:\n", X_test_drop),
    ("Null values in X_test_processed:\n", X_test_processed),
)
for header, frame in null_checks:
    print(header, frame.isnull().sum())

Null values in X_train_drop:
 Age                  0
DistanceFromHome     0
Education            0
HourlyRate           0
JobLevel             0
JobSatisfaction      0
PerformanceRating    0
WorkLifeBalance      0
YearsAtCompany       0
dtype: int64
Null values in X_train_processed:
 BusinessTravel_Non-Travel           0
BusinessTravel_Travel_Frequently    0
BusinessTravel_Travel_Rarely        0
dtype: int64
Null values in X_test_drop:
 Age                  0
DistanceFromHome     0
Education            0
HourlyRate           0
JobLevel             0
JobSatisfaction      0
PerformanceRating    0
WorkLifeBalance      0
YearsAtCompany       0
dtype: int64
Null values in X_test_processed:
 BusinessTravel_Non-Travel           0
BusinessTravel_Travel_Frequently    0
BusinessTravel_Travel_Rarely        0
dtype: int64


In [None]:
# Show the row index of both training frames; they must match for the
# column-wise concat that follows to pair up the right rows
index_checks = (
    ("X_train_drop index:", X_train_drop),
    ("X_train_processed index:", X_train_processed),
)
for label, frame in index_checks:
    print(label, frame.index)

X_train_drop index: RangeIndex(start=0, stop=1176, step=1)
X_train_processed index: RangeIndex(start=0, stop=1176, step=1)


In [None]:
# Join the numeric columns and the one-hot BusinessTravel columns side by side
train_parts = [X_train_drop, X_train_processed]
test_parts = [X_test_drop, X_test_processed]
X_train_concat = pd.concat(train_parts, axis="columns")
X_test_concat = pd.concat(test_parts, axis="columns")

for combined_frame in (X_train_concat, X_test_concat):
    print(combined_frame.shape)

(1176, 12)
(294, 12)


In [None]:
# One-hot encode the Department target.
encoder = OneHotEncoder(sparse_output=False)

# Learn the department categories from the training labels only ...
dept_train_labels = y_dept_train[["Department"]]
dept_test_labels = y_dept_test[["Department"]]
encoder.fit(dept_train_labels)

# ... then encode both the training and the testing labels with them
y_dept_train_encoded = encoder.transform(dept_train_labels)
y_dept_test_encoded = encoder.transform(dept_test_labels)

In [None]:
# One-hot encode the Attrition target.
encoder = OneHotEncoder(sparse_output=False)

# Learn the attrition categories from the training labels only ...
att_train_labels = y_att_train[["Attrition"]]
att_test_labels = y_att_test[["Attrition"]]
encoder.fit(att_train_labels)

# ... then encode both the training and the testing labels with them
y_att_train_encoded = encoder.transform(att_train_labels)
y_att_test_encoded = encoder.transform(att_test_labels)

In [None]:
# Sanity-check the row counts of the attrition targets and the features after the split.
# Plain f-strings are used here: wrapping a value in bare braces inside print()
# builds a one-element set, which is why the shapes used to print as {(1176, 1)}.
print(f"y_att_train shape: {y_att_train.shape}")
print(f"y_att_test shape: {y_att_test.shape}")
print(f"X_train shape: {X_train.shape}")

{(1176, 1)}
{(294, 1)}
{(1176, 10)}


## Create, Compile, and Train the Model

In [None]:
# Find the number of columns in the X training data.
# The model is trained on X_train_concat (the numeric columns plus the one-hot
# BusinessTravel indicators), so its width -- not X_train's -- sets the input size.
num_features = X_train_concat.shape[1]
print(num_features)

# Create the input layer, sized from the data instead of a hard-coded constant
input_layer = layers.Input(shape=(num_features,), name="input_features")

# Shared hidden layers feeding both output branches.
# NOTE(review): the Keras layer names lag the variable names by one
# ("input_layer" is actually the first shared Dense layer); they are kept as-is
# so model.summary() and any name-based lookups stay unchanged.
shared_layer_1 = layers.Dense(128, activation='relu', name = "input_layer")(input_layer)
shared_layer_2 = layers.Dense(64, activation='relu', name = "shared_layer_1")(shared_layer_1)

10


In [None]:
# Department branch (multiclass target): one hidden layer on top of the shared
# layers, followed by the output layer.
department_hidden = layers.Dense(32, activation='relu', name="department_hidden_layer")
department_layer = department_hidden(shared_layer_2)

# Output layer: three units with softmax
department_head = layers.Dense(3, activation='softmax', name='department_output')
department_output = department_head(department_layer)

In [None]:
# Attrition branch: one hidden layer on top of the shared layers, followed by
# the output layer.
attrition_hidden = layers.Dense(32, activation='relu', name="attrition_hidden_layer")
attrition_layer = attrition_hidden(shared_layer_2)

# Output layer: two sigmoid units.
# NOTE(review): sigmoid scores each unit independently, so the two values need
# not sum to 1; a single sigmoid unit or a 2-unit softmax may be the intended
# design for a one-hot Yes/No target -- confirm.
attrition_head = layers.Dense(2, activation='sigmoid', name='attrition_output')
attrition_output = attrition_head(attrition_layer)

In [None]:
# Assemble the two-headed model: one shared input, two outputs
model_outputs = [department_output, attrition_output]
model = Model(inputs=input_layer, outputs=model_outputs)

# Per-output loss and metric, keyed by the output layers' names
output_losses = {
    'department_output': 'categorical_crossentropy',
    'attrition_output': 'binary_crossentropy',
}
output_metrics = {
    'department_output': 'accuracy',
    'attrition_output': 'accuracy',
}

# Compile the model
model.compile(optimizer='adam', loss=output_losses, metrics=output_metrics)

# Summarize the model
model.summary()

In [None]:
# Train the model on the encoded features, supplying one target array per
# output (keyed by output-layer name). The last 20% of the training rows is
# held out for validation.
train_targets = {
    'department_output': y_dept_train_encoded,
    'attrition_output': y_att_train_encoded,
}
model.fit(X_train_concat, train_targets, epochs=100, batch_size=35, validation_split=0.2)

Epoch 1/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - attrition_output_accuracy: 0.8377 - department_output_accuracy: 0.4768 - loss: 3.1089 - val_attrition_output_accuracy: 0.8305 - val_department_output_accuracy: 0.6525 - val_loss: 1.2593
Epoch 2/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - attrition_output_accuracy: 0.8523 - department_output_accuracy: 0.6446 - loss: 1.2140 - val_attrition_output_accuracy: 0.8390 - val_department_output_accuracy: 0.6398 - val_loss: 1.2578
Epoch 3/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - attrition_output_accuracy: 0.8520 - department_output_accuracy: 0.6598 - loss: 1.1898 - val_attrition_output_accuracy: 0.8390 - val_department_output_accuracy: 0.6059 - val_loss: 1.2677
Epoch 4/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - attrition_output_accuracy: 0.8722 - department_output_accuracy: 0.6498 - loss: 1.1662 - val_a

<keras.src.callbacks.history.History at 0x7a5bd4405f90>

In [None]:
# Evaluate the model with the testing data.
# evaluate returns a flat list; in the recorded run it holds three values,
# in this order:
# [total_loss, attrition_accuracy, department_accuracy]
# NOTE(review): the order and length can differ between Keras versions --
# confirm against the printed list before indexing into it.
test_targets = {
    'department_output': y_dept_test_encoded,
    'attrition_output': y_att_test_encoded,
}
test_results = model.evaluate(X_test_concat, test_targets)
test_results

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_accuracy: 0.7907 - department_output_accuracy: 0.6353 - loss: 1.3128 


[1.260595679283142, 0.8027210831642151, 0.6326530575752258]

In [None]:
# Display evaluation results; positions follow the list returned by model.evaluate
department_accuracy = test_results[2]
attrition_accuracy = test_results[1]
print(f"Department Accuracy: {department_accuracy} \nAttrition Accuracy: {attrition_accuracy}")

Department Accuracy: 0.6326530575752258 
Attrition Accuracy: 0.8027210831642151


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

Accuracy is probably not the best metric to use here, particularly for the Department target. Department is multi-class, and the data is unbalanced, with the majority falling into Research & Development (961 of 1,470 rows), so a model that always predicted that class would already be about 65% accurate. Accuracy is easier to interpret for Attrition because it is a simpler, binary classification of Yes or No. Even this classification might warrant a different metric (such as precision, recall, or F1) because the Attrition data is so unbalanced (1233 No vs. 237 Yes): always predicting "No" would score about 84%.


2. What activation functions did you choose for your output layers, and why?

For the Department output layer, I used the softmax activation function because this target is a multi-class classification, and the output should be exactly one class rather than a mixed or hybrid classification. For the Attrition output layer, I used sigmoid because it is a binary classification target.

3. Can you name a few ways that this model might be improved?

There are a few things I would try in order to improve the model which has high loss and low accuracy (not saying they will all work, but would be good places to start).

Since the model contains such unbalanced data we could:
- Try using class weights for the Department Category or resampling to minimize the impact of the unbalanced data.
- Add training data, change the percentage of training/testing data, or reduce the number of epochs.
- Improve the training data by identifying alternative features, or perhaps scaling the current features.

Use another metric instead of accuracy to better understand the model's performance and where potential issues lie.

Fine-tune the model: use hyperparameter tuning (e.g., Keras Tuner) to optimize the number of neurons, the number of hidden layers, the activation functions, and the number of epochs.



YOUR ANSWERS HERE

1.
2.
3.