## Part 1: Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [2]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [3]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[["Attrition", "Department"]]


In [5]:
# Create a list of at least 10 column names to use as X data
X_columns = ['Education', 'Age', 'DistanceFromHome', 'JobSatisfaction',
          'HourlyRate', 'StockOptionLevel', 'OverTime', 'YearsAtCompany',
          'YearsSinceLastPromotion', 'NumCompaniesWorked', 'WorkLifeBalance',]


# Create X_df using your selected columns
X_df = attrition_df[X_columns]

# Show the data types for X_df
X_df.dtypes


Unnamed: 0,0
Education,int64
Age,int64
DistanceFromHome,int64
JobSatisfaction,int64
HourlyRate,int64
StockOptionLevel,int64
OverTime,object
YearsAtCompany,int64
YearsSinceLastPromotion,int64
NumCompaniesWorked,int64


In [17]:
from sklearn.preprocessing import OneHotEncoder

In [18]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=78)


In [19]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
X_df.loc[:, 'OverTime'] = X_df['OverTime'].astype("category").cat.codes
X_df['OverTime'].value_counts()

Unnamed: 0_level_0,count
OverTime,Unnamed: 1_level_1
0,1054
1,416


In [20]:
# Convert categorical columns to numeric values if needed

X_train = X_train.apply(lambda col: col.astype('category').cat.codes if col.dtype == 'object' else col)
X_test = X_test.apply(lambda col: col.astype('category').cat.codes if col.dtype == 'object' else col)

In [21]:
# Create a StandardScaler
scaler = StandardScaler().fit(X_train)

# Fit the StandardScaler to the training data
X_train_scaled = scaler.transform(X_train)

# Scale the training and testing data
X_test_scaled = scaler.transform(X_test)


In [22]:
# Create a OneHotEncoder for the Department column
department_ecd = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
department_train = department_ecd.fit_transform(y_train[['Department']])
department_columns = department_ecd.get_feature_names_out(['Department'])
y_train_department = pd.DataFrame(department_train, columns=department_columns)
y_train_department

# Create two new variables by applying the encoder
# to the training and testing data
department_test = department_ecd.fit_transform(y_test[['Department']])
department_columns = department_ecd.get_feature_names_out(['Department'])
y_test_department = pd.DataFrame(department_test, columns=department_columns)
y_test_department



Unnamed: 0,Department_Human Resources,Department_Research & Development,Department_Sales
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0
...,...,...,...
363,0.0,1.0,0.0
364,0.0,1.0,0.0
365,0.0,1.0,0.0
366,0.0,1.0,0.0


In [23]:
# Create a OneHotEncoder for the Attrition column
attrition_ecd = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
attrition_train = attrition_ecd.fit_transform(y_train[['Attrition']])
attrition_columns = attrition_ecd.get_feature_names_out(['Attrition'])
y_train_attrition = pd.DataFrame(attrition_train, columns=attrition_columns)
y_train_attrition

# Create two new variables by applying the encoder
# to the training and testing data
attrition_test = attrition_ecd.fit_transform(y_test[['Attrition']])
attrition_columns = attrition_ecd.get_feature_names_out(['Attrition'])
y_test_attrition = pd.DataFrame(attrition_test, columns=attrition_columns)
y_test_attrition


Unnamed: 0,Attrition_No,Attrition_Yes
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,0.0,1.0
...,...,...
363,1.0,0.0
364,1.0,0.0
365,1.0,0.0
366,1.0,0.0


## Create, Compile, and Train the Model

In [27]:
# Find the number of columns in the X training data
n_of_col = X_train_scaled.shape[1]

# Create the input layer
input_layer = layers.Input(shape=(X_train.shape[1],), name='input_features')

# Create at least two shared layers
s_l_1 = layers.Dense(units=64, activation='relu', name="s_l_1")(input_layer)
s_l_2 = layers.Dense(units=128, activation='relu', name="s_l_2")(s_l_1)

In [28]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
department_h_l = layers.Dense(32, activation='relu')(s_l_2)


# Create the output layer
department_o_l = layers.Dense(len(y_train_department.columns), activation='softmax', name='department_output')(department_h_l)


In [29]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
attrition_h_l = layers.Dense(32, activation='relu')(s_l_2)


# Create the output layer
attrition_o_l = layers.Dense(len(y_train_attrition.columns), activation='sigmoid', name='attrition_output')(attrition_h_l)


In [31]:
# Create the model
model = Model(inputs=input_layer, outputs=[department_o_l, attrition_o_l])


# Compile the model
model.compile(optimizer='adam',
              loss={'department_output': 'categorical_crossentropy', 'attrition_output': 'binary_crossentropy'},
              metrics={'department_output': 'accuracy', 'attrition_output': 'accuracy'})

# Summarize the model
model.summary()

In [36]:
# Train the model
model.fit(X_train,
    {'department_output': y_train_department, 'attrition_output': y_train_attrition},
    epochs=100,
    batch_size=32,
    shuffle=True, verbose=2)


Epoch 1/100
35/35 - 0s - 6ms/step - attrition_output_accuracy: 0.9056 - department_output_accuracy: 0.7650 - loss: 1.0282
Epoch 2/100
35/35 - 0s - 8ms/step - attrition_output_accuracy: 0.8757 - department_output_accuracy: 0.6933 - loss: 1.0925
Epoch 3/100
35/35 - 0s - 6ms/step - attrition_output_accuracy: 0.8984 - department_output_accuracy: 0.7650 - loss: 0.8473
Epoch 4/100
35/35 - 0s - 5ms/step - attrition_output_accuracy: 0.9247 - department_output_accuracy: 0.7822 - loss: 0.7979
Epoch 5/100
35/35 - 0s - 4ms/step - attrition_output_accuracy: 0.9283 - department_output_accuracy: 0.7904 - loss: 0.7380
Epoch 6/100
35/35 - 0s - 4ms/step - attrition_output_accuracy: 0.9319 - department_output_accuracy: 0.7913 - loss: 0.7290
Epoch 7/100
35/35 - 0s - 8ms/step - attrition_output_accuracy: 0.9283 - department_output_accuracy: 0.8094 - loss: 0.6888
Epoch 8/100
35/35 - 0s - 9ms/step - attrition_output_accuracy: 0.9120 - department_output_accuracy: 0.7940 - loss: 0.7087
Epoch 9/100
35/35 - 0s -

<keras.src.callbacks.history.History at 0x7cb838d63610>

In [37]:
# Evaluate the model with the testing data
eval = model.evaluate(X_test_scaled, {'department_output': y_test_department, 'attrition_output': y_test_attrition})
eval

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_accuracy: 0.5740 - department_output_accuracy: 0.4820 - loss: 5.1533  


[5.2815775871276855, 0.5978260636329651, 0.47826087474823]

In [41]:
# Print the accuracy for both department and attrition
print("Evaluation results:", eval)
print('Attrition Predictions Accuracy: %.3f' % eval[1])
print('Department Predictions Accuracy: %.3f' % eval[2])

Evaluation results: [5.2815775871276855, 0.5978260636329651, 0.47826087474823]
Attrition Predictions Accuracy: 0.598
Department Predictions Accuracy: 0.478


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. While accuracy is often a go-to metric, it may not be ideal for this dataset due to the class imbalance present in both target features, like "No Attrition" and "Research & Development." Relying solely on accuracy can be misleading, as a model could achieve high accuracy by predicting the dominant class without truly understanding the minority class. For instance, if the goal is to pinpoint employees at risk of leaving, optimizing for recall (reducing missed detections of attrition) might prove more valuable than sheer accuracy. In this case, recall or F1-score might serve as more reliable metrics, ensuring the model’s predictions are not skewed toward the majority class.
2. For the "Department" classification, softmax activation is appropriate as it generates a probability distribution across multiple departments, thus indicating the likelihood of each department for a given sample. This is crucial in multi-class classification tasks, where each observation must be assigned exclusively to one category. In contrast, the "Attrition" output, a binary classification problem, uses the sigmoid activation function. Sigmoid outputs values between 0 and 1, providing the probability that a sample belongs to one of two classes, making it well-suited for yes-or-no classifications.


3. Refining Feature Selection and Engineering: Creating new features or modifying existing ones based on deeper analysis could improve predictive power. Exploring relationships within the data or applying domain-specific insights might yield more informative features.

Hyperparameter Adjustment: Tuning parameters such as the number of neurons, learning rate, batch size, or layers in the network can help boost model accuracy and efficiency. Systematic methods like grid search or random search facilitate this process.

Regularization Techniques: Applying regularization (like L1, L2, or dropout) helps to reduce overfitting by either penalizing overly complex models or selectively deactivating neurons during training, encouraging the model to generalize better.

Using Ensemble Models: Combining the predictions of multiple models, such as through bagging or boosting, can increase model robustness and accuracy by leveraging different perspectives within the data.

Data Augmentation (when applicable): Although primarily relevant for image data, augmentation strategies can enhance diversity within the training set, leading to better generalization in some contexts.

Experimenting with Model Architectures: Adapting the architecture to fit specific data types or problem structures—such as using convolutional neural networks for image data or recurrent networks for sequential data—might provide performance gains tailored to the dataset at hand.