## Part 1: Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

# Import the data. --------------------------------------------------------------------------------------> (5 points)

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [2]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [3]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[["Attrition", "Department"]]
y_df

# Create y_df with the attrition and department columns. ------------------------------------------------------------------------------------------> (5 points)

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development
...,...,...
1465,No,Research & Development
1466,No,Research & Development
1467,No,Research & Development
1468,No,Sales


In [4]:
# Education                   int64
# Age                         int64
# DistanceFromHome            int64
# JobSatisfaction             int64
# OverTime                   object
# StockOptionLevel            int64
# WorkLifeBalance             int64
# YearsAtCompany              int64
# YearsSinceLastPromotion     int64
# NumCompaniesWorked          int64
# dtype: object
# =========================================================

# Create a list of at least 10 column names to use as X data
x_control_list = [
    "Education",
    "Age",
    "DistanceFromHome",
    "JobSatisfaction",
    "OverTime",
    "StockOptionLevel",
    "WorkLifeBalance",
    "YearsAtCompany",
    "YearsSinceLastPromotion",
    "NumCompaniesWorked"
    ]
# Choose 10 columns for X. ---------------------------------------------------------------> (5 points)

# Create X_df using your selected columns
X_df = attrition_df[x_control_list]

# Show the data types for X_df
X_df.dtypes

# Show the data types of the X columns. ------------------------------------------------> (5 points)

Unnamed: 0,0
Education,int64
Age,int64
DistanceFromHome,int64
JobSatisfaction,int64
OverTime,object
StockOptionLevel,int64
WorkLifeBalance,int64
YearsAtCompany,int64
YearsSinceLastPromotion,int64
NumCompaniesWorked,int64


In [5]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df)


# Split the data into training and testing sets. ---------------------------------------------------------------> (5 points)

In [6]:
# Overtime needs to become numeric
X_train["OverTime"].value_counts()

Unnamed: 0_level_0,count
OverTime,Unnamed: 1_level_1
No,784
Yes,318


In [7]:
# M19D03A02 - branching

# No     788
# Yes    314
# Name: OverTime, dtype: int64
# ===========================

# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

from sklearn.preprocessing import LabelEncoder
X_train_encoded['OverTime'] = LabelEncoder().fit_transform(X_train_encoded['OverTime'])
X_test_encoded['OverTime'] = LabelEncoder().fit_transform(X_test_encoded['OverTime'])
X_train_encoded['OverTime'].value_counts()

### Use specific value counts to determine cooralation to origional data in the cell above


Unnamed: 0_level_0,count
OverTime,Unnamed: 1_level_1
0,784
1,318


In [8]:
X_train_encoded.dtypes
# Encode all X data to numeric types. -----------------------------------------------------------------------------------------------> (5 points)

Unnamed: 0,0
Education,int64
Age,int64
DistanceFromHome,int64
JobSatisfaction,int64
OverTime,int64
StockOptionLevel,int64
WorkLifeBalance,int64
YearsAtCompany,int64
YearsSinceLastPromotion,int64
NumCompaniesWorked,int64


In [9]:
# M18D02A07 - detecting_article_objectivity
# StandardScaler was imported as part of the origional file

# Create a StandardScaler
scaler = StandardScaler().fit(X_train_encoded)


# Fit the StandardScaler to the training data
X_train_scaled = scaler.transform(X_train_encoded)


# Scale the training and testing data
X_test_scaled = scaler.transform(X_test_encoded)

# Scale the X data.  --------------------------------------------------------------------------------------------------------------------------------------------------> (5 points)

### Note to Grader → I missed using the scalled data on a previous asignment, and points were deducted from my final score.
### I have since fixed that issue, and have re-submitted the asignment, with nearly a 10% improvement in Accuracy (thank you!).
### It is my understanding that homework can be re-submitted.

### A) If Homework cannot be resubmitted, plese let me know in the assignment comment.
### B) If it can, please let me know how to flag previous homework for review.
### C) For your summary, please don't include the parts I got RIGHT.  I'm not worried about stuff I already know.
###    Please just include mistakes.  I want to get better.


In [27]:
y_df.head()

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [10]:
y_df["Department"].value_counts()

Unnamed: 0_level_0,count
Department,Unnamed: 1_level_1
Research & Development,961
Sales,446
Human Resources,63


In [11]:
# array([[0., 1., 0.],
#        [0., 1., 0.],
#        [0., 1., 0.],
#        ...,
#        [0., 1., 0.],
#        [0., 0., 1.],
#        [0., 1., 0.]])

# M19D03A02 - branching
# quality_encoder = OneHotEncoder(sparse_output=False)
# quality_encoded = quality_encoder.fit_transform(df[['quality']])
# quality_columns = quality_encoder.get_feature_names_out(['quality'])
# df_quality_encoded = pd.DataFrame(quality_encoded, columns=quality_columns)

# ------------------------------------------------------------------
# ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# ohe.fit(data)
# encoded_data = ohe.transform(data)
#   ↑←←←←←←←←←←←←←←←←←←←←←←←←←←←↓
# encoded_data = ohe.fit_transform(data)    /// Alternate version, combined
# ohe.set_output(transform="pandas")      \\\ Convert to DataFrame (Optional)
# encoded_df = ohe.fit_transform(data)    \\\      data, w/ new output

from sklearn.preprocessing import OneHotEncoder

# Create a OneHotEncoder for the Department column
department_encoder = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
department_encoder.fit(y_train[['Department']])

# Create two new variables by applying the encoder to the training and testing data
y_train_d_encoded = department_encoder.transform(y_train[['Department']])
y_test_d_encoded = department_encoder.transform(y_test[['Department']])

y_train_d_encoded

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [12]:
# array([[1., 0.],
#        [0., 1.],
#        [1., 0.],
#        ...,
#        [1., 0.],
#        [1., 0.],
#        [1., 0.]])

# Create a OneHotEncoder for the Attrition column
attrition_encoder = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
attrition_encoder.fit(y_train[['Attrition']])

# Create two new variables by applying the encoder to the training and testing data
y_train_a_encoded = attrition_encoder.transform(y_train[['Attrition']])
y_test_a_encoded = attrition_encoder.transform(y_test[['Attrition']])
y_train_a_encoded

# Encode all y data to numeric types. --------------------------------------------------------------------------------> (5 points)

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]])

In [13]:
y_train


Unnamed: 0,Attrition,Department
405,Yes,Research & Development
1300,No,Sales
665,No,Sales
1260,No,Research & Development
137,No,Sales
...,...,...
212,No,Sales
1125,No,Research & Development
487,No,Research & Development
479,Yes,Research & Development


## Create, Compile, and Train the Model

In [14]:
# y_train, y_test are both dfs // encodded are arrays
# y_train_d_encoded
# y_train_a_encoded
# y_test_d_encoded
# y_test_a_encoded

department_columns = department_encoder.get_feature_names_out(['Department'])
df_y_train_d = pd.DataFrame(y_train_d_encoded, columns=department_columns)
df_y_test_d = pd.DataFrame(y_test_d_encoded, columns=department_columns)

attrition_columns = attrition_encoder.get_feature_names_out(['Attrition'])
df_y_train_a = pd.DataFrame(y_train_a_encoded, columns=attrition_columns)
df_y_test_a = pd.DataFrame(y_test_a_encoded, columns=attrition_columns)

df_y_train = pd.concat([df_y_train_d, df_y_train_a], axis=1)
df_y_test = pd.concat([df_y_test_d, df_y_test_a], axis=1)

df_y_train_a.head()

# ------------------------------------------------------------------------------------------------------
# ---------------------------------- Organizing variables ----------------------------------------------
# ------------------------------------------------------------------------------------------------------
# X_train_encoded ---> train
# X_test_encoded ---> val
# df_y_train_a --> fit
# df_y_train_d --> fit
# df_y_test_a --> val
# df_y_test_b --> Val
#
# model.fit(
#     X_train_encoded,
#     {'department_output': df_y_train_d, 'attrition_output': df_y_train_a},
#     epochs=10,
#     batch_size=32,
#     validation_split=0.2
# )
# ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
# test_results = model.evaluate(X_test_uncoded, {'department_output': df_y_test_d, 'attrition_output': df_y_test_a})
# print(f"Quality Accuracy: {test_results[0]}")
# print(f"Color Accuracy: {test_results[2]}")


Unnamed: 0,Attrition_No,Attrition_Yes
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


In [15]:
number_input_features = len(X_train)
number_input_features

1102

In [29]:
# M18D02A06 - Getting REal Solutions
# M19D03A02 - Branching

# Find the number of columns in the X training data
number_input_features = len(X_train)
# Find the number of columns in the X training data. ----------------------------------------------------------------------> (5 points)

# Create the input layer
###                                               ↓ » Do NOT forget the comma.  The input is an array, adn you NEED to state the 2nd axis on the input layer, even if it's only 1
input_layer = layers.Input(shape=(X_train.shape[1],), name='input_features')
# Create an input layer. ---------------------------------------------------------------------------------------------------> (5 points)

# Create at least two shared layers
shared_layer1 = layers.Dense(64, activation='relu', name='shared_01')(input_layer)
shared_layer2 = layers.Dense(128, activation='relu', name='shared_02')(shared_layer1)

# Create at least two shared hidden layers. -----------------------------------------------------------------------------------> (10 points)


In [30]:
# Create a branch for Department with a hidden layer and an output layer

# Create the hidden layer
department_hidden = layers.Dense(32, activation='relu', name='department_hidden')(shared_layer2)

# Create the output layer
department_output = layers.Dense(3, activation='softmax', name='department_output')(department_hidden)


In [38]:
# Create a branch for Attrition with a hidden layer and an output layer

# Create the hidden layer
attrition_hidden = layers.Dense(32, activation='relu', name='attrition_hidden')(shared_layer2)

# Create the output layer
attrition_output = layers.Dense(2, activation='sigmoid', name='attrition_output')(attrition_hidden)


In [39]:
# Model: "model"
# __________________________________________________________________________________________________
#  Layer (type)                Output Shape                 Param #   Connected to
# ==================================================================================================
#  input (InputLayer)          [(None, 10)]                 0         []
#  shared1 (Dense)             (None, 64)                   704       ['input[0][0]']
#  shared2 (Dense)             (None, 128)                  8320      ['shared1[0][0]']
#  department_hidden (Dense)   (None, 32)                   4128      ['shared2[0][0]']
#  attrition_hidden (Dense)    (None, 32)                   4128      ['shared2[0][0]']
#  department_output (Dense)   (None, 3)                    99        ['department_hidden[0][0]']
#  attrition_output (Dense)    (None, 2)                    66        ['attrition_hidden[0][0]']
# ==================================================================================================
# Total params: 17445 (68.14 KB)
# Trainable params: 17445 (68.14 KB)
# Non-trainable params: 0 (0.00 Byte)
# __________________________________________________________________________________________________


# Create the model
model = Model(inputs=input_layer, outputs=[department_output, attrition_output])

# Compile the model
model.compile(optimizer='adam',
              loss={'department_output': 'categorical_crossentropy', 'attrition_output': 'binary_crossentropy'},
              metrics={'department_output': 'accuracy', 'attrition_output': 'recall'})

# Summarize the model
model.summary()

In [40]:
# Epoch 99/100
# 35/35 [==============================] - 0s 3ms/step - loss: 0.0564 - department_output_loss: 0.0254 - attrition_output_loss: 0.0309 - department_output_accuracy: 0.9955 - attrition_output_accuracy: 0.9918
# Epoch 100/100
# 35/35 [==============================] - 0s 3ms/step - loss: 0.0607 - department_output_loss: 0.0368 - attrition_output_loss: 0.0239 - department_output_accuracy: 0.9891 - attrition_output_accuracy: 0.9927

# Train the model
model.fit(
    X_train_encoded,
    {'department_output': df_y_train_d, 'attrition_output': df_y_train_a},
    epochs=100,
    batch_size=32,
    validation_split=0.2 ### → 2nd internal split for adam already taken care of here // no need for extrnal split
)


Epoch 1/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - attrition_output_recall: 0.5197 - department_output_accuracy: 0.7615 - loss: 1.2560 - val_attrition_output_recall: 0.8416 - val_department_output_accuracy: 0.5973 - val_loss: 1.5789
Epoch 2/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - attrition_output_recall: 0.8503 - department_output_accuracy: 0.7586 - loss: 0.9428 - val_attrition_output_recall: 0.8507 - val_department_output_accuracy: 0.5882 - val_loss: 1.4687
Epoch 3/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - attrition_output_recall: 0.8408 - department_output_accuracy: 0.7465 - loss: 0.9441 - val_attrition_output_recall: 0.8507 - val_department_output_accuracy: 0.6109 - val_loss: 1.5423
Epoch 4/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - attrition_output_recall: 0.8475 - department_output_accuracy: 0.7290 - loss: 0.9339 - val_attrition_outpu

<keras.src.callbacks.history.History at 0x78dcffcefe50>

In [41]:
# 12/12 [==============================] - 0s 3ms/step - loss: 4.1624 - department_output_loss: 3.1143 - attrition_output_loss: 1.0481 - department_output_accuracy: 0.5272 - attrition_output_accuracy: 0.8261
# [4.162380218505859,
#  3.114327907562256,
#  1.0480519533157349,
#  0.5271739363670349,
#  0.8260869383811951]

# Evaluate the model with the testing data
test_results = model.evaluate(X_test_encoded, {'department_output': df_y_test_d, 'attrition_output': df_y_test_a})
test_results

# 12/12 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - attrition_output_accuracy: 0.7990 - department_output_accuracy: 0.5596 - loss: 1.8162
# [1.6625502109527588, 0.8288043737411499, 0.58423912525177]


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_recall: 0.7863 - department_output_accuracy: 0.5318 - loss: 3.6411 


[3.2669641971588135, 0.8070651888847351, 0.551630437374115]

In [42]:
# Department predictions accuracy: 0.5271739363670349
# Attrition predictions accuracy: 0.8260869383811951

# Print the accuracy for both department and attrition
print(f"Department predictions accuracy: {test_results[2]}")
print(f"Attrition predictions accuracy: {test_results[1]}")

Department predictions accuracy: 0.551630437374115
Attrition predictions accuracy: 0.8070651888847351


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. No.  The main focus of the Attrition anaylisys is for the identificatin of at-risk employees.  False Positives (as long ans not too infrequent as to displace resources) is nto an issue with this model. Special attention given to "happy" employes has little risk of makign them "unhappy".  Fals negatives, on the other hand, will let at-risk employees remain undetected / unaddresses.  The best Metric for this portion of the analysis would be a Reval function.
"Accuracy" for the Department portion of the anaylis will be fine, as it has the added features of auto-scaling based on shape.  If the departmetn list is changed later on, this portion will automatically adjust.
2. **<ins>Department</ins>** : I used Softmax, because there are three different possible outcomes (the three different departments), and the output is mututally exclusive (you can only ne on 1 departmetn at the time of attritions).  **<ins>Attrition</ins>** : I used Sigmoid, as the outcome from this field will be a binary state (quit / not quit).
3. More data, verification of persent data (error may be generated by incorrect reporting), Longer study (some of the present "non-Attrition" employees may be int eh process of quitting), information on the job they are going to, direct manager (some portion of the Attrition rate may be due to inter-personal conflict).
