## Part 1: Preprocessing

In [310]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.layers import Input, Concatenate
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam


#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [311]:
# Determine the number of unique values in each column
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [312]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]


In [313]:
# Create a list of at least 10 column names to use as X data
X_columns = ['Education', 'Age', 'DistanceFromHome', 'JobSatisfaction', 'OverTime',
             'StockOptionLevel', 'WorkLifeBalance', 'YearsAtCompany', 'Attrition',
             'YearsSinceLastPromotion', 'NumCompaniesWorked']

# Create X_df using your selected columns
X_df = attrition_df[X_columns]

# Show the data types for X_df
X_df.dtypes

Education                   int64
Age                         int64
DistanceFromHome            int64
JobSatisfaction             int64
OverTime                   object
StockOptionLevel            int64
WorkLifeBalance             int64
YearsAtCompany              int64
Attrition                  object
YearsSinceLastPromotion     int64
NumCompaniesWorked          int64
dtype: object

In [314]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=42)



In [315]:
# Convert your X data to numeric data types however you see fit
X_train['OverTime'] = X_train['OverTime'].map({'Yes': 1, 'No': 0})
X_test['OverTime'] = X_test['OverTime'].map({'Yes': 1, 'No': 0})
X_train['Attrition'] = X_train['Attrition'].map({'Yes': 1, 'No': 0})
X_test['Attrition'] = X_test['Attrition'].map({'Yes': 1, 'No': 0})
# Add new code cells as necessary
over_time_counts = X_train['OverTime'].map({0: 'No', 1: 'Yes'}).value_counts()
output = pd.DataFrame({'OverTime': over_time_counts.index, 'Count': over_time_counts.values})
output

Unnamed: 0,OverTime,Count
0,No,780
1,Yes,322


In [316]:
# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
X_train_scaled = scaler.fit_transform(X_train)

# Scale the training and testing data
X_test_scaled = scaler.transform(X_test)

In [317]:
from sklearn.preprocessing import OneHotEncoder

# Create a OneHotEncoder for the Department column
encoder = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
X_train_encoded = encoder.fit_transform(X_train[['OverTime']])

# Create two new variables by applying the encoder
# to the training and testing data
X_test_encoded = encoder.transform(X_test[['OverTime']])

X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(['OverTime']))
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(['OverTime']))

# Show the encoded DataFrames
print("Training Encoded Data:")
print(X_train_encoded_df.head())

print("Testing Encoded Data:")
print(X_test_encoded_df.head())

Training Encoded Data:
   OverTime_0  OverTime_1
0         1.0         0.0
1         1.0         0.0
2         1.0         0.0
3         1.0         0.0
4         1.0         0.0
Testing Encoded Data:
   OverTime_0  OverTime_1
0         1.0         0.0
1         1.0         0.0
2         1.0         0.0
3         1.0         0.0
4         1.0         0.0


In [318]:
# First, let's check the column names of your DataFrame to ensure proper formatting
print(X_train.columns)

# Remove any extra spaces from column names
X_train.columns = X_train.columns.str.strip()
X_test.columns = X_test.columns.str.strip()

# Create a OneHotEncoder for the Attrition column
encoder_attrition = OneHotEncoder(sparse_output=False)


Index(['Education', 'Age', 'DistanceFromHome', 'JobSatisfaction', 'OverTime',
       'StockOptionLevel', 'WorkLifeBalance', 'YearsAtCompany', 'Attrition',
       'YearsSinceLastPromotion', 'NumCompaniesWorked'],
      dtype='object')


In [319]:
# Create a OneHotEncoder for the Attrition column
encoder_attrition = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
X_train_encoded_attrition = encoder_attrition.fit_transform(X_train[['Attrition']])


# Create two new variables by applying the encoder
# to the training and testing data
X_test_encoded_attrition = encoder_attrition.transform(X_test[['Attrition']])
X_train_encoded_attrition_df = pd.DataFrame(X_train_encoded_attrition, columns=encoder_attrition.get_feature_names_out(['Attrition']))
X_test_encoded_attrition_df = pd.DataFrame(X_test_encoded_attrition, columns=encoder_attrition.get_feature_names_out(['Attrition']))


# Show the encoded DataFrames
print("Training Encoded Attrition Data:")
print(X_train_encoded_attrition_df.head())

print("Testing Encoded Attrition Data:")
print(X_test_encoded_attrition_df.head())


Training Encoded Attrition Data:
   Attrition_0  Attrition_1
0          1.0          0.0
1          1.0          0.0
2          1.0          0.0
3          1.0          0.0
4          1.0          0.0
Testing Encoded Attrition Data:
   Attrition_0  Attrition_1
0          1.0          0.0
1          1.0          0.0
2          0.0          1.0
3          1.0          0.0
4          1.0          0.0


## Part 2: Create, Compile, and Train the Model

In [320]:
# Find the number of columns in the X training data.
num_columns = X_train.shape[1]
print(f"Number of columns in the training data: {num_columns}")

# Create the input layer
input_layer = Input(shape=(num_columns,), name='shared_input')

# Create at least two shared layers
shared = Dense(64, activation='relu', name='input_layer')(input_layer)
shared = Dense(32, activation='relu', name='shared_layer_1')(shared)
shared = Dense(16, activation='relu', name='shared_layer_2')(shared)


Number of columns in the training data: 11


In [321]:
# Create a branch for Department
# with a hidden layer and an output layer
department_output = Dense(2, activation='softmax', name='department_output')(shared)
# Create the hidden layer
department_hidden = Dense(32, activation='relu', name='department_hidden')(department_input)
# Create the output layer (corrected: call Dense layer on previous layer)
department_output = Dense(1, activation='sigmoid', name='department_output')(department_hidden)

In [322]:
# Create a branch for Attrition
# with a hidden layer and an output layer
attrition_output = Dense(1, activation='sigmoid', name='attrition_output')(shared)
# Create the hidden layer
attrition_hidden = Dense(32, activation='relu', name='attrition_hidden')(attrition_input)
# Create the output layer 
attrition_output = Dense(1, activation='sigmoid', name='attrition_output')(attrition_hidden)

In [323]:
# Create the model

model = Model(inputs=input_layer, outputs=[department_output, attrition_output])

# Compile the model
model.compile(
    optimizer='adam',
    loss={
        'department_output': 'binary_crossentropy',
        'attrition_output': 'binary_crossentropy'
    },
    metrics={
        'department_output': 'accuracy',
        'attrition_output': 'accuracy'
    }
)

# Summarize the model
model.summary()

In [None]:
print(X_train.shape)  


(1102, 11)


In [348]:
# Remove rows with missing values in 'Attrition' or replace them with a default value like 'No'
X_train['Attrition'] = X_train['Attrition'].fillna('No')  # Optional, if you want to replace NaNs with 'No'

# Map 'Yes' and 'No' to 1 and 0
X_train['Attrition'] = X_train['Attrition'].map({'Yes': 1, 'No': 0})

# Check the result
print(X_train['Attrition'].head())


1343    0
1121    0
1048    0
1393    0
527     0
Name: Attrition, dtype: int64


In [349]:
print(X_train['Attrition'].value_counts())  # This should show counts of 1 and 0


Attrition
0    1102
Name: count, dtype: int64


In [333]:
X_train['Attrition'] = X_train['Attrition'].map({'Yes': 1, 'No': 0})
X_test['Attrition'] = X_test['Attrition'].map({'Yes': 1, 'No': 0})

In [350]:
y_train_department = pd.get_dummies(y_train['Department'], drop_first=True)  
y_test_department = pd.get_dummies(y_test['Department'], drop_first=True)

In [351]:
# Check if department columns were correctly created
print(f"Columns after one-hot encoding for training department: {y_train_department.columns}")
print(f"First few rows of the department encoding (train):\n{y_train_department.head()}")

# Check the types and shape of your target variables
print(f"Data type of y_train_department: {y_train_department.dtypes}")
print(f"Shape of y_train_department: {y_train_department.shape}")

# Ensure the 'Attrition' column is in the correct format
print(f"Data type of y_train_attrition: {y_train['Attrition'].dtype}")
print(f"Data type of X_train['Attrition']: {X_train['Attrition'].dtype}")
print(f"First few values of y_train['Attrition']:\n{y_train['Attrition'].head()}")

Columns after one-hot encoding for training department: Index(['Research & Development', 'Sales'], dtype='object')
First few rows of the department encoding (train):
      Research & Development  Sales
1343                    True  False
1121                   False   True
1048                   False   True
1393                   False   True
527                    False   True
Data type of y_train_department: Research & Development    bool
Sales                     bool
dtype: object
Shape of y_train_department: (1102, 2)
Data type of y_train_attrition: object
Data type of X_train['Attrition']: int64
First few values of y_train['Attrition']:
1343    No
1121    No
1048    No
1393    No
527     No
Name: Attrition, dtype: object


In [352]:
# Convert boolean department columns to integer (0 or 1)
y_train_department = y_train_department.astype(int)
y_test_department = y_test_department.astype(int)

# Check that department columns are now integers
print(f"Data type of y_train_department after conversion: {y_train_department.dtypes}")

# Convert 'Attrition' column from categorical ('Yes', 'No') to numeric (1, 0)
y_train_attrition = y_train['Attrition'].map({'Yes': 1, 'No': 0}).astype(int)
y_test_attrition = y_test['Attrition'].map({'Yes': 1, 'No': 0}).astype(int)

# Check the conversion
print(f"Data type of y_train_attrition after conversion: {y_train_attrition.dtype}")
print(f"First few values of y_train_attrition:\n{y_train_attrition.head()}")


Data type of y_train_department after conversion: Research & Development    int64
Sales                     int64
dtype: object
Data type of y_train_attrition after conversion: int64
First few values of y_train_attrition:
1343    0
1121    0
1048    0
1393    0
527     0
Name: Attrition, dtype: int64


In [341]:
print(f"X_train shape: {X_train.shape}")


X_train shape: (1102, 11)


In [353]:
# Separate target columns
y_train_department = y_train['Department']
y_train_attrition = y_train['Attrition']

y_test_department = y_test['Department']
y_test_attrition = y_test['Attrition']

In [354]:
import pandas as pd
from tensorflow.keras.utils import to_categorical

# Convert the 'Attrition' column to binary (1 for 'Yes' and 0 for 'No')
X_train['Attrition'] = X_train['Attrition'].map({'Yes': 1, 'No': 0})
X_test['Attrition'] = X_test['Attrition'].map({'Yes': 1, 'No': 0})

# One-hot encode the 'Department' column for multi-class classification
y_train_department = pd.get_dummies(y_train['Department'], drop_first=True)
y_test_department = pd.get_dummies(y_test['Department'], drop_first=True)

# Convert the 'Attrition' column into a numeric format (for binary classification)
y_train_attrition = y_train['Attrition'].map({'Yes': 1, 'No': 0}).values
y_test_attrition = y_test['Attrition'].map({'Yes': 1, 'No': 0}).values


In [355]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Define the input layer
input_layer = Input(shape=(X_train.shape[1],))

# Add hidden layers
x = Dense(64, activation='relu')(input_layer)
x = Dense(64, activation='relu')(x)

# Define the output for 'Attrition' (binary classification)
attrition_output = Dense(1, activation='sigmoid', name='attrition_output')(x)

# Define the output for 'Department' (multi-class classification)
department_output = Dense(y_train_department.shape[1], activation='softmax', name='department_output')(x)

# Create the model
model = Model(inputs=input_layer, outputs=[attrition_output, department_output])

# Summary of the model architecture
model.summary()


In [356]:
model.compile(
    loss={'attrition_output': 'binary_crossentropy', 'department_output': 'categorical_crossentropy'},
    optimizer='adam',
    metrics={'attrition_output': 'accuracy', 'department_output': 'accuracy'}
)


In [365]:
# Train the model
history = model.fit(
    X_train,  # Features
    {'attrition_output': y_train_attrition, 'department_output': y_train_department},  # Labels for each output
    epochs=20,  # Number of epochs
    batch_size=32,  # Number of samples per gradient update
    validation_split=0.2,  # Use 20% of data for validation
    verbose=1  # Print training progress
)


Epoch 1/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_accuracy: 0.8497 - attrition_output_loss: nan - department_output_accuracy: 0.7077 - department_output_loss: nan - loss: nan - val_attrition_output_accuracy: 0.7873 - val_attrition_output_loss: nan - val_department_output_accuracy: 0.6516 - val_department_output_loss: nan - val_loss: nan
Epoch 2/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - attrition_output_accuracy: 0.8537 - attrition_output_loss: nan - department_output_accuracy: 0.7069 - department_output_loss: nan - loss: nan - val_attrition_output_accuracy: 0.7873 - val_attrition_output_loss: nan - val_department_output_accuracy: 0.6516 - val_department_output_loss: nan - val_loss: nan
Epoch 3/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - attrition_output_accuracy: 0.8402 - attrition_output_loss: nan - department_output_accuracy: 0.6967 - department_output_loss: nan 

In [364]:
# Evaluate the model with the testing data
loss, attrition_loss, department_loss, attrition_acc, department_acc = model.evaluate(
    X_test,
    {'attrition_output': y_test_attrition, 'department_output': y_test_department}
)
print(f'Attrition Accuracy: {attrition_acc}, Department Accuracy: {department_acc}')


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - attrition_output_accuracy: 0.8492 - attrition_output_loss: nan - department_output_accuracy: 0.6916 - department_output_loss: nan - loss: nan
Attrition Accuracy: 0.8695651888847351, Department Accuracy: 0.7010869383811951


In [359]:
predictions = model.predict(X_test)
attrition_preds = predictions[0]  # Output for 'Attrition'
department_preds = predictions[1]  # Output for 'Department'


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [361]:
# Print the accuracy for both department and attrition
print(f"Attrition Accuracy: {attrition_acc:.4f}")
print(f"Department Accuracy: {department_acc:.4f}")

Attrition Accuracy: 0.8696
Department Accuracy: 0.7011


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. 
Accuracy might not be the most reliable metric, especially for the attrition prediction task. If the dataset is imbalanced, meaning there are significantly more employees who stayed than those who left, a model could appear to perform well by mostly predicting the majority class. In such cases, precision, recall, and the F1 score provide a clearer picture of how well the model is identifying actual attrition cases. Additionally, a confusion matrix can help visualize the true and false predictions. For department prediction, accuracy can be more meaningful if the classes are balanced, but macro F1 may still provide more nuance.

2. 
For the attrition output, a sigmoid activation function was used because it is ideal for binary classification tasks. It produces an output between 0 and 1, which can be interpreted as the probability of an employee leaving. For the department output, a softmax activation was chosen since the task involves multiple distinct classes. Softmax converts the final layer's outputs into a probability distribution over the department categories. This allows the model to predict the most likely department while still giving insight into alternative possibilities.

3. 
There are several ways the model could be enhanced. First, addressing class imbalance using techniques like oversampling, undersampling, or class weighting could improve the model’s ability to detect attrition cases. Second, tuning hyperparameters such as learning rate, number of layers, or batch size may yield better results. Additionally, adding regularization methods like dropout or using more advanced optimizers might improve generalization. Lastly, evaluating the model using more robust metrics and visualization tools can help identify weak points and guide improvements.