# Submission 1 - Machine Learning Terapan - Predictive Analytics

Name : Alfia N. Rakhmatika<br>
Dataset : IBM HR Analytics Employee Attrition & Performance https://www.kaggle.com/pavansubhasht/ibm-hr-analytics-attrition-dataset<br>
Problem : Classification

# Load Modules & Datasets

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import warnings

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier

# Install an "ignore" filter so warnings are not shown in the rest of the notebook.
warnings.simplefilter(action='ignore')
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# Install kaggle package
!pip install -q kaggle

In [None]:
# Upload file kaggle.json
# NOTE(review): google.colab is only importable inside a Colab runtime; this cell is not meant for local runs.
from google.colab import files
# The credential cell that follows copies kaggle.json from the working directory.
files.upload()

In [None]:
# Create directory and change permission
# -p: do not fail when ~/.kaggle already exists
!mkdir -p ~/.kaggle
# Copy the API token from the working directory into ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Make the token readable/writable by the owner only
!chmod 600 ~/.kaggle/kaggle.json
# List the directory to confirm the token is in place
!ls ~/.kaggle

In [None]:
# Download datasets fron kaggle using API command
!kaggle competitions download -c store-sales-time-series-forecasting

In [None]:
# Unzip dataset
!mkdir hr
!unzip -qq store-sales-time-series-forecasting.zip -d hr
!ls hr

In [None]:
# Colab run: read the CSV extracted under /content/hr.
# dfraw stays untouched; df is the working copy used by the rest of the notebook.
dfraw = pd.read_csv(
    '/content/hr/WA_Fn-UseC_-HR-Employee-Attrition.csv',
    sep=',',
)
df = dfraw.copy()
df.head()

# Local Data

In [None]:
# Local run: read the same CSV from the current working directory.
# Show every column whenever a frame is displayed.
pd.set_option('display.max_columns', None)

dfraw = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv', sep=',')
df = dfraw.copy()  # working copy; dfraw keeps the data as loaded
df.head()

# Exploratory Data Analysis

In [None]:
# Summary of the working frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# (number of rows, number of columns)
df.shape

Check Missing Values

In [None]:
# Count the missing (NaN) entries of each column.
print("Missing Values per column:")
df.isnull().sum()

Change Value for Several Columns

In [None]:
# Encode the flag columns as integers: 1 when the cell equals the positive
# label ('Yes' / 'Y'), 0 for anything else.
for flag_column, positive_label in (('Attrition', 'Yes'), ('OverTime', 'Yes'), ('Over18', 'Y')):
    df[flag_column] = df[flag_column].eq(positive_label).astype('int64')


Histogram

In [None]:
# One 30-bin histogram per numeric column, drawn on a 20x20 inch grid.
df.hist(
    figsize=(20, 20),
    bins=30,
    color='b',
)
# Ending the cell on this call also keeps the array of axes out of the cell output.
warnings.filterwarnings("ignore")

Unique Values

In [None]:
# Print the distinct values of every column, separated by a blank line.
for column_name, column_values in df.items():
    print(f'Unique values in column {column_name}:')
    print(column_values.unique())
    print()

Explore Descriptive Statistics for Each Target Class

In [None]:
# Split the rows by the encoded target: Attrition == 1 (originally 'Yes') vs Attrition == 0.
left_df = df.loc[df['Attrition'].eq(1)]
stayed_df = df.loc[df['Attrition'].eq(0)]

In [None]:
# Descriptive statistics for the rows with Attrition == 1.
left_df.describe()

In [None]:
# Descriptive statistics for the rows with Attrition == 0.
stayed_df.describe()

Heatmap

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still contains text columns at this point, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so the input is restricted to numeric dtypes first.
numeric_corr = df.select_dtypes(include='number').corr()

f, ax = plt.subplots(figsize=(20, 20))
# The trailing semicolon keeps the Axes repr out of the cell output.
sns.heatmap(numeric_corr, cmap='Blues', annot=True, fmt='.2f', ax=ax);

Boxplot of Monthly Income

In [None]:
# Horizontal boxplots: MonthlyIncome distribution for each JobRole.
plt.figure(figsize=(15, 10))
sns.boxplot(data=df, x='MonthlyIncome', y='JobRole')

Check for Target Imbalance

In [None]:
# Number of rows in each Attrition class (0 / 1 after the encoding step).
# Counting the target column directly avoids borrowing an unrelated column (Gender)
# as a row counter, and the result gets its own name instead of `target`, which the
# notebook later uses for the label column.
attrition_counts = df['Attrition'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x=attrition_counts.index, y=attrition_counts.values, ax=ax)

# Write the exact count just above each bar.
for bar in ax.patches:
    ax.annotate(
        '{}'.format(int(bar.get_height())),
        (bar.get_x() + bar.get_width() / 2, bar.get_height()),
        ha='center', va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )

# Leave headroom for the annotations relative to the tallest bar rather than
# relying on a fixed upper limit.
ax.set(ylim=(0, attrition_counts.max() * 1.2))
# The x axis holds the Attrition class.
ax.set(xlabel='Attrition', ylabel='Count');

# Data Preparation

Drop Columns with Single Unique Value and High Correlation

In [None]:
# Remove the columns the section heading singles out (single unique value / high
# correlation). NOTE(review): which column falls in which category is not shown here.
# Rebinding df with errors='ignore' lets this cell be re-run without a KeyError
# once the columns are already gone.
df = df.drop(
    columns=['EmployeeCount', 'StandardHours', 'Over18', 'EmployeeNumber', 'JobLevel'],
    errors='ignore',
)

One-Hot Encoding for Categorical Data

In [None]:
# Separate Categorical and Numerical Data
num_dtypes=['int64','float64']
cat_dtypes=['object']

num_df = df.select_dtypes(include = num_dtypes)
cat_df = df.select_dtypes(include = cat_dtypes)
numericals = num_df.columns
categoricals = cat_df.columns

print(numericals)
print(categoricals)

In [None]:
# One-hot encode all categorical columns in a single call.
# Each source column is replaced by indicator columns named <column>_<value>,
# appended after the remaining columns in the order of `categoricals`.
df = pd.get_dummies(df, columns=list(categoricals))

In [None]:
# First five rows after one-hot encoding.
df.head(5)

In [None]:
# (rows, columns) after one-hot encoding.
df.shape

Separate Features and Target Dataset

In [None]:
# Feature matrix: every column except the Attrition label.
feat = df.drop(columns=['Attrition'])

In [None]:
# Label vector: the Attrition column.
target = df['Attrition']

Scaling Data

In [None]:
# Min-max scale every feature column, learning each column's range from `feat`.
scaler = MinMaxScaler()
scaler.fit(feat)
X = scaler.transform(feat)

Train Test Split

In [None]:
# Hold out 20% of the rows for testing.
# random_state makes the split reproducible; stratify keeps the Attrition class
# proportions the same in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    feat, target, test_size=0.2, random_state=42, stratify=target
)

# Fit the scaler on the training rows only, so the test rows' min/max do not leak
# into preprocessing, then apply the same transform to both parts.
# Test values may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Model

Logistic Regression Model

In [None]:
# Logistic regression with the library's default settings, fitted on the training split.
logit = LogisticRegression()
logit.fit(X_train, y_train)

In [None]:
# Predict the test split and plot the confusion matrix.
y_pred = logit.predict(X_test)
# confusion_matrix takes (y_true, y_pred): rows are actual classes, columns predicted ones.
cm = confusion_matrix(y_test, y_pred)
# fmt='d' prints the integer counts as they are; the default format switches to
# scientific notation for counts of three or more digits.
sns.heatmap(cm, annot=True, fmt='d').set(xlabel='Predicted', ylabel='Actual');

In [None]:
# Per-class precision / recall / F1 for the most recent y_pred.
print(classification_report(y_test, y_pred))

In [None]:
# Headline metrics rounded to two decimals; precision, recall and F1 are macro-averaged.
macro = {'average': 'macro'}
summary_scores = [
    ("Accuracy : ", accuracy_score(y_test, y_pred)),
    ("Precision : ", precision_score(y_test, y_pred, **macro)),
    ("Recall : ", recall_score(y_test, y_pred, **macro)),
    ("F1 Score : ", f1_score(y_test, y_pred, **macro)),
]
for label, score in summary_scores:
    print(label, round(score, 2))

Random Forest

In [None]:
# Random forest classifier for this section (RandomForestClassifier is imported at
# the top of the notebook). random_state is fixed so the fitted forest is reproducible.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Predict the test split with the `rf` model and plot the confusion matrix.
y_pred = rf.predict(X_test)
# confusion_matrix takes (y_true, y_pred): rows are actual classes, columns predicted ones.
cm = confusion_matrix(y_test, y_pred)
# fmt='d' prints the integer counts as they are instead of scientific notation.
sns.heatmap(cm, annot=True, fmt='d').set(xlabel='Predicted', ylabel='Actual');

In [None]:
# Per-class precision / recall / F1 for the most recent y_pred.
print(classification_report(y_test, y_pred))

Artificial Neural Networks

In [None]:
# Fully connected binary classifier: three 500-unit ReLU layers (dropout of 0.2 after
# the second one) followed by a single sigmoid output unit.
# The input width is read from the training matrix rather than hard-coded, so the
# network keeps matching the data when the dropped or one-hot encoded columns change.
n_features = X_train.shape[1]

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(units=500, activation='relu', input_shape=(n_features, )))
model.add(tf.keras.layers.Dense(units=500, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(units=500, activation='relu'))
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
model.summary()

In [None]:
# Compile Model
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is recorded as a metric.
model.compile(optimizer='Adam', loss='binary_crossentropy', metrics = ['accuracy'])

In [None]:
# Train on the training split for 100 epochs with batches of 50 rows.
# `hist` is kept because its .history dict feeds the training-curve plots.
# NOTE(review): no validation data is passed, so only training metrics are recorded.
hist = model.fit(X_train, y_train, epochs = 100, batch_size = 50)

In [None]:
# Raw sigmoid outputs of the network for the test split.
y_prob = model.predict(X_test)

# Set threshold: outputs above 0.5 become True (class 1), everything else False (class 0).
y_pred = y_prob > 0.5

In [None]:
# Names of the series recorded during training (used as keys in the plots).
hist.history.keys()

In [None]:
# Training loss per epoch.
fig, ax = plt.subplots()
ax.plot(hist.history['loss'])
ax.set(
    title='Model Loss Progress During Training',
    xlabel='Epoch',
    ylabel='Training Loss',
)
ax.legend(['Training Loss'])

In [None]:
# Training accuracy per epoch.
fig, ax = plt.subplots()
ax.plot(hist.history['accuracy'])
ax.set(
    title='Model Accuracy Progress During Training',
    xlabel='Epoch',
    ylabel='Training Accuracy',
)
ax.legend(['Training Accuracy'])

In [None]:
# Testing Set Performance
# Rows are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
# fmt='d' prints the integer counts as they are; the default format switches to
# scientific notation for counts of three or more digits.
sns.heatmap(cm, annot=True, fmt='d').set(xlabel='Predicted', ylabel='Actual');

In [None]:
# Per-class precision / recall / F1 for the thresholded network predictions.
print(classification_report(y_test, y_pred))

# END