In [1]:
import warnings
# Install a catch-all 'ignore' filter: every warning category is silenced for the
# rest of the session, including deprecation warnings from the libraries below.
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve, classification_report

import h2o
from h2o.frame import H2OFrame
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch

%matplotlib inline

# Load Dataset

In [None]:
# Read the raw CSV; both date columns are parsed to datetimes at load time.
inputfilename = 'employee_retention_data.csv'
df = pd.read_csv(
    filepath_or_buffer=inputfilename,
    parse_dates=['join_date', 'quit_date'],
)
# Preview the first rows as the cell output.
df.head()

In [None]:
# Print the frame summary: column dtypes, non-null counts, and memory usage.
df.info()

In [None]:
# Cardinality overview: for each column print its name, the number of distinct
# values, and the first five of those values after sorting.
for column in df.columns:
    uniques = list(df[column].unique())
    uniques.sort()
    print('{0:20s} {1:5d}\t'.format(column, len(uniques)), uniques[:5])

In [None]:
# Summary statistics (count, mean, spread, quantiles) for the columns that
# describe() selects by default.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

# Remove Outliers

In [None]:
# Display the rows whose seniority exceeds 50 (the outlier candidates for this section).
df[df['seniority'] > 50]

In [None]:
# Keep every row that is NOT an outlier. Outliers are defined as seniority > 50,
# so the complement is seniority <= 50; a strict `< 50` would also silently drop
# rows with seniority exactly 50.
df = df[df['seniority'] <= 50]

# Check Uniqueness

In [None]:
# Rows whose (employee_id, company_id) pair has already appeared earlier in the
# frame; the first occurrence of each pair is not flagged (keep='first').
duplicateRowsDF = df.loc[
    df.duplicated(subset=['employee_id', 'company_id'], keep='first')
]
duplicateRowsDF

In [None]:
# Remove every row whose (employee_id, company_id) pair occurs more than once.
# keep=False discards ALL copies of a duplicated pair, including the first one.
# Reassign instead of using inplace=True: `df` was produced by a boolean-mask
# filter, and in-place mutation of such a frame is the pattern pandas warns about.
df = df.drop_duplicates(subset=['employee_id', 'company_id'], keep=False)

# Feature Engineering

# Exploratory Data Analysis

In [None]:
# NOTE(review): this cell plots 'country' and 'converted', but no other cell in
# this notebook references those columns (only employee_id, company_id,
# seniority, join_date and quit_date appear). Confirm they exist in the loaded
# CSV; the cell may have been carried over from a different analysis.
#
# Two side-by-side panels, one row by two columns.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
rate_ax, count_ax = ax

# Left panel: mean of 'converted' for each country (seaborn aggregates per bar).
sns.barplot(data=df, x='country', y='converted', ax=rate_ax)
rate_ax.set_title('Mean Conversion Rate per Country', fontsize=16)

# Right panel: row counts per country, split by 'converted'; log scale on y.
sns.countplot(data=df, x='country', hue='converted', ax=count_ax)
count_ax.set_title('Count Plot of Country', fontsize=16)
count_ax.set_yscale('log')

plt.tight_layout()
plt.show()

# Machine Learning