In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
from sklearn.cross_validation import train_test_split
from sklearn.metrics.scorer import make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.ensemble import GradientBoostingClassifier
%matplotlib inline

In [80]:
df = pd.read_csv('../hackerrank_data/AWEmployees.csv')
# print df.head()
# print df.info()

In [61]:
y = df.pop('SalariedFlag').apply(lambda x: 1 if x==True else 0) #create the target variable

In [62]:
# print len(df.ModifiedDate .unique()) #this code helped me understand which of the original variables had no variation and varied completely...I dropped these variables
df.drop(['BusinessEntityID', 'NationalIDNumber', 'LoginID', 'OrganizationNode', 'CurrentFlag', 'rowguid', 'ModifiedDate'], 1, inplace=True)

In [63]:
df.BirthDate = pd.to_datetime('2016-12-17') - pd.to_datetime(df.BirthDate) #I converted the dates into panda datetime variables, and then took the difference between it and today's date in order to get a continuous variable
df.HireDate = pd.to_datetime('2016-12-17') - pd.to_datetime(df.HireDate)

df.BirthDate = (df.BirthDate / np.timedelta64(1, 'D')).astype(int) #I convert the timedelta variable into an int
df.HireDate = (df.HireDate / np.timedelta64(1, 'D')).astype(int)

In [65]:
# Here I create dummies out of the categorical variables
df_en = pd.concat([df, pd.get_dummies(df['OrganizationLevel'], drop_first=True), pd.get_dummies(df['JobTitle'], drop_first=True), pd.get_dummies(df['MaritalStatus'], drop_first=True), pd.get_dummies(df['Gender'], drop_first=True)], axis=1)
df_en.drop(['OrganizationLevel', 'JobTitle', 'MaritalStatus', 'Gender'], 1, inplace=True)

In [66]:
X = df_en # Define my matrix of features

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 23) #Create training and test subsamples
# print X_train.describe()

## Method 1: Elastic Net
The first method I use is logistic regression. The two hyperparameters I tune are (1) lambda (the coefficient on the penalty term) and (2) the elastic net mixing parameter. I also normalize the data since this algorithm uses a distance metric.

In [70]:
# This is the scoring metric...I use accuracy (ratio of correct predictions to total observations). The built-in scorer is R^2, but I want the training to be done in terms of accuracy.
def scorer(y_actual, y_pred):
    return np.mean(np.where(y_actual==np.where(y_pred>.5, 1, 0), 1, 0))

In [77]:
en = ElasticNet(normalize=True)

score = make_scorer(scorer, greater_is_better=True)

param_dict = {'alpha': [0.0000000001, 0.0000005, 0.0000075, 0.000001, 0.000005, 0.0001, 0.001, 0.01, 0.1, .25, .5, .75, 1],
    'l1_ratio': [0, 0.0002, .45, .475, .5, .525, .55, .95, 1]}
gsCV_en = GridSearchCV(en, param_dict, n_jobs = -1, scoring=score)
gsCV_en.fit(X_train, y_train)

print gsCV_en.best_params_
print gsCV_en.best_score_

y_pred = gsCV_en.predict(X_test)
print '\nElastic Net accuracy on test data: {}'.format(scorer(y_test, y_pred))

{'alpha': 7.5e-06, 'l1_ratio': 0}
0.92118226601

Elastic Net accuracy on test data: 0.942528735632


The optimal 'l1_ratio' parameter is 0 which maybe means the penalty (represented by alpha) is just too strong for the lasso dimension of the regression so all of the coefficients are getting zeroed. This would be something to explore further with more time. 

## Method 2: Gradient Boosting
The second classifier I use is gradient boost. The two hyperparameters I tune are the learning rate (step size) and the number of estimators (number of stumps). 

In [81]:
gbc = GradientBoostingClassifier()

param_dict = {'learning_rate': [0.00001, 0.00005, 0.0001, 0.001, 0.01, 0.1, .25, .5, .75, 1],
    'n_estimators': [50, 75, 100, 500, 1000]}
gsCV_gbc = GridSearchCV(gbc, param_dict, n_jobs = -1, scoring=score)
gsCV_gbc.fit(X_train, y_train)

print gsCV_gbc.best_params_
print gsCV_gbc.best_score_

y_pred = gsCV_gbc.predict(X_test)
print '\nGradient Boost accuracy on test data: {}'.format(scorer(y_test, y_pred))

{'n_estimators': 500, 'learning_rate': 0.25}
0.916256157635

Gradient Boost accuracy on test data: 0.988505747126


The optimal step size is 0.25 and the optimal number of iterations is 500.

## Explanations and Conclusions 
For both methods I performed a grid search with k-fold (k=3 in this case) cross-validation in order to tune the hyperparamters for each method. The levels of the hyperparameters that scored the best were an improvement from the baseline model since I included the baseline parameters in the search. The measure of performance I used was accuracy. I did notice that only about 18% of the observations were such that the salaryflag variable was equal to false. It might be worthwhile to consider other scoring metrics (if I only predicted True, then my accuracy would be 82%...in any case, the model was ostensibly an improvement on this baseline). 

In terms of overfitting, the k-folds cross-validation technique helps mitigate this potential problem. Likewise, the two approaches I use also help mitigate overfitting (the elasticnet imposes a penalty on the coefficients and the gradient boost, as an ensemble method, tends to be less prone to overfitting).  

After tuning the hyperparamters using the training set through cross-validation, I use the testing subsample to compare the two models (again, using accuracy as a measure of performance) and give an estimate of how the models would perform on a different, indepedent dataset taken from the same population. Based on these scores, the gradient boosting models is superior. 

Finally, below I train the entire dataset using the gradient boosting model with the best hyperparameters identified above. 

In [83]:
gbc = GradientBoostingClassifier(n_estimators = 500, learning_rate = 0.25)
gbc.fit(X, y)

GradientBoostingClassifier(init=None, learning_rate=0.25, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)