# Importing & Inspecting

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt

# preprocessing
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE, ADASYN

# selection and tuning
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict, KFold, train_test_split, GridSearchCV, learning_curve
from sklearn.pipeline import make_pipeline, Pipeline

# models
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier

# evaluation
from sklearn import metrics
from sklearn.metrics import confusion_matrix, plot_confusion_matrix



# Load the raw training and test sets from the working directory.
data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

# Show every column when a frame is displayed. The fully qualified option name
# is used because the bare "max_columns" pattern can match more than one
# option in newer pandas releases, which makes set_option raise OptionError.
pd.set_option("display.max_columns", None)

In [None]:
# Summary statistics of the test rows whose education level is 'level_5'.
data_test.loc[data_test['Education_level'] == 'level_5'].describe()

# Cleaning & Inspecting

In [None]:
# Simplify the column names. One shared mapping is applied to both frames so
# that train and test always end up with identical column names.

_column_renames = {
    'Employee_type': 'employee_type',
    'marital_status_maried(Y/N)': 'marital_status',
    'number_of_dependences': 'children',
    'Education_level': 'education_level',
    'GPA': 'gpa',
    'assign_of_otherposition': 'job_otherposition',
    'annual leave': 'annual_leave',
    'sick_leaves': 'sick_leave',
    'Last_achievement_%': 'last_achievement',
    'Achievement_above_100%_during3quartal': 'achievement_above_100',
    'Best Performance': 'performance',
}

for _frame in (data_train, data_test):
    _frame.rename(columns=_column_renames, inplace=True)

In [None]:
# data cleansing

def cleaning(data, reference_year=2020):
    """Clean a raw employee frame in place.

    Two columns are rewritten:

    * ``age`` holds a year of birth on input and is replaced by
      ``reference_year - age``.
    * ``gpa`` is recorded on several different scales. Each value is divided
      by the maximum of the scale it falls into:

      - education ``level_0`` / ``level_1``: ``<= 10`` -> ``/ 10``,
        ``<= 60`` -> ``/ 60``, otherwise ``/ 100``;
        values equal to 0 or above 100 are left untouched.
      - every other education level: ``<= 4`` -> ``/ 4``,
        ``<= 40`` -> ``/ 40``, otherwise ``/ 400``;
        values equal to 0 or above 400 are left untouched.

      Missing ``gpa`` values stay missing.

    The rescaling is vectorised, so it works for any index (not only the
    default 0..n-1 range) and does not write floats cell by cell into an
    integer column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``age``, ``education_level`` and ``gpa`` columns.
        Modified in place.
    reference_year : int, default 2020
        Year the age is computed against.

    Returns
    -------
    None
    """
    # 'age' describes the year of birth; turn it into an age.
    data['age'] = reference_year - data['age']

    gpa = data['gpa']
    # Rows graded on the 10 / 60 / 100 scales; all other rows use 4 / 40 / 400.
    low_scale = data['education_level'].isin(['level_0', 'level_1'])

    # np.select picks the first matching condition, so each tier only needs
    # its upper bound. The last condition of each group is the catch-all
    # (it also catches NaN, which then simply stays NaN after the division).
    divisor = np.select(
        [
            low_scale & (gpa <= 10),
            low_scale & (gpa <= 60),
            low_scale,
            ~low_scale & (gpa <= 4),
            ~low_scale & (gpa <= 40),
            ~low_scale,
        ],
        [10, 60, 100, 4, 40, 400],
        default=1,
    )

    # Zeros and values beyond the largest known scale are kept as they are.
    scale_limit = np.where(low_scale, 100, 400)
    untouched = (gpa == 0) | (gpa > scale_limit)

    data['gpa'] = gpa.where(untouched, gpa / divisor)

                
cleaning(data_train)
cleaning(data_test)

## fill the outlier from level_1
# Row 108 is overwritten with the median gpa of all 'level_1' training rows.
# Series.median() skips missing values, so a NaN gpa elsewhere in that group
# (nulls are only dropped further down) cannot turn the fill value into NaN.

data_train.loc[108, 'gpa'] = data_train.loc[data_train.education_level == 'level_1', 'gpa'].median()


## drop null values in data_train and renumber the remaining rows from 0

data_train.dropna(inplace=True)
data_train.reset_index(drop=True, inplace=True)

# concatenate train & test
# The test set has no target; np.nan is used because the capitalised np.NaN
# alias was removed in NumPy 2.0.

data_test['performance'] = np.nan

# Flag the origin of every row so the two sets can be separated again later.
data_train['train_test'] = 1
data_test['train_test'] = 0

all_data = pd.concat([data_train, data_test])


# EDA

In [None]:
# Separate the training columns by dtype: numerical vs. categorical.

numerical_data = data_train.select_dtypes(include=['int', 'float'])
categorical_data = data_train.select_dtypes(include=['object', 'bool'])

Numerical

In [None]:
# One 20-bin histogram per numerical column, titled with the column name.
for column_name, column_values in numerical_data.items():
    plt.hist(column_values, bins=20)
    plt.title(column_name)
    plt.show()

In [None]:
# One horizontal box plot per numerical column, titled with the column name.
for i in numerical_data.columns:
    # The series is passed as x= explicitly: recent seaborn releases no longer
    # interpret the first positional argument as x.
    sns.boxplot(x=numerical_data[i])
    plt.title(i)
    plt.show()

In [None]:
# Aggregate every numerical feature per performance class
# (pivot_table's default aggregation is used).
numerical_data.pivot_table(
    index='performance',
    values=[
        'job_duration_in_current_job_level',
        'job_duration_in_current_person_level',
        'job_duration_in_current_branch',
        'age',
        'children',
        'gpa',
        'year_graduated',
        'job_duration_from_training',
        'branch_rotation',
        'job_rotation',
        'job_otherposition',
        'annual_leave',
        'sick_leave',
        'last_achievement',
        'achievement_above_100',
    ],
)

In [None]:
# Absolute correlation of every numeric/boolean column with the target,
# strongest first. Non-numeric columns are excluded explicitly because
# DataFrame.corr() in pandas >= 2.0 no longer drops them silently and raises
# on string columns instead.
data_train.select_dtypes(include=['number', 'bool']).corr()['performance'].abs().sort_values(ascending=False)

Categorical

In [None]:
# One bar chart of category frequencies per categorical column.
for i in categorical_data.columns:
    # Count once and reuse for both axes.
    counts = categorical_data[i].value_counts()

    # x/y are passed by keyword: recent seaborn releases reject positional
    # data vectors. Plain values are used for the heights so the bars are
    # matched to the categories by position.
    sns.barplot(x=counts.index, y=counts.values).set_title(i)
    plt.show()

In [None]:
# Significance checks on categorical data: chi-square test of independence
# between each categorical column and the target.

from scipy.stats import chi2_contingency

categorical = data_train.select_dtypes(include=['object', 'bool'])

for i in categorical.columns:

    # Contingency table: one row per performance class, one column per
    # category of `i`; each cell counts the rows with a non-zero 'age'.
    pivoted = data_train.pivot_table(
        index='performance',
        columns=i,
        values='age',
        aggfunc=np.count_nonzero,
        fill_value=0,
    )
    print(pivoted)

    # Only the p-value of the test is reported.
    pval = chi2_contingency(pivoted)[1]

    print('\n pval for {} is {} \n\n'.format(i, pval))

# feature engineering & selection

In [None]:
# Normalize skewed distributions on numerical data: every listed column gets
# a 'norm_<name>' companion holding log(value + 1).

_skewed_columns = [
    'job_duration_in_current_branch',
    'age',
    'year_graduated',
    'job_duration_from_training',
    'branch_rotation',
    'job_rotation',
    'job_otherposition',
    'annual_leave',
    'sick_leave',
]

for _column in _skewed_columns:
    all_data['norm_' + _column] = np.log(all_data[_column] + 1)

In [None]:
# Recheck whether the transformation was useful: absolute correlation with the
# target on the training rows only, strongest first. Non-numeric columns are
# excluded explicitly because DataFrame.corr() in pandas >= 2.0 raises on
# string columns instead of skipping them.

all_data[all_data.train_test == 1].select_dtypes(include=['number', 'bool']).corr()['performance'].abs().sort_values(ascending=False)

In [None]:
# Drop the columns that have very low correlation with the target.

_low_correlation_columns = [
    'norm_age',
    'job_duration_in_current_branch',
    'sick_leave',
    'norm_branch_rotation',
    'norm_job_duration_from_training',
    'annual_leave',
    'norm_year_graduated',
    'year_graduated',
    'job_rotation',
    'job_otherposition',
    'job_duration_in_current_job_level',
    'norm_job_rotation',
    'achievement_above_100',
]

all_data = all_data.drop(columns=_low_correlation_columns)

In [None]:
# Annotated heatmap of the absolute pairwise correlations of the remaining
# numeric/boolean columns. String columns are excluded explicitly because
# DataFrame.corr() in pandas >= 2.0 raises on them instead of skipping them.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(all_data.select_dtypes(include=['number', 'bool']).corr().abs(), annot=True, ax=ax)

In [None]:
# Create dummies for the training rows only, to check feature selection.

feat_sel = all_data[all_data.train_test == 1]

# One-hot encode every object/bool column, then remove the target and the
# train/test flag from the feature matrix.
X_feat_sel = pd.get_dummies(
    feat_sel,
    columns=feat_sel.select_dtypes(include=['object', 'bool']).columns,
).drop(columns=['performance', 'train_test'])

y_feat_sel = feat_sel['performance']

In [None]:
# Feature selection insights: run recursive feature elimination once per
# estimator, keeping 10 features each time, and print what was selected.

models = [LogisticRegression(C=1, penalty='l2'), Lasso(alpha=1.0)]

for model in models:
    # n_features_to_select is passed by keyword: current scikit-learn
    # releases accept only the estimator positionally.
    rfe = RFE(model, n_features_to_select=10)
    fit = rfe.fit(X_feat_sel, y_feat_sel)

    print("Num Features: %s \n" % (fit.n_features_))
    print("Selected Features: %s \n" % (fit.support_))
    print("Feature Ranking: %s \n \n \n \n" % (fit.ranking_))

# PREPROCESSING

In [None]:
# One-hot encode every object/bool column of the combined frame.

_categorical_columns = all_data.select_dtypes(include=['object', 'bool']).columns
all_data = pd.get_dummies(all_data, columns=_categorical_columns)

In [None]:
# Separate the combined frame again using the train_test flag.

# Labelled rows (flag 1): keep the target, drop the flag.
all_data_cleaned = all_data[all_data.train_test == 1].drop(columns='train_test')

# Unlabelled rows (flag 0): features only. X is bound to the same object as
# X_submission.
X = all_data[all_data.train_test == 0].drop(columns=['performance', 'train_test'])
X_submission = X

In [None]:
# Hold out 20% of the labelled rows for testing (fixed seed).
# Note: this rebinds data_train / data_test to the two new partitions.
data_train, data_test = train_test_split(
    all_data_cleaned,
    test_size=0.2,
    random_state=10,
)

# Target and features of the hold-out partition.
y_test = data_test['performance']
X_test = data_test.drop(columns='performance')

In [None]:
# Oversample the training partition with ADASYN (the hold-out partition is
# left as it is).
X = data_train.drop(columns='performance')
y = data_train['performance']

# A fixed seed keeps the synthetic samples, and therefore every score
# computed downstream, reproducible between runs.
oversample = ADASYN(random_state=10)
X_train, y_train = oversample.fit_resample(X, y)


print(f'''shape of data before ADASYN: {X.shape}
shape of data after ADASYN: {X_train.shape}''')


# MODEL PREPARATION

In [None]:
# input classifiers here!
# Candidate models, keyed by the label used to look up their grid below.
# Insertion order matters: the tuning loop walks this dict in order and the
# last entry is the one left fitted afterwards.

classifiers = {
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Bagging": BaggingClassifier(),
    "Extra Trees Ensemble": ExtraTreesClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGB": xgb.XGBRFClassifier(random_state=12345, nthread=-1),
}


# input parameters here!
# Grid per classifier label. Every key carries the "classifier__" prefix
# because the estimator is tuned as the "classifier" step of a pipeline.
#
# "auto" is intentionally not listed for max_features: for these classifiers
# it is an alias of "sqrt" (and deprecated in newer scikit-learn), so listing
# both only fitted every such combination twice.
#
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` for AdaBoost/Bagging - adjust the keys if the installed version
# rejects them.

parameters = {
    "AdaBoost": {
        "classifier__base_estimator": [DecisionTreeClassifier(max_depth=depth) for depth in range(1, 6)],
        "classifier__n_estimators": [200],
        "classifier__learning_rate": [0.001, 0.01, 0.05, 0.1, 0.25, 0.50, 0.75, 1.0],
    },
    "Gradient Boosting": {
        "classifier__learning_rate": [0.15, 0.1, 0.05, 0.01, 0.005, 0.001],
        "classifier__n_estimators": [200],
        "classifier__max_depth": [2, 3, 4, 5, 6],
        "classifier__min_samples_split": [0.001, 0.01, 0.05, 0.10],
        "classifier__min_samples_leaf": [0.001, 0.01, 0.05, 0.10],
        "classifier__max_features": ["sqrt", "log2"],
        "classifier__subsample": [1],
    },
    "Bagging": {
        "classifier__base_estimator": [DecisionTreeClassifier(max_depth=depth) for depth in range(1, 6)],
        "classifier__n_estimators": [200],
        "classifier__max_features": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        "classifier__n_jobs": [-1],
    },
    "Extra Trees Ensemble": {
        "classifier__n_estimators": [200],
        "classifier__class_weight": [None, "balanced"],
        "classifier__max_features": ["sqrt", "log2"],
        "classifier__max_depth": [3, 4, 5, 6, 7, 8],
        "classifier__min_samples_split": [0.001, 0.01, 0.05, 0.10],
        "classifier__min_samples_leaf": [0.001, 0.01, 0.05, 0.10],
        "classifier__criterion": ["gini", "entropy"],
        "classifier__n_jobs": [-1],
    },
    "Random Forest": {
        "classifier__n_estimators": [200],
        "classifier__class_weight": [None, "balanced"],
        "classifier__max_features": ["sqrt", "log2"],
        "classifier__max_depth": [3, 4, 5, 6, 7, 8],
        "classifier__min_samples_split": [0.001, 0.01, 0.05, 0.10],
        "classifier__min_samples_leaf": [0.001, 0.01, 0.05, 0.10],
        "classifier__criterion": ["gini", "entropy"],
        "classifier__n_jobs": [-1],
    },
    "XGB": {
        'classifier__min_child_weight': [1, 5, 10],
        'classifier__gamma': [0.5, 1, 1.5, 2, 5],
        'classifier__subsample': [0.6, 0.8, 1.0],
        'classifier__colsample_bytree': [0.6, 0.8, 1.0],
        'classifier__max_depth': [3, 4, 5, 10, 12, 15],
        'classifier__learning_rate': [0.01, 0.02, 0.1, 0.25, 0.5],
    },
}
                                     

In [None]:
# Per-classifier summary, keyed by classifier label.
results = {}


# Tune and evaluate classifiers
for classifier_label, classifier in classifiers.items():
    # Print message to user
    print("Now tuning {} :".format(classifier_label))

    # Z-score scaling followed by the classifier under test.
    pipeline = Pipeline(steps=[("scaler", StandardScaler()), ("classifier", classifier)])

    # Exhaustive 5-fold search over this classifier's grid, selected on accuracy.
    gscv = GridSearchCV(pipeline, parameters[classifier_label], cv=5, n_jobs=-1,
                        verbose=1, scoring="accuracy", return_train_score=True)
    gscv.fit(X_train, y_train)

    # Best grid point and its mean cross-validated accuracy.
    best_params = gscv.best_params_
    best_score = gscv.best_score_

    # Strip the "classifier__" step prefix and store the tuned values on the
    # classifier object itself.
    prefix_length = len("classifier__")
    tuned_params = {name[prefix_length:]: value for name, value in best_params.items()}
    classifier.set_params(**tuned_params)

    # Hold-out predictions: hard labels for the threshold metrics, and the
    # probability of the second class for the AUC. ROC AUC needs a continuous
    # score - computing it from hard labels reduces the curve to one point.
    # NOTE(review): assumes the positive class is the second entry of
    # classes_ (labels 0/1) - confirm against the data.
    y_pred = gscv.predict(X_test)
    y_score = gscv.predict_proba(X_test)[:, 1]

    auc = metrics.roc_auc_score(y_test, y_score)
    f1 = metrics.f1_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    accuracy = metrics.accuracy_score(y_test, y_pred)

    # False Positive rate
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    fp_rate = (fp) / (fp + tn)

    # Create the figure first and draw into it, so the requested size applies
    # to the confusion matrix rather than to a separate empty figure.
    fig, ax = plt.subplots(figsize=(20, 20))
    plot_confusion_matrix(gscv, X_test, y_test, ax=ax)
    plt.show()

    # Mean validation vs. training accuracy for every grid point.
    test_scores = gscv.cv_results_['mean_test_score']
    train_scores = gscv.cv_results_['mean_train_score']

    plt.plot(test_scores, label='test')
    plt.plot(train_scores, label='train')
    plt.legend(loc='best')
    plt.show()

    # Save results. "Training Recall Score" is kept as a legacy key for
    # existing consumers; the value is the best mean CV accuracy (the search
    # is scored on accuracy), which is also stored under an accurate key.
    result = {"Classifier": classifier_label,
              "Best Parameters": best_params,
              "Training Recall Score": best_score,
              "Best CV Accuracy Score": best_score,
              "Test Recall Score": recall,
              "Test Precision Score": precision,
              "Test Accuracy Score": accuracy,
              "Test F1 Score": f1,
              "False Positive Rate": fp_rate,
              "Test AUC": auc}

    results.update({classifier_label: result})

    print(f'''Best Parameters       : {best_params} \n,
          Best CV Accuracy      : {best_score} \n,
          Test Recall Score     : {recall} \n,
          Test Precision Score  : {precision} \n,
          Test Accuracy Score   : {accuracy} \n,
          Test F1 Score         : {f1} \n,
          False Positive Rate   : {fp_rate} \n,
          Test AUC              : {auc}''')
    
      

In [None]:
# Class probabilities for the submission rows, from the grid search object
# left over by the tuning loop (i.e. the last classifier that was tuned).
y_submission = gscv.predict_proba(X_submission)

# Keep only column 1 of the probabilities, name it 'Best Performance', and
# expose the row position as an 'index' column.
submission = (
    pd.DataFrame(y_submission)[1]
    .rename('Best Performance')
    .reset_index()
)

submission.to_csv('submission_XGB.csv', index=False)