**Training**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
import catboost
from sklearn import preprocessing
%matplotlib inline

In [None]:
# Load the training data from the CSV file in the working directory.
dataset = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Preview the first five rows of the raw training frame.
dataset.head(n=5)

In [None]:
# Derive three features from the two year columns, using 2019 as the
# reference year: age at recruitment, current age, and length of service.
dataset = dataset.assign(
    RecruitedAge=dataset['Year_of_recruitment'] - dataset['Year_of_birth'],
    CurrentAge=2019 - dataset['Year_of_birth'],
    Length_of_service=2019 - dataset['Year_of_recruitment'],
)
dataset.head()

In [None]:
# States grouped by geopolitical zone. Spellings are the ones this notebook
# matches against the 'State_Of_Origin' column.
north_central = ['BENUE', 'KOGI', 'KWARA',
                 'NASSARAWA', 'NIGER', 'PLATEAU', 'FCT']
north_east = ['ADAMAWA', 'BAUCHI', 'BORNO', 'GOMBE',
              'TARABA', 'YOBE']
north_west = ['JIGAWA', 'KADUNA', 'KANO', 'KATSINA',
              'KEBBI', 'SOKOTO', 'ZAMFARA']
south_east = ['ABIA', 'ANAMBRA', 'EBONYI', 'ENUGU', 'IMO']
south_west = ['EKITI', 'LAGOS', 'OGUN', 'ONDO',
              'OSUN', 'OYO']
south_south = ['AKWA IBOM', 'BAYELSA', 'CROSS RIVER', 'RIVERS',
               'DELTA', 'EDO']

# Flatten the zone lists into a single state -> zone lookup.
state_to_zone = {}
for zone_name, zone_states in [
    ('North Central', north_central),
    ('North East', north_east),
    ('North West', north_west),
    ('South West', south_west),
    ('South South', south_south),
    ('South East', south_east),
]:
    for state_name in zone_states:
        state_to_zone[state_name] = zone_name

# Map every state present in the data to its zone. A state that is in no
# zone list is reported and mapped to None, so its 'Zones' value is missing.
geo_zone = {}
for key in dataset['State_Of_Origin'].unique():
    geo_zone[key] = state_to_zone.get(key)
    if geo_zone[key] is None:
        print("No zone found", key)

dataset['Zones'] = dataset['State_Of_Origin'].map(geo_zone)

In [None]:
# Attach, to every row, the mean training score of its
# (State_Of_Origin, Division) group, then express the row's own score
# relative to that group mean.
group_keys = ['State_Of_Origin', 'Division']
group_means = (
    dataset[['State_Of_Origin', 'Division', 'Training_score_average']]
    .groupby(group_keys)
    .mean()
)
dataset = dataset.merge(group_means, how='left', on=group_keys)

# The merge suffixes the clashing score column: _x is the row's own score,
# _y is the group mean.
dataset = dataset.rename(columns={
    'Training_score_average_x': 'Training_score_average',
    'Training_score_average_y': 'mean_org_div',
})
dataset['new_trng_score_avg'] = (
    dataset['Training_score_average'] / dataset['mean_org_div']
)
dataset.head()

In [None]:
# Column dtypes and non-null counts after feature engineering.
dataset.info(verbose=None)

In [None]:
# Summary statistics using describe()'s default column selection.
dataset.describe(include=None)

In [None]:
# Fill missing qualifications with the most frequent value.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: that form is chained assignment, which warns in recent
# pandas and leaves the frame unchanged under copy-on-write.
most_common_qualification = dataset["Qualification"].mode()[0]
dataset["Qualification"] = dataset["Qualification"].fillna(most_common_qualification)

# tr_data is another name for the same frame, not a copy.
tr_data = dataset
tr_data.info()

In [None]:
# Correlation heatmap over the numeric (and boolean) columns only.
# Calling DataFrame.corr() on a frame that still holds string columns raises
# in pandas >= 2.0, so select the numeric columns explicitly first.
plt.figure(figsize=(14,12))
numeric_corr = tr_data.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(numeric_corr.round(decimals=2), annot=True)
plt.title("Correlation heatmap");  # trailing ';' hides the Text(...) repr

In [None]:
# Feature matrix: everything except the target and the employee identifier.
cols = ['Promoted_or_Not', 'EmployeeNo']
X_train = tr_data.drop(columns=cols)

In [None]:
# Preview the first five rows of the feature matrix.
X_train.head(n=5)

In [None]:
# Target, kept as a single-column DataFrame (list selector, not a bare label).
col = ['Promoted_or_Not']
y_train = tr_data.loc[:, col]

In [None]:
# cat_cols = ['Division', 'Qualification', 'Gender', 'Channel_of_Recruitment', 
#              'Zones', 'State_Of_Origin',
#             'Marital_Status', 'Past_Disciplinary_Action',
#             'Previous_IntraDepartmental_Movement', 'Foreign_schooled']

# ce_target = ce.TargetEncoder(cols = cat_cols, smoothing = 5, return_df = True)
# X_train = ce_target.fit_transform(X_train, y_train)

In [None]:
# Integer-encode every object-dtype column, then one-hot encode those same
# columns (dropping the first level of each).
le = preprocessing.LabelEncoder()
columns = list(X_train.select_dtypes(include='object').columns)

# The single encoder is refit for each column in turn; after the loop it
# holds the classes of the last column only.
for column_name in columns:
    X_train[column_name] = le.fit_transform(X_train[column_name])

X_train = pd.get_dummies(X_train, drop_first=True, columns=columns)

In [None]:
# cat_features_index = np.where(X_train.dtypes != float)[0]


In [None]:
# cat_features_index

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.preprocessing import StandardScaler

# Hold out 15% of the rows for validation; the seed is fixed so the split is
# repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_train,
    y_train,
    train_size=0.85,
    random_state=1234,
)


In [None]:
from catboost import CatBoostClassifier
# model = CatBoostClassifier(iterations=500,depth=12, 
#                            learning_rate=0.2,
#                            eval_metric='F1', score_function='L2', leaf_estimation_iterations=2, 
#                            l2_leaf_reg=9,one_hot_max_size=50,use_best_model=True,
#                            early_stopping_rounds=20, random_seed=42)


# CatBoost classifier configuration, one keyword per line.
model = CatBoostClassifier(
    # boosting schedule
    iterations=400,
    learning_rate=0.2,
    depth=12,
    # evaluation / stopping
    eval_metric='F1',
    early_stopping_rounds=20,
    use_best_model=True,
    best_model_min_trees=3,
    # tree construction and sampling
    score_function='L2',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    sampling_frequency='PerTree',
    random_strength=0.3,
    leaf_estimation_iterations=2,
    fold_len_multiplier=1.8,
)

In [None]:
# Train on the training split; the held-out split is supplied as eval_set.
model.fit(X=xtrain, y=ytrain, eval_set=(xtest, ytest))

**Prediction**

In [None]:
# Load the data to predict on from the CSV file in the working directory.
test = pd.read_csv(filepath_or_buffer='test.csv')


In [None]:
# Derive three features from the two year columns, using 2019 as the
# reference year: age at recruitment, current age, and length of service.
test = test.assign(
    RecruitedAge=test['Year_of_recruitment'] - test['Year_of_birth'],
    CurrentAge=2019 - test['Year_of_birth'],
    Length_of_service=2019 - test['Year_of_recruitment'],
)

In [None]:
# States grouped by geopolitical zone. Spellings are the ones this notebook
# matches against the 'State_Of_Origin' column.
north_central = ['BENUE', 'KOGI', 'KWARA',
                 'NASSARAWA', 'NIGER', 'PLATEAU', 'FCT']
north_east = ['ADAMAWA', 'BAUCHI', 'BORNO', 'GOMBE',
              'TARABA', 'YOBE']
north_west = ['JIGAWA', 'KADUNA', 'KANO', 'KATSINA',
              'KEBBI', 'SOKOTO', 'ZAMFARA']
south_east = ['ABIA', 'ANAMBRA', 'EBONYI', 'ENUGU', 'IMO']
south_west = ['EKITI', 'LAGOS', 'OGUN', 'ONDO',
              'OSUN', 'OYO']
south_south = ['AKWA IBOM', 'BAYELSA', 'CROSS RIVER', 'RIVERS',
               'DELTA', 'EDO']

# Flatten the zone lists into a single state -> zone lookup.
state_to_zone = {}
for zone_name, zone_states in [
    ('North Central', north_central),
    ('North East', north_east),
    ('North West', north_west),
    ('South West', south_west),
    ('South South', south_south),
    ('South East', south_east),
]:
    for state_name in zone_states:
        state_to_zone[state_name] = zone_name

# Map every state present in the test data to its zone. A state that is in
# no zone list is reported and mapped to None, so its 'Zones' value is missing.
geo_zone = {}
for key in test['State_Of_Origin'].unique():
    geo_zone[key] = state_to_zone.get(key)
    if geo_zone[key] is None:
        print("No zone found", key)

test['Zones'] = test['State_Of_Origin'].map(geo_zone)

In [None]:
# Attach, to every test row, the mean training score of its
# (State_Of_Origin, Division) group (computed on the test frame itself),
# then express the row's own score relative to that group mean.
group_keys = ['State_Of_Origin', 'Division']
group_means = (
    test[['State_Of_Origin', 'Division', 'Training_score_average']]
    .groupby(group_keys)
    .mean()
)
test = test.merge(group_means, how='left', on=group_keys)

# The merge suffixes the clashing score column: _x is the row's own score,
# _y is the group mean.
test = test.rename(columns={
    'Training_score_average_x': 'Training_score_average',
    'Training_score_average_y': 'mean_org_div',
})
test['new_trng_score_avg'] = (
    test['Training_score_average'] / test['mean_org_div']
)
test.head()

In [None]:
# Fill missing qualifications with the most frequent value in the test frame.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: that form is chained assignment, which warns in recent
# pandas and leaves the frame unchanged under copy-on-write.
most_common_qualification = test["Qualification"].mode()[0]
test["Qualification"] = test["Qualification"].fillna(most_common_qualification)
test.info()

In [None]:
# Feature matrix for prediction: everything except the employee identifier.
cols = ['EmployeeNo']
X_test = test.drop(columns=cols)

In [None]:
# Encode the test categoricals with the TRAINING category codes and align the
# resulting columns to the ones the model was fitted on.
#
# Refitting the label encoder on the test frame and one-hot encoding it
# independently gives different integer codes and/or a different column set
# whenever the two frames do not contain exactly the same categories, so the
# model would be fed misaligned features.
columns = X_test.select_dtypes(include='object').columns.tolist()

for column in columns:
    # LabelEncoder numbers classes in sorted order, so sorting the training
    # values reproduces the codes used for X_train. A category not seen in
    # training gets code -1.
    train_classes = np.sort(tr_data[column].unique())
    X_test[column] = pd.Categorical(X_test[column], categories=train_classes).codes

# No drop_first here: the reindex below keeps exactly the training columns,
# which already exclude each first level, and discards any "_-1" columns
# produced by unseen categories. Training columns absent from the test data
# are added as all-zero.
X_test = pd.get_dummies(X_test, columns=columns)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Predict class labels for the test features.
pred = model.predict(X_test)
# np.int was removed in NumPy 1.24; the builtin int is the equivalent dtype.
# ravel() guarantees a 1-D array so it can be used as a DataFrame column even
# if predict returns a column vector.
pred = np.asarray(pred).astype(int).ravel()

In [None]:
# Pair each employee identifier with its predicted label.
submission = pd.DataFrame(
    data={
        'EmployeeNo': test['EmployeeNo'],
        'Promoted_or_Not': pred,
    }
)

In [None]:
# Write the submission file without the index column.
submission.to_csv(path_or_buf='DSN_Kaggle.csv', index=False)