In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math, random, os
from scipy.stats import ttest_ind
sns.set()

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import auc, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.naive_bayes import GaussianNB

from sklearn.manifold import Isomap, LocallyLinearEmbedding

from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier

In [3]:
df = pd.read_csv(r'./train.csv')
df.loc[df.shape[0], :] = ['JG06', 0.8185352771872451, 'PG07', 0.8185352771872451,
            1.47309198626562, 'RM_type_A', 1, 1969, 'Y', 3, 'level_4', 3.28, 2014,
            29, 13, 10, 4, 8, 0, 118.31, 3.0, 0]

df['Education_level'] = df['Education_level'].apply(lambda x: int(x.split('_')[-1]))
df['person_level'] = df['person_level'].apply(lambda x: int(x.split('PG')[-1]))
df['job_level'] = df['job_level'].apply(lambda x: int(x.split('JG')[-1]))

cat_columns = [col for col in df.columns if df.dtypes[col]==np.dtype('O')]
cat_columns.extend(['gender',])
num_columns = [col for col in df.columns if col not in cat_columns]
target_column = num_columns.pop()

ii = (df['GPA']<2.0) | (df['GPA']>4.0) 
idx = [i for i in range(df.shape[0]) if ii[i]]
df.loc[idx, 'GPA'] = np.nan

cols = df.isna().sum()>0
for col in cols.index:
    if cols[col]>0:
        df[col] = df[col].replace(np.nan, df[col].mean())

df.head()

Unnamed: 0,job_level,job_duration_in_current_job_level,person_level,job_duration_in_current_person_level,job_duration_in_current_branch,Employee_type,gender,age,marital_status_maried(Y/N),number_of_dependences,...,year_graduated,job_duration_from_training,branch_rotation,job_rotation,assign_of_otherposition,annual leave,sick_leaves,Last_achievement_%,Achievement_above_100%_during3quartal,Best Performance
0,4,1.352775,3,1.352775,1.732051,RM_type_A,2.0,1985.0,Y,1.0,...,2011.0,4.0,2.0,3.0,2.0,3.0,0.0,46.37,0.0,0.0
1,4,1.292285,3,1.292285,1.03923,RM_type_A,2.0,1989.0,Y,0.0,...,2007.0,4.0,4.0,3.0,0.0,0.0,1.0,47.68,0.0,0.0
2,5,2.565151,6,2.308679,1.780449,RM_type_C,1.0,1970.0,Y,1.0,...,1989.0,28.0,10.0,9.0,6.0,2.0,1.0,40.73,0.0,0.0
3,5,2.828427,6,1.0,1.385641,RM_type_A,2.0,1967.0,Y,2.0,...,1987.0,29.0,11.0,6.0,0.0,4.0,10.0,47.42,0.0,1.0
4,5,2.828427,6,2.828427,0.707107,RM_type_A,2.0,1965.0,Y,3.0,...,1985.0,30.0,9.0,8.0,2.0,3.0,0.0,47.18,0.0,0.0


In [4]:
# ano = ['job_duration_from_training','branch_rotation']
# df[ano[0]].apply(f2).max()
df.GPA.describe()


count    11154.000000
mean         3.160919
std          0.217193
min          2.000000
25%          3.030000
50%          3.160919
75%          3.260000
max          3.970000
Name: GPA, dtype: float64

In [5]:
def f1(x):
    return x*np.log(x)
def f2(x):
    return np.exp(x)/x
def f3(x):
    return 1/x
def f4(x):
    return x
def f5(x):
    return x*x

fs = [f1, f3, f4,]
trans = dict()
for col in num_columns:
    pmin, ft = 0, None
    for f in fs:
        if df[col].min()<=0:
            if f is f1:
                print((col,f))
                continue
        if np.sum(df[col]==0)>0:
            if f in [f3,f2]:
                print((col,f))
                continue
        
        d0, d1 = df[df[target_column]==0][col].apply(f), df[df[target_column]==1][col].apply(f)
        _, p = ttest_ind(d0,d1)
        if p>pmin:
            pmin = p
            ft = f
    trans[col] = {'p': pmin, 'f': ft}
trans

('job_duration_in_current_job_level', <function f1 at 0x000002A89C3C5DC8>)
('job_duration_in_current_job_level', <function f3 at 0x000002A89C33C288>)
('job_duration_in_current_person_level', <function f1 at 0x000002A89C3C5DC8>)
('job_duration_in_current_person_level', <function f3 at 0x000002A89C33C288>)
('job_duration_in_current_branch', <function f1 at 0x000002A89C3C5DC8>)
('job_duration_in_current_branch', <function f3 at 0x000002A89C33C288>)
('number_of_dependences', <function f1 at 0x000002A89C3C5DC8>)
('number_of_dependences', <function f3 at 0x000002A89C33C288>)
('Education_level', <function f1 at 0x000002A89C3C5DC8>)
('Education_level', <function f3 at 0x000002A89C33C288>)
('assign_of_otherposition', <function f1 at 0x000002A89C3C5DC8>)
('assign_of_otherposition', <function f3 at 0x000002A89C33C288>)
('annual leave', <function f1 at 0x000002A89C3C5DC8>)
('annual leave', <function f3 at 0x000002A89C33C288>)
('sick_leaves', <function f1 at 0x000002A89C3C5DC8>)
('sick_leaves', <fu

{'job_level': {'p': 0.994105153807013, 'f': <function __main__.f1(x)>},
 'job_duration_in_current_job_level': {'p': 0.45849000938230133,
  'f': <function __main__.f4(x)>},
 'person_level': {'p': 0.8297456540185715, 'f': <function __main__.f4(x)>},
 'job_duration_in_current_person_level': {'p': 0.08677309711839097,
  'f': <function __main__.f4(x)>},
 'job_duration_in_current_branch': {'p': 0.06861137639980186,
  'f': <function __main__.f4(x)>},
 'age': {'p': 0.04656313059277444, 'f': <function __main__.f1(x)>},
 'number_of_dependences': {'p': 0.2885645050256088,
  'f': <function __main__.f4(x)>},
 'Education_level': {'p': 0.18756941002590294, 'f': <function __main__.f4(x)>},
 'GPA': {'p': 0.8419574161808138, 'f': <function __main__.f3(x)>},
 'year_graduated': {'p': 0.3676190632060363, 'f': <function __main__.f1(x)>},
 'job_duration_from_training': {'p': 0.7040726742152347,
  'f': <function __main__.f3(x)>},
 'branch_rotation': {'p': 0.38729866921417333, 'f': <function __main__.f3(x)>},


In [6]:
# if col not in ['job_duration_from_training','branch_rotation']
num_columns = [col for col in num_columns if trans[col]['p']>0.1 ]
for col in num_columns:
    df[col] = df[col].apply(trans[col]['f'])
# df.isna().sum()
df.head()

Unnamed: 0,job_level,job_duration_in_current_job_level,person_level,job_duration_in_current_person_level,job_duration_in_current_branch,Employee_type,gender,age,marital_status_maried(Y/N),number_of_dependences,...,year_graduated,job_duration_from_training,branch_rotation,job_rotation,assign_of_otherposition,annual leave,sick_leaves,Last_achievement_%,Achievement_above_100%_during3quartal,Best Performance
0,5.545177,1.352775,3,1.352775,1.732051,RM_type_A,2.0,1985.0,Y,1.0,...,15296.445041,0.25,0.5,0.333333,2.0,3.0,0.0,0.021566,0.0,0.0
1,5.545177,1.292285,3,1.292285,1.03923,RM_type_A,2.0,1989.0,Y,0.0,...,15262.023472,0.25,0.25,0.333333,0.0,0.0,1.0,0.020973,0.0,0.0
2,8.04719,2.565151,6,2.308679,1.780449,RM_type_C,1.0,1970.0,Y,1.0,...,15107.225298,0.035714,0.1,0.111111,6.0,2.0,1.0,0.024552,0.0,0.0
3,8.04719,2.828427,6,1.0,1.385641,RM_type_A,2.0,1967.0,Y,2.0,...,15090.035529,0.034483,0.090909,0.166667,0.0,4.0,10.0,0.021088,0.0,1.0
4,8.04719,2.828427,6,2.828427,0.707107,RM_type_A,2.0,1965.0,Y,3.0,...,15072.847773,0.033333,0.111111,0.125,2.0,3.0,0.0,0.021195,0.0,0.0


In [7]:
def balanceDF(df, df1, target_column):
    id0 = np.array(list(range(0, df.shape[0])))[df1[target_column]==0]
    id1 = np.array(list(range(0, df.shape[0])))[df1[target_column]==1]
    i0, i1 = id0.size, id1.size
    c1 = math.floor(i0/i1)-1
    df1 = df.copy()
    for i in range(c1):
        df1 = df1.append([df.loc[id1,:]], ignore_index = True)
#         print(df1.shape)
    return df1

# df = balanceDF(df, df,target_column)
# df.head()
# print(df.shape)

In [8]:
X = df.drop(target_column, axis = 1)
y = df[target_column].copy()
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.25, stratify = y, random_state=1)

In [9]:
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('scaler', MinMaxScaler())
])
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('encoder', OneHotEncoder())
])
preprocessor = ColumnTransformer([
    ('numerical', num_pipe, num_columns),
    ('categorical', cat_pipe, cat_columns)
])
# preprocessor.fit(X)

In [10]:
ETC = Pipeline([
    ('prep', preprocessor),
    ('algo', ExtraTreesClassifier(n_estimators=100, max_depth=None, 
                           min_samples_split=2, random_state=0, bootstrap=False))
])#(20,2)
GNB = Pipeline([
    ('prep', preprocessor),
    ('algo', GaussianNB())
])#(20,2)
LR = Pipeline([
    ('prep', preprocessor),
    ('algo', LogisticRegression())
])#(20,2)
KNN = Pipeline([
    ('prep', preprocessor),
    ('algo', KNeighborsClassifier(n_neighbors=1,weights='uniform'))
])#(20,2)
MLP = Pipeline([
    ('prep', preprocessor),
    ('algo', MLPClassifier(hidden_layer_sizes= (100,50), activation='logistic',
                           solver = 'adam', max_iter = 100, alpha=0.01))
])

In [11]:
model = ETC #ETC
# scores = cross_val_score(model, X_train, y_train, cv=4)
# print(scores)
model.fit(X_train, y_train)
print(confusion_matrix(y_test, model.predict(X_test)))

[[2367   13]
 [ 408    1]]


In [12]:
df_test = pd.read_csv(r'./test.csv')

df_test['Education_level'] = df_test['Education_level'].apply(lambda x: int(x.split('_')[-1]))
df_test['person_level'] = df_test['person_level'].apply(lambda x: int(x.split('PG')[-1]))
df_test['job_level'] = df_test['job_level'].apply(lambda x: int(x.split('JG')[-1]))

df_test.head()

Unnamed: 0,job_level,job_duration_in_current_job_level,person_level,job_duration_in_current_person_level,job_duration_in_current_branch,Employee_type,gender,age,marital_status_maried(Y/N),number_of_dependences,...,GPA,year_graduated,job_duration_from_training,branch_rotation,job_rotation,assign_of_otherposition,annual leave,sick_leaves,Last_achievement_%,Achievement_above_100%_during3quartal
0,4,1.256981,3,1.256981,0.707107,RM_type_A,1,1988,N,0,...,3.08,2009,6,4,2,6,3,7,84.02,0.0
1,4,1.224745,3,1.224745,1.256981,RM_type_B,2,1991,N,0,...,3.31,2014,2,2,2,0,1,0,63.74,0.0
2,4,0.5,3,0.5,1.081665,RM_type_A,2,1989,Y,1,...,3.44,2011,4,2,2,0,2,1,88.19,1.0
3,4,1.256981,3,1.256981,1.802776,RM_type_A,2,1990,Y,1,...,0.0,2014,3,1,3,0,3,0,25.94,0.0
4,4,1.352775,3,1.352775,1.224745,RM_type_B,1,1985,Y,0,...,3.34,2007,5,3,2,0,3,0,82.32,0.0


In [13]:
for col in num_columns:
    df[col] = df[col].apply(trans[col]['f'])
# df.isna().sum()
df.head()

Unnamed: 0,job_level,job_duration_in_current_job_level,person_level,job_duration_in_current_person_level,job_duration_in_current_branch,Employee_type,gender,age,marital_status_maried(Y/N),number_of_dependences,...,year_graduated,job_duration_from_training,branch_rotation,job_rotation,assign_of_otherposition,annual leave,sick_leaves,Last_achievement_%,Achievement_above_100%_during3quartal,Best Performance
0,9.498493,1.352775,3,1.352775,1.732051,RM_type_A,2.0,1985.0,Y,1.0,...,147386.995303,4.0,2.0,3.0,2.0,3.0,0.0,46.37,0.0,0.0
1,9.498493,1.292285,3,1.292285,1.03923,RM_type_A,2.0,1989.0,Y,0.0,...,147020.947745,4.0,4.0,3.0,0.0,0.0,1.0,47.68,0.0,0.0
2,16.780989,2.565151,6,2.308679,1.780449,RM_type_C,1.0,1970.0,Y,1.0,...,145375.747436,28.0,10.0,9.0,6.0,2.0,1.0,40.73,0.0,0.0
3,16.780989,2.828427,6,1.0,1.385641,RM_type_A,2.0,1967.0,Y,2.0,...,145193.151537,29.0,11.0,6.0,0.0,4.0,10.0,47.42,0.0,1.0
4,16.780989,2.828427,6,2.828427,0.707107,RM_type_A,2.0,1965.0,Y,3.0,...,145010.5966,30.0,9.0,8.0,2.0,3.0,0.0,47.18,0.0,0.0


In [14]:
# y = ETC.predict(df_test)
# y
y = model.predict_proba(df_test)[:,1]

In [15]:
df_test[target_column] = y
submission = df_test[target_column].to_frame()#.astype('int')
submission.index.name = 'index'
submission.head()

Unnamed: 0_level_0,Best Performance
index,Unnamed: 1_level_1
0,0.23
1,0.19
2,0.31
3,0.27
4,0.26


In [16]:
submission.to_csv('submission(3).csv', header=True)

In [17]:
# submission['Best Performance'].value_counts()
np.sum(submission['Best Performance']>0.5)

0

In [18]:
df.describe()

Unnamed: 0,job_level,job_duration_in_current_job_level,person_level,job_duration_in_current_person_level,job_duration_in_current_branch,gender,age,number_of_dependences,Education_level,GPA,year_graduated,job_duration_from_training,branch_rotation,job_rotation,assign_of_otherposition,annual leave,sick_leaves,Last_achievement_%,Achievement_above_100%_during3quartal,Best Performance
count,11154.0,11154.0,11154.0,11154.0,11154.0,11154.0,11154.0,11154.0,11154.0,11154.0,11154.0,11154.0,11154.0,11154.0,11154.0,11154.0,11154.0,11154.0,11154.0,11154.0
mean,9.941605,1.433153,3.227542,1.34917,1.034686,1.735521,1985.835575,0.995697,3.808858,3.160919,147220.011404,6.279989,3.720997,3.506276,1.202977,3.664605,1.100054,72.239033,0.679279,0.146763
std,1.809532,0.431106,0.686127,0.324685,0.416725,0.441075,4.634749,0.881268,0.489327,0.217193,376.610097,5.037923,2.400569,1.818986,2.575158,2.652846,2.714423,23.032488,1.1075,0.353886
min,3.930813,0.0,1.0,0.0,0.0,1.0,1963.0,0.0,0.0,2.0,144736.84108,2.0,1.0,1.0,0.0,0.0,0.0,4.51,0.0,0.0
25%,9.498493,1.224745,3.0,1.224745,0.707107,1.0,1985.0,0.0,4.0,3.03,147112.444422,4.0,2.0,2.0,0.0,2.0,0.0,56.65,0.0,0.0
50%,9.498493,1.352775,3.0,1.352775,1.118034,2.0,1987.0,1.0,4.0,3.160919,147295.468206,5.0,3.0,3.0,0.0,3.0,0.0,71.685,0.0,0.0
75%,9.498493,1.414214,3.0,1.385641,1.224745,2.0,1989.0,2.0,4.0,3.26,147478.532532,6.0,4.0,4.0,1.0,5.0,1.0,88.1775,1.0,0.0
max,25.532116,2.95804,8.0,2.828427,2.677686,2.0,1997.0,7.0,5.0,3.97,148119.576416,36.0,22.0,15.0,29.0,21.0,77.0,130.0,3.0,1.0
