In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math, random, os
from scipy import stats
# Apply seaborn's default theme globally so every later matplotlib figure uses it.
sns.set()

In [2]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import auc, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor

from sklearn.manifold import Isomap, LocallyLinearEmbedding

from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier

In [3]:
# Load the training set and turn the education label into its trailing
# integer (the text after the last underscore).
df = pd.read_csv(r'./train.csv')
df['Education_level'] = df['Education_level'].map(
    lambda level: int(level.rsplit('_', 1)[-1])
)

# Categorical features: every object-dtype column, plus 'gender', which is
# added explicitly because it is not picked up by the dtype test.
cat_columns = [name for name, dtype in df.dtypes.items() if dtype == np.dtype('O')]
cat_columns.append('gender')

# Everything else is treated as numeric. The label is taken to be the LAST
# of those columns, so this relies on the column order of train.csv.
num_columns = [name for name in df.columns if name not in cat_columns]
target_column = num_columns.pop()

In [4]:
# Rebalance the classes by duplicating the positive rows.
#
# NOTE(review): the train/test split in the next cell runs on this oversampled
# frame, so copies of the same positive row can end up in both splits and the
# held-out metrics are likely optimistic. Oversampling only the training part
# after splitting would avoid that.
# NOTE(review): this cell rebinds `df`, so running it twice oversamples twice.

# Positional row numbers of the negative (0) and positive (1) examples.
id0 = np.flatnonzero((df[target_column] == 0).to_numpy())
id1 = np.flatnonzero((df[target_column] == 1).to_numpy())
i0, i1 = id0.size, id1.size

# Number of extra copies of the positive rows to add: one less than the
# whole-number ratio of negatives to positives.
c1 = math.floor(i0/i1)-1

positive_rows = df.iloc[id1]
df1 = df.copy()
for _ in range(c1):
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    df1 = pd.concat([df1, positive_rows], ignore_index=True)
    print(df1.shape)
df = df1

# Add one hand-written row labelled 0 at the next free index label.
# NOTE(review): presumably this injects otherwise-unseen categories (e.g.
# 'JG06' / 'PG07') so the one-hot encoder knows them - confirm, and prefer
# OneHotEncoder(handle_unknown='ignore') over fabricating a training example.
# Assigning a full row this way also upcasts integer columns to float.
df.loc[df.shape[0], :] = ['JG06', 0.8185352771872451, 'PG07', 0.8185352771872451,
        1.47309198626562, 'RM_type_A', 1, 1969, 'Y', 3, 4, 3.28, 2014,
        29, 13, 10, 4, 8, 0, 118.31, 3.0, 0]
df

(12790, 22)
(14427, 22)
(16064, 22)
(17701, 22)


Unnamed: 0,job_level,job_duration_in_current_job_level,person_level,job_duration_in_current_person_level,job_duration_in_current_branch,Employee_type,gender,age,marital_status_maried(Y/N),number_of_dependences,...,year_graduated,job_duration_from_training,branch_rotation,job_rotation,assign_of_otherposition,annual leave,sick_leaves,Last_achievement_%,Achievement_above_100%_during3quartal,Best Performance
0,JG04,1.352775,PG03,1.352775,1.732051,RM_type_A,2.0,1985.0,Y,1.0,...,2011.0,4.0,2.0,3.0,2.0,3.0,0.0,46.37,0.0,0.0
1,JG04,1.292285,PG03,1.292285,1.039230,RM_type_A,2.0,1989.0,Y,0.0,...,2007.0,4.0,4.0,3.0,0.0,0.0,1.0,47.68,0.0,0.0
2,JG05,2.565151,PG06,2.308679,1.780449,RM_type_C,1.0,1970.0,Y,1.0,...,1989.0,28.0,10.0,9.0,6.0,2.0,1.0,40.73,0.0,0.0
3,JG05,2.828427,PG06,1.000000,1.385641,RM_type_A,2.0,1967.0,Y,2.0,...,1987.0,29.0,11.0,6.0,0.0,4.0,10.0,47.42,0.0,1.0
4,JG05,2.828427,PG06,2.828427,0.707107,RM_type_A,2.0,1965.0,Y,3.0,...,1985.0,30.0,9.0,8.0,2.0,3.0,0.0,47.18,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17697,JG04,1.352775,PG03,1.352775,0.412311,RM_type_A,2.0,1988.0,Y,1.0,...,2016.0,4.0,5.0,3.0,4.0,8.0,0.0,111.32,2.0,1.0
17698,JG04,1.555635,PG03,1.555635,1.870829,RM_type_A,2.0,1982.0,Y,1.0,...,2012.0,5.0,3.0,2.0,0.0,3.0,0.0,91.43,3.0,1.0
17699,JG04,1.385641,PG03,1.385641,1.118034,RM_type_A,2.0,1982.0,Y,2.0,...,2010.0,7.0,4.0,4.0,3.0,3.0,7.0,104.69,3.0,1.0
17700,JG04,2.828427,PG04,1.224745,0.412311,RM_type_A,2.0,1978.0,Y,2.0,...,2005.0,15.0,10.0,8.0,6.0,8.0,1.0,60.10,1.0,1.0


In [5]:
# Separate the label from the features, then hold out a stratified 25% of the
# rows for evaluation (fixed seed so the split is repeatable).
# Disabled alternative kept from the original (undersample the negatives):
#   df = df.loc[np.hstack((id0[:2000],id1)),:]
# NOTE(review): `df` was oversampled before this split, so duplicated rows may
# appear on both sides of it.
y = df[target_column].copy()
X = df.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)

In [6]:
# Numeric features: fill gaps with the column median, then rescale with
# MinMaxScaler.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('scaler', MinMaxScaler())
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' makes a category that was not present when
# the encoder was fitted encode as all zeros instead of raising at
# predict time (e.g. on the held-out split or on test.csv).
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown = 'ignore'))
])

# Route each column list to its pipeline. The transformer is deliberately not
# fitted here; it is fitted as the first step of each model pipeline.
preprocessor = ColumnTransformer([
    ('numerical', num_pipe, num_columns),
    ('categorical', cat_pipe, cat_columns)
])

In [7]:
# Candidate models. Each pipeline gets its OWN unfitted clone of the
# preprocessor: passing the same object to all three would mean that fitting
# one pipeline silently refits the transformer used by the others.

# Extra-trees ensemble.
# Original note here read "(20,2)" - presumably (n_estimators,
# min_samples_split); confirm.
ETC = Pipeline([
    ('prep', clone(preprocessor)),
    ('algo', ExtraTreesClassifier(n_estimators=20, max_depth=None,
                           min_samples_split=2, random_state=0, bootstrap=False))
])

# Bagged single-hidden-layer MLPs. random_state on the bagging wrapper seeds
# both the resampling and the wrapped estimators, so results are repeatable
# like ETC's.
BCMLP = Pipeline([
    ('prep', clone(preprocessor)),
    ('algo', BaggingClassifier(MLPClassifier(hidden_layer_sizes= (30,), activation='logistic',
                           solver = 'adam', max_iter = 100, alpha=0.1),
                               n_estimators=10, random_state=0))
])

# Bagged logistic regressions, seeded the same way.
BCLR = Pipeline([
    ('prep', clone(preprocessor)),
    ('algo', BaggingClassifier(LogisticRegression(),
                               n_estimators=10, random_state=0))
])

In [8]:
# Train the chosen pipeline on the training split and summarise its errors on
# the held-out split as a confusion matrix.
# (A 4-fold cross_val_score(model, X_train, y_train, cv=4) check was left
# disabled in the original.)
model = ETC
model.fit(X_train, y_train)
held_out_predictions = model.predict(X_test)
confusion_matrix(y_test, held_out_predictions)

array([[2362,   18],
       [  10, 2036]], dtype=int64)

In [9]:
# Load the unlabeled test set and apply the same education-level conversion
# used for the training data: keep the integer after the last underscore.
df_test = pd.read_csv(r'./test.csv')
df_test['Education_level'] = df_test['Education_level'].map(
    lambda level: int(level.rsplit('_', 1)[-1])
)
df_test.head()

Unnamed: 0,job_level,job_duration_in_current_job_level,person_level,job_duration_in_current_person_level,job_duration_in_current_branch,Employee_type,gender,age,marital_status_maried(Y/N),number_of_dependences,...,GPA,year_graduated,job_duration_from_training,branch_rotation,job_rotation,assign_of_otherposition,annual leave,sick_leaves,Last_achievement_%,Achievement_above_100%_during3quartal
0,JG04,1.256981,PG03,1.256981,0.707107,RM_type_A,1,1988,N,0,...,3.08,2009,6,4,2,6,3,7,84.02,0.0
1,JG04,1.224745,PG03,1.224745,1.256981,RM_type_B,2,1991,N,0,...,3.31,2014,2,2,2,0,1,0,63.74,0.0
2,JG04,0.5,PG03,0.5,1.081665,RM_type_A,2,1989,Y,1,...,3.44,2011,4,2,2,0,2,1,88.19,1.0
3,JG04,1.256981,PG03,1.256981,1.802776,RM_type_A,2,1990,Y,1,...,0.0,2014,3,1,3,0,3,0,25.94,0.0
4,JG04,1.352775,PG03,1.352775,1.224745,RM_type_B,1,1985,Y,0,...,3.34,2007,5,3,2,0,3,0,82.32,0.0


In [10]:
# Predict class labels for the test set with the fitted extra-trees pipeline.
# The previous line, `ETC.predict_proba[:,1]`, subscripted the bound method
# instead of calling it and raised TypeError. Hard labels are used because the
# submission cell casts this column to int; if probabilities are wanted, use
# ETC.predict_proba(df_test)[:, 1] and drop that cast, which would truncate them.
# NOTE(review): `y` also names the training target defined earlier; it is
# reused here only because the next cell reads `y`.
y = ETC.predict(df_test)
y

array([0., 0., 0., ..., 0., 1., 0.])

In [11]:
# Attach the predictions to the test frame under the label column's name and
# display the result.
df_test = df_test.assign(**{target_column: y})
df_test

Unnamed: 0,job_level,job_duration_in_current_job_level,person_level,job_duration_in_current_person_level,job_duration_in_current_branch,Employee_type,gender,age,marital_status_maried(Y/N),number_of_dependences,...,year_graduated,job_duration_from_training,branch_rotation,job_rotation,assign_of_otherposition,annual leave,sick_leaves,Last_achievement_%,Achievement_above_100%_during3quartal,Best Performance
0,JG04,1.256981,PG03,1.256981,0.707107,RM_type_A,1,1988,N,0,...,2009,6,4,2,6,3,7,84.02,0.0,0.0
1,JG04,1.224745,PG03,1.224745,1.256981,RM_type_B,2,1991,N,0,...,2014,2,2,2,0,1,0,63.74,0.0,0.0
2,JG04,0.500000,PG03,0.500000,1.081665,RM_type_A,2,1989,Y,1,...,2011,4,2,2,0,2,1,88.19,1.0,0.0
3,JG04,1.256981,PG03,1.256981,1.802776,RM_type_A,2,1990,Y,1,...,2014,3,1,3,0,3,0,25.94,0.0,0.0
4,JG04,1.352775,PG03,1.352775,1.224745,RM_type_B,1,1985,Y,0,...,2007,5,3,2,0,3,0,82.32,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,JG04,1.292285,PG03,1.292285,0.707107,RM_type_B,2,1988,Y,0,...,2015,2,2,2,0,4,0,58.60,0.0,0.0
5996,JG04,1.352775,PG03,1.352775,1.581139,RM_type_A,2,1989,Y,1,...,2011,5,3,2,7,3,0,90.64,3.0,0.0
5997,JG04,1.385641,PG03,1.385641,1.224745,RM_type_A,2,1991,Y,0,...,2013,3,2,3,0,16,0,13.02,0.0,0.0
5998,JG05,1.385641,PG05,1.385641,0.648074,RM_type_A,2,1980,Y,2,...,2003,14,7,6,8,2,0,82.26,0.0,1.0


In [12]:
# Build the submission table: only the predicted label, cast to integer, with
# the row index labelled 'index'.
submission = df_test[[target_column]].astype('int')
submission.index.name = 'index'
submission.head()

Unnamed: 0_level_0,Best Performance
index,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


In [13]:
# Write the submission (row index plus header row) to disk.
submission.to_csv(path_or_buf='sample_submission.csv', header=True, index=True)

In [14]:
# Sanity check: how many test rows were predicted in each class.
submission.loc[:, 'Best Performance'].value_counts()

0    5937
1      63
Name: Best Performance, dtype: int64