# HR Analytics: Job Change of Data Scientists

### Predict who will move to a new job

https://www.kaggle.com/datasets/arashnic/hr-analytics-job-change-of-data-scientists

![image.png](../Images/Job.png)

주어진 데이터를 이용하여 예측 모형을 만들고 csv 파일로 저장하시오.

### Library & Data Import

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training features, test features and training labels in one pass.
X_train, X_test, y_train = map(
    pd.read_csv,
    (
        '../Datasets/Job_X_train.csv',
        '../Datasets/Job_X_test.csv',
        '../Datasets/Job_y_train.csv',
    ),
)

### 1. Data Exploration

In [3]:
# Display the training feature table (pandas truncates the middle rows).
X_train

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,6424,city_75,0.939,Male,Has,No,Graduate,STEM,10,50-99,Pvt Ltd,1,176
1,18304,city_64,0.666,Male,No,Full,High,,2,,,1,110
2,22515,city_103,0.920,Male,No,Full,High,,4,,,never,44
3,15878,city_16,0.910,Male,Has,No,Graduate,STEM,8,50-99,Early Stage Startup,1,82
4,22128,city_103,0.920,Male,Has,No,Graduate,STEM,6,10000+,Pvt Ltd,1,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10058,28053,city_21,0.624,Male,No,Full,Graduate,STEM,4,,,never,34
10059,28108,city_103,0.920,Male,Has,No,Phd,STEM,>20,,,1,72
10060,58,city_21,0.624,Male,Has,Full,High,,3,,Pvt Ltd,never,53
10061,11305,city_103,0.920,Male,No,Full,Graduate,STEM,6,,,1,166


In [4]:
# Display the test feature table; it has the same columns as X_train.
X_test

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,666,city_162,0.767,Male,Has,No,Masters,STEM,>20,50-99,Funded Startup,4,8
1,28806,city_160,0.920,Male,Has,No,High,,5,50-99,Funded Startup,1,24
2,5826,city_21,0.624,Male,No,,,,2,,,never,24
3,19061,city_114,0.926,Male,Has,No,Masters,STEM,11,100-500,Pvt Ltd,2,50
4,28476,city_103,0.920,Male,Has,No,Graduate,Arts,5,,,2,72
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4308,14257,city_21,0.624,Male,No,Full,Graduate,STEM,13,10/49,NGO,1,21
4309,6374,city_21,0.624,Male,Has,No,Graduate,Business Degree,14,<10,Pvt Ltd,1,87
4310,15133,city_160,0.920,Male,No,Part,Graduate,STEM,3,,,1,322
4311,27018,city_16,0.910,Male,Has,No,Graduate,STEM,8,<10,Pvt Ltd,3,78


In [5]:
# Display the training labels: one `target` value per enrollee_id.
y_train

Unnamed: 0,enrollee_id,target
0,6424,0
1,18304,0
2,22515,0
3,15878,0
4,22128,0
...,...,...
10058,28053,1
10059,28108,1
10060,58,1
10061,11305,1


In [6]:
# Column dtypes and non-null counts for the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10063 entries, 0 to 10062
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             10063 non-null  int64  
 1   city                    10063 non-null  object 
 2   city_development_index  10063 non-null  float64
 3   gender                  10063 non-null  object 
 4   relevent_experience     10063 non-null  object 
 5   enrolled_university     9921 non-null   object 
 6   education_level         9901 non-null   object 
 7   major_discipline        8740 non-null   object 
 8   experience              10063 non-null  object 
 9   company_size            7133 non-null   object 
 10  company_type            7048 non-null   object 
 11  last_new_job            10063 non-null  object 
 12  training_hours          10063 non-null  int64  
dtypes: float64(1), int64(2), object(10)
memory usage: 1022.1+ KB


In [7]:
# Column dtypes and non-null counts for the test features.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4313 entries, 0 to 4312
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             4313 non-null   int64  
 1   city                    4313 non-null   object 
 2   city_development_index  4313 non-null   float64
 3   gender                  4313 non-null   object 
 4   relevent_experience     4313 non-null   object 
 5   enrolled_university     4262 non-null   object 
 6   education_level         4231 non-null   object 
 7   major_discipline        3724 non-null   object 
 8   experience              4313 non-null   object 
 9   company_size            3052 non-null   object 
 10  company_type            3024 non-null   object 
 11  last_new_job            4313 non-null   object 
 12  training_hours          4313 non-null   int64  
dtypes: float64(1), int64(2), object(10)
memory usage: 438.2+ KB


### 2. Data Preprocessing

#### (1) 상관관계가 낮은 변수 삭제

In [8]:
# enrollee_id is a unique key, not a predictive feature, so it is removed from
# the model inputs. The X_test ids are stored separately because the
# submission file needs them.
# The guard keeps the stored ids intact if this cell is executed again after
# the column has already been dropped (the frames are overwritten below).
if 'enrollee_id' in X_test.columns:
    enrollee_id = X_test['enrollee_id'].copy()

# Author's rationale for the remaining drops (not verified in this notebook):
# - company_size, company_type: considered weakly related to the target
# - enrolled_university, major_discipline: missing values are hard to impute
drop_cols = ['enrollee_id', 'company_size', 'company_type', 'enrolled_university', 'major_discipline']

# errors='ignore' makes the cell safe to re-run: columns that were already
# removed by a previous execution are skipped instead of raising KeyError.
X_train = X_train.drop(columns=drop_cols, errors='ignore')
X_test = X_test.drop(columns=drop_cols, errors='ignore')
y_train = y_train.drop(columns=['enrollee_id'], errors='ignore')

#### (2) Missing Value

In [9]:
# Count missing values in each remaining training column.
X_train.isna().sum()

city                        0
city_development_index      0
gender                      0
relevent_experience         0
education_level           162
experience                  0
last_new_job                0
training_hours              0
dtype: int64

In [10]:
# Count missing values in each remaining test column.
X_test.isna().sum()

city                       0
city_development_index     0
gender                     0
relevent_experience        0
education_level           82
experience                 0
last_new_job               0
training_hours             0
dtype: int64

In [11]:
# Frequency of each education_level label in the training set; used to pick
# the fill value for the missing entries.
X_train.education_level.value_counts()

Graduate    6288
Masters     2233
High        1009
Phd          223
Primary      148
Name: education_level, dtype: int64

In [12]:
# education_level has missing values (train: 162, test: 82).
# Fill them with the most frequent label, taken from the training set only
# so that no information from the test set is used.
mode_EL = X_train['education_level'].value_counts().idxmax()
X_train = X_train.fillna({'education_level': mode_EL})
X_test = X_test.fillna({'education_level': mode_EL})

### 3. Feature Preparation (Encoding, Scaling, Split)

#### (1) Encoding

In [13]:
from sklearn.preprocessing import OneHotEncoder

# Categorical (object-dtype) columns are one-hot encoded.
X_train_cat = X_train.select_dtypes('object').copy()
X_test_cat = X_test.select_dtypes('object').copy()

# scikit-learn renamed the dense-output flag from `sparse` to `sparse_output`
# (1.2) and later removed the old name (1.4). Try the current keyword first
# and fall back to the legacy one so the cell runs on both old and new
# installations.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit on the training categories only, then apply the same mapping to both sets.
ohe.fit(X_train_cat)

X_train_ohe = ohe.transform(X_train_cat)
X_test_ohe = ohe.transform(X_test_cat)

#### (2) Scaling

In [14]:
from sklearn.preprocessing import MinMaxScaler

# Numeric (non-object) columns are min-max scaled.
X_train_num, X_test_num = (
    frame.select_dtypes(exclude='object').copy() for frame in (X_train, X_test)
)

# The scaler learns its range from the training data only; the same
# transformation is then applied to both sets.
scaler = MinMaxScaler().fit(X_train_num)

X_train_sca, X_test_sca = map(scaler.transform, (X_train_num, X_test_num))

#### (3) Data Concat & Split

In [15]:
# Place the one-hot encoded block and the scaled numeric block side by side.
X_TRAIN, X_TEST = (
    np.concatenate(parts, axis=1)
    for parts in ((X_train_ohe, X_train_sca), (X_test_ohe, X_test_sca))
)

y_TRAIN = y_train['target']

# Sanity check: container types, then shapes.
print(*map(type, (X_TRAIN, X_TEST, y_TRAIN)))
print(*(data.shape for data in (X_TRAIN, X_TEST, y_TRAIN)))

<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'pandas.core.series.Series'>
(10063, 159) (4313, 159) (10063,)


In [16]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled data for validation, stratified on the target
# so both parts keep the same class balance; the seed fixes the split.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_TRAIN,
    y_TRAIN,
    test_size=0.25,
    stratify=y_TRAIN,
    random_state=1234,
)

print(*(part.shape for part in (xtrain, xtest, ytrain, ytest)))

(7547, 159) (2516, 159) (7547,) (2516,)


### 4. Modeling

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [18]:
def make_models(xtrain, xtest, ytrain, ytest):
    """Fit each candidate classifier and print its scores.

    Candidates, in order: logistic regression (model1), decision trees
    (model2: unrestricted, then max_depth 3-7), random forests (model3:
    default, then 500 trees with max_depth 3-7) and XGBoost (model4).
    Each printed line is the label followed by the string returned by
    ``get_scores`` for the fitted estimator. Nothing is returned.
    """
    def report(*label, estimator):
        # Fit on the training part, then print "<label...> <scores>".
        fitted = estimator.fit(xtrain, ytrain)
        print(*label, get_scores(fitted, xtrain, xtest, ytrain, ytest))

    depths = range(3, 8)

    report('model1', estimator=LogisticRegression(solver='lbfgs', max_iter=1000))

    report('model2', estimator=DecisionTreeClassifier(random_state=0))
    for depth in depths:
        report('model2', depth,
               estimator=DecisionTreeClassifier(max_depth=depth, random_state=0))

    report('model3', estimator=RandomForestClassifier(random_state=0))
    for depth in depths:
        report('model3', depth,
               estimator=RandomForestClassifier(500, max_depth=depth, random_state=0))

    report('model4', estimator=XGBClassifier(eval_metric='logloss'))

### 5. Model Evaluation

In [19]:
from sklearn.metrics import roc_auc_score

def get_scores(model, xtrain, xtest, ytrain, ytest):
    """Summarise a fitted classifier as ``'<train score> <test AUC>'``.

    The first number is ``model.score`` on the training data; the second is
    the ROC AUC on the test data, computed from the second column of
    ``predict_proba``. Both are formatted with 4 significant digits.
    """
    train_score = model.score(xtrain, ytrain)

    # Second predict_proba column is passed to roc_auc_score as the score.
    positive_proba = model.predict_proba(xtest)[:, 1]
    test_auc = roc_auc_score(ytest, positive_proba)

    return f'{train_score:.4} {test_auc:.4}'

In [20]:
# Fit every candidate model and print its scores for the hold-out split.
make_models(xtrain, xtest, ytrain, ytest)

model1 0.8002 0.7455
model2 0.9931 0.6157
model2 3 0.8003 0.7222
model2 4 0.8011 0.7361
model2 5 0.8026 0.7404
model2 6 0.8048 0.738
model2 7 0.8101 0.7358
model3 0.993 0.6921
model3 3 0.7697 0.736
model3 4 0.7697 0.7379
model3 5 0.7697 0.7386
model3 6 0.7732 0.7389
model3 7 0.7884 0.7418
model4 0.8472 0.7274


In [21]:
# Final model: a depth-4 decision tree fitted on the training part of the split.
# NOTE(review): in the comparison output above, logistic regression (0.7455)
# and the depth-5 tree (0.7404) score a higher hold-out AUC than this
# depth-4 tree (0.7361) - confirm this choice is intentional.
final_model = DecisionTreeClassifier(max_depth=4, random_state=0)
final_model.fit(xtrain, ytrain)

print('final model', get_scores(final_model, xtrain, xtest, ytrain, ytest))

final model 0.8011 0.7361


### 6. Save Result

In [22]:
# Score the competition test set: second predict_proba column and hard label.
y_pred_prob = final_model.predict_proba(X_TEST)[:, 1]
y_pred = final_model.predict(X_TEST)

# One row per test enrollee, keyed by the ids saved before the column was dropped.
obj = {
    'enrollee_id': enrollee_id,
    'target': y_pred,
    'y_pred_prob': y_pred_prob,
}

result = pd.DataFrame(obj)
result.to_csv("./result.csv", index=False)