# Feature selection 


In [1]:
import numpy as np
import pandas as pd
#for splitting the data
from sklearn.model_selection import train_test_split
#for models
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [11]:
# read data
dataset = pd.read_csv('train.csv')

# Preprocessing

In [3]:
selected_data= dataset.drop(['employee_id'], axis = 1)
# selected_data= selected_data.drop(['department'], axis = 1)
# selected_data= selected_data.drop(['region'], axis = 1)
selected_data= selected_data.drop(['education'], axis = 1)
selected_data= selected_data.drop(['gender'],axis=1)
# selected_data=selected_data.drop(['recruitment_channel'],axis=1)
# selected_data=selected_data.drop(['age'],axis=1)
# selected_data=selected_data.drop(['length_of_service'],axis=1)
selected_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   department            54808 non-null  object 
 1   region                54808 non-null  object 
 2   recruitment_channel   54808 non-null  object 
 3   no_of_trainings       54808 non-null  int64  
 4   age                   54808 non-null  int64  
 5   previous_year_rating  50684 non-null  float64
 6   length_of_service     54808 non-null  int64  
 7   KPIs_met >80%         54808 non-null  int64  
 8   awards_won?           54808 non-null  int64  
 9   avg_training_score    54808 non-null  int64  
 10  is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 4.6+ MB


removing null values

In [177]:
selected_data['previous_year_rating'].fillna(3, inplace = True)
# dataset.isnull().sum()

### Handling Categorical data

In [178]:
#split data into features and target
X = selected_data.iloc[:, :-1]
y = selected_data.iloc[:, -1]

print(X.shape)
print(y.shape)

(54808, 10)
(54808,)


In [179]:
#performing one hot encoding on the categorical variables
X = pd.get_dummies(X)
X.columns

Index(['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
       'KPIs_met >80%', 'awards_won?', 'avg_training_score',
       'department_Analytics', 'department_Finance', 'department_HR',
       'department_Legal', 'department_Operations', 'department_Procurement',
       'department_R&D', 'department_Sales & Marketing',
       'department_Technology', 'region_region_1', 'region_region_10',
       'region_region_11', 'region_region_12', 'region_region_13',
       'region_region_14', 'region_region_15', 'region_region_16',
       'region_region_17', 'region_region_18', 'region_region_19',
       'region_region_2', 'region_region_20', 'region_region_21',
       'region_region_22', 'region_region_23', 'region_region_24',
       'region_region_25', 'region_region_26', 'region_region_27',
       'region_region_28', 'region_region_29', 'region_region_3',
       'region_region_30', 'region_region_31', 'region_region_32',
       'region_region_33', 'region_region_34', 'reg

### Handling Imbalanced class

In [180]:
from imblearn.over_sampling import SMOTE

X, y = SMOTE().fit_resample(X, y.values.ravel())

X = pd.DataFrame(X)
y = pd.DataFrame(y)

print( X.shape)
print( y.shape)

(100280, 53)
(100280, 1)


In [181]:
y[0].value_counts()
#Now the problem of imbalancing is solved

0    50140
1    50140
Name: 0, dtype: int64

### Splitting the data set into train_validate and test

In [182]:
#concatenate X and Y
dataset_v2 = pd.concat([X,y], axis=1)
dataset_v2.head(2)

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,department_Analytics,department_Finance,department_HR,...,region_region_4,region_region_5,region_region_6,region_region_7,region_region_8,region_region_9,recruitment_channel_other,recruitment_channel_referred,recruitment_channel_sourcing,0
0,1,35,5.0,8,1,0,49,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,1,30,5.0,4,0,0,60,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [183]:
dataset_v2.rename(columns = {0:'is_promoted'}, inplace = True)


In [184]:
dataset_v2.head(2)

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,department_Analytics,department_Finance,department_HR,...,region_region_4,region_region_5,region_region_6,region_region_7,region_region_8,region_region_9,recruitment_channel_other,recruitment_channel_referred,recruitment_channel_sourcing,is_promoted
0,1,35,5.0,8,1,0,49,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,1,30,5.0,4,0,0,60,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [185]:
dataset_train, dataset_test = train_test_split(dataset_v2, test_size = 0.2, random_state = 0)

### Data Normalization

In [186]:
#of the train data
for column in dataset_train.columns:
    dataset_train[column] = (dataset_train[column] - dataset_train[column].min()) / (dataset_train[column].max() - dataset_train[column].min())    

In [187]:
#of the test data
for column in dataset_test.columns:
    dataset_test[column] = (dataset_test[column] - dataset_test[column].min()) / (dataset_test[column].max() - dataset_test[column].min())    

### Splitting the train into train and validate

In [188]:
dataset_train_v2, dataset_validation = train_test_split(dataset_train, test_size = 0.2, random_state = 0)

### preparing the features and the target


In [189]:
X_train = dataset_train_v2.drop(['is_promoted'],axis=1)
y_train = dataset_train_v2['is_promoted']

X_train.head(2)

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,department_Analytics,department_Finance,department_HR,...,region_region_34,region_region_4,region_region_5,region_region_6,region_region_7,region_region_8,region_region_9,recruitment_channel_other,recruitment_channel_referred,recruitment_channel_sourcing
80653,0.111111,0.25,1.0,0.083333,0.0,0.0,0.766667,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63614,0.0,0.125,0.5,0.0,1.0,0.0,0.133333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [190]:
X_valid = dataset_validation.drop(['is_promoted'],axis=1)
y_valid = dataset_validation['is_promoted']

X_valid.head(2)

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,department_Analytics,department_Finance,department_HR,...,region_region_34,region_region_4,region_region_5,region_region_6,region_region_7,region_region_8,region_region_9,recruitment_channel_other,recruitment_channel_referred,recruitment_channel_sourcing
85789,0.0,0.2,0.774773,0.027778,0.0,1.0,0.55,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
29791,0.0,0.375,0.5,0.25,0.0,0.0,0.133333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [191]:
X_test = dataset_test.drop(['is_promoted'],axis=1)
y_test = dataset_test['is_promoted']

X_test.head(2)

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,department_Analytics,department_Finance,department_HR,...,region_region_34,region_region_4,region_region_5,region_region_6,region_region_7,region_region_8,region_region_9,recruitment_channel_other,recruitment_channel_referred,recruitment_channel_sourcing
92988,0.0,0.375,0.638358,0.060606,0.0,0.0,0.881356,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
77138,0.0,0.175,1.0,0.151515,0.0,0.0,0.745763,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Training Models

## SVM

In [192]:
SVM = SVC().fit(X_train, y_train)
y_pred_svm = SVM.predict(X_valid)

#evaluating 
print(confusion_matrix(y_valid, y_pred_svm))
print(classification_report(y_valid, y_pred_svm))
print(accuracy_score(y_valid, y_pred_svm))

[[7979   67]
 [ 992 7007]]
              precision    recall  f1-score   support

         0.0       0.89      0.99      0.94      8046
         1.0       0.99      0.88      0.93      7999

    accuracy                           0.93     16045
   macro avg       0.94      0.93      0.93     16045
weighted avg       0.94      0.93      0.93     16045

0.9339981302586475


## KNN

In [193]:
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = KNN.predict(X_valid)

#evaluating 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_valid, y_pred_knn))
print(classification_report(y_valid, y_pred_knn))
print(accuracy_score(y_valid, y_pred_knn))

[[7590  456]
 [ 589 7410]]
              precision    recall  f1-score   support

         0.0       0.93      0.94      0.94      8046
         1.0       0.94      0.93      0.93      7999

    accuracy                           0.93     16045
   macro avg       0.94      0.93      0.93     16045
weighted avg       0.93      0.93      0.93     16045

0.9348706762231225


## Naive Bayes

In [194]:
from sklearn.naive_bayes import GaussianNB
NB = GaussianNB().fit(X_train, y_train)
y_pred_nb = NB.predict(X_valid)

#evaluating
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_valid, y_pred_nb))
print(classification_report(y_valid, y_pred_nb))
print(accuracy_score(y_valid, y_pred_nb))

[[4497 3549]
 [ 641 7358]]
              precision    recall  f1-score   support

         0.0       0.88      0.56      0.68      8046
         1.0       0.67      0.92      0.78      7999

    accuracy                           0.74     16045
   macro avg       0.77      0.74      0.73     16045
weighted avg       0.78      0.74      0.73     16045

0.7388594577750078


## Logistic Regression

In [195]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression().fit(X_train, y_train)
y_pred_lr = LR.predict(X_valid)

#evaluating
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_valid, y_pred_lr))
print(classification_report(y_valid, y_pred_lr))
print(accuracy_score(y_valid, y_pred_lr))

[[7813  233]
 [ 988 7011]]
              precision    recall  f1-score   support

         0.0       0.89      0.97      0.93      8046
         1.0       0.97      0.88      0.92      7999

    accuracy                           0.92     16045
   macro avg       0.93      0.92      0.92     16045
weighted avg       0.93      0.92      0.92     16045

0.9239015269554378


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## Decision Trees

In [196]:
#trying decision tree
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier().fit(X_train, y_train)
y_pred_dt = DT.predict(X_valid)

#evaluating
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_valid, y_pred_dt))
print(classification_report(y_valid, y_pred_dt))
print(accuracy_score(y_valid, y_pred_dt))

[[7458  588]
 [ 396 7603]]
              precision    recall  f1-score   support

         0.0       0.95      0.93      0.94      8046
         1.0       0.93      0.95      0.94      7999

    accuracy                           0.94     16045
   macro avg       0.94      0.94      0.94     16045
weighted avg       0.94      0.94      0.94     16045

0.9386724836397632


## Random Forests

In [197]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
y_pred_rf = RF.predict(X_valid)

#evaluating
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_valid, y_pred_rf))
print(classification_report(y_valid, y_pred_rf))
print(accuracy_score(y_valid, y_pred_rf))

[[7838  208]
 [ 389 7610]]
              precision    recall  f1-score   support

         0.0       0.95      0.97      0.96      8046
         1.0       0.97      0.95      0.96      7999

    accuracy                           0.96     16045
   macro avg       0.96      0.96      0.96     16045
weighted avg       0.96      0.96      0.96     16045

0.9627921470863198
