In [151]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [152]:
# Read the training and test CSVs from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('Train.csv', 'Test.csv'))

# Quick sanity check: (rows, columns) of each frame.
print(train.shape, test.shape)

(13645, 22) (8745, 20)


# Preparing Classifier

In [153]:
# Build the classifier's feature matrix X and target vector y from `train`.
print(train.shape)

clf_feature_cols = [
    'LanguageOfCommunication', 'Age', 'Gender',
    'JobProfileIDApplyingFor', 'HighestDegree', 'DegreeBranch',
    'GraduatingInstitute', 'LatestDegreeCGPA', 'YearsOfExperince',
    'CurrentCTC', 'MartialStatus',
    'EmpScore', 'CurrentDesignation', 'CurrentCompanyType',
    'DepartmentInCompany', 'TotalLeavesTaken',
]
train1 = train[clf_feature_cols].copy()
print(train1.shape)

# Ordinal encoding of the institute tier: a higher number means a higher tier.
tier_rank = {'Tier 1': 3, 'Tier 2': 2, 'Tier 3': 1}
train1['GraduatingInstitute'] = train['GraduatingInstitute'].map(tier_rank)

# Requested raise as a percentage of the current CTC.
ctc_delta = train['ExpectedCTC'] - train['CurrentCTC']
train1['ExpectedHike'] = ctc_delta / train['CurrentCTC'] * 100
print(train1.shape)

# Missing bias labels become their own explicit 'Unknown' class.
target_col = train['BiasInfluentialFactor'].fillna('Unknown')

# One-hot encode the remaining categoricals and drop down to plain arrays.
X = pd.get_dummies(train1, drop_first=True).to_numpy()
y = target_col.copy().to_numpy()
print(X.shape, y.shape)

(13645, 22)
(13645, 16)
(13645, 17)
(13645, 46) (13645,)


In [154]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Turn the string class labels into integer codes; `le` is reused later to
# decode the classifier's predictions.
le = LabelEncoder()
le.fit(y)
y_en = le.transform(y)

# 80/20 hold-out split, stratified on the encoded label, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_en,
    test_size=.2,
    stratify=y_en,
    random_state=123,
)
print(*(part.shape for part in (X_train, X_val, y_train, y_val)))

(10916, 46) (2729, 46) (10916,) (2729,)


In [155]:
from imblearn.over_sampling import SMOTE

# Desired number of samples per encoded class label after oversampling.
# Keys are LabelEncoder codes; classes not listed are left as they are.
sampling_strategy_dict = {
    2: 700, 5: 700,
    3: 500, 7: 500, 6: 500,
}

sm = SMOTE(
    sampling_strategy=sampling_strategy_dict,
    random_state=123,
    n_jobs=-1,
)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Shapes before and after resampling.
for feats, labels in ((X_train, y_train), (X_train_res, y_train_res)):
    print(feats.shape, labels.shape)

(10916, 46) (10916,)
(12127, 46) (12127,)


In [156]:
# Final training set: the oversampled training split stacked on top of the
# (un-resampled) validation split, so the classifier is fitted on every row.
X_final = np.vstack((X_train_res, X_val))
y_final = np.hstack((y_train_res, y_val))
print(X_final.shape, y_final.shape)

(14856, 46) (14856,)


In [157]:
from xgboost import XGBClassifier

# Booster hyperparameters (presumably the result of an earlier tuning run
# that is not part of this notebook).
clf_tuned_params = dict(
    colsample_bytree=0.7765351154508043,
    gamma=4.545832475162831,
    learning_rate=0.07231521329322932,
    max_depth=6,
    min_child_weight=18.0,
    subsample=0.8465452157807187,
)

model_clf_xgb = XGBClassifier(
    verbosity=1,
    n_estimators=1000,
    objective='multi:softmax',
    use_label_encoder=False,
    n_jobs=-1,
    num_parallel_tree=1,
    **clf_tuned_params,
)

# Fit on the combined (resampled train + validation) data.
model_clf_xgb.fit(X_final, y_final, eval_metric=['mlogloss'])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7765351154508043,
              gamma=4.545832475162831, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.07231521329322932,
              max_delta_step=0, max_depth=6, min_child_weight=18.0, missing=nan,
              monotone_constraints='()', n_estimators=1000, n_jobs=-1,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=0.8465452157807187, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=1)

# Preparing Regressor

In [158]:
# Build the regressor's feature matrix X2 and target vector y2 from `train`.
# Same features as the classifier, plus the bias label itself as an input.
print(train.shape)

rg_feature_cols = [
    'LanguageOfCommunication', 'Age', 'Gender',
    'JobProfileIDApplyingFor', 'HighestDegree', 'DegreeBranch',
    'GraduatingInstitute', 'LatestDegreeCGPA', 'YearsOfExperince',
    'CurrentCTC', 'MartialStatus',
    'EmpScore', 'CurrentDesignation', 'CurrentCompanyType',
    'DepartmentInCompany', 'TotalLeavesTaken',
]
train2 = train[rg_feature_cols].copy()
print(train2.shape)

train2 = train2.assign(
    # Ordinal encoding of the institute tier: a higher number means a higher tier.
    GraduatingInstitute=train['GraduatingInstitute'].map({'Tier 1': 3, 'Tier 2': 2, 'Tier 3': 1}),
    # Requested raise as a percentage of the current CTC.
    ExpectedHike=(train['ExpectedCTC'] - train['CurrentCTC']) / train['CurrentCTC'] * 100,
    # Missing bias labels become their own explicit 'Unknown' category.
    BiasInfluentialFactor=train['BiasInfluentialFactor'].fillna('Unknown'),
)
print(train2.shape)

target_col2 = train['FitmentPercent']

# One-hot encode the categoricals and drop down to plain arrays.
X2 = pd.get_dummies(train2, drop_first=True).to_numpy()
y2 = target_col2.copy().to_numpy()
print(X2.shape, y2.shape)

(13645, 22)
(13645, 16)
(13645, 18)
(13645, 55) (13645,)


### Regressor: XGBoost

In [159]:
from xgboost import XGBRegressor

# Booster hyperparameters (presumably the result of an earlier tuning run
# that is not part of this notebook).
rg_xgb_tuned_params = dict(
    colsample_bytree=0.8026236086240719,
    gamma=7.949076522205543,
    learning_rate=0.09093358640056634,
    max_depth=7,
    min_child_weight=16.0,
    subsample=0.9292019673341835,
)

model_rg_xgb = XGBRegressor(
    verbosity=1,
    n_estimators=500,
    n_jobs=-1,
    num_parallel_tree=5,
    **rg_xgb_tuned_params,
)

# Fit on the full training set (no hold-out is used for the regressor).
model_rg_xgb.fit(X2, y2, eval_metric=['rmse'])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8026236086240719,
             gamma=7.949076522205543, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.09093358640056634,
             max_delta_step=0, max_depth=7, min_child_weight=16.0, missing=nan,
             monotone_constraints='()', n_estimators=500, n_jobs=-1,
             num_parallel_tree=5, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=0.9292019673341835,
             tree_method='exact', validate_parameters=1, verbosity=1)

### Regressor: LightGBM

In [160]:
from lightgbm import LGBMRegressor

# Hyperparameters (presumably the result of an earlier tuning run that is
# not part of this notebook).
# NOTE(review): per the LightGBM docs, row subsampling (`subsample`) is only
# applied when `subsample_freq` is non-zero; it is left at its default here,
# so confirm whether `subsample` is expected to have any effect.
rg_lgbm_tuned_params = dict(
    colsample_bytree=0.7788161685191014,
    learning_rate=0.07217558041072707,
    max_depth=8,
    min_child_weight=9.0,
    reg_alpha=0.5121021692895791,
    reg_lambda=0.5437507642672469,
    subsample=0.9456987855642878,
)

model_rg_lgbm = LGBMRegressor(
    verbosity=1,
    n_estimators=500,
    n_jobs=-1,
    **rg_lgbm_tuned_params,
)

# Fit on the same full training set as the XGBoost regressor.
model_rg_lgbm.fit(X2, y2)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 316
[LightGBM] [Info] Number of data points in the train set: 13645, number of used features: 55
[LightGBM] [Info] Start training from score 75.880093


LGBMRegressor(colsample_bytree=0.7788161685191014,
              learning_rate=0.07217558041072707, max_depth=8,
              min_child_weight=9.0, n_estimators=500,
              reg_alpha=0.5121021692895791, reg_lambda=0.5437507642672469,
              subsample=0.9456987855642878, verbosity=1)

# Test Data

## Classification

In [161]:
# Build the classifier's test feature matrix with the same steps as `train1`.
print(test.shape)

test1 = test[[ 'LanguageOfCommunication', 'Age', 'Gender',
       'JobProfileIDApplyingFor', 'HighestDegree', 'DegreeBranch',
       'GraduatingInstitute', 'LatestDegreeCGPA', 'YearsOfExperince',
       'CurrentCTC', 'MartialStatus',
       'EmpScore', 'CurrentDesignation', 'CurrentCompanyType',
       'DepartmentInCompany', 'TotalLeavesTaken']].copy()
print(test1.shape)

# Ordinal encoding of the institute tier: a higher number means a higher tier.
test1['GraduatingInstitute'] = test['GraduatingInstitute'].map({'Tier 1':3,'Tier 2':2, 'Tier 3':1})
# Requested raise as a percentage of the current CTC.
test1['ExpectedHike'] = (test['ExpectedCTC']-test['CurrentCTC'])/test['CurrentCTC']*100
print(test1.shape)

# One-hot encoding the test frame on its own is only safe when it contains
# exactly the same category levels as the training frame. A missing or extra
# level would shift or change the number of columns, and because the model is
# fed a bare array it would either fail or silently read the wrong features.
# So: encode every level present in the test data (no `drop_first`, since the
# "first" level may differ from training), then reindex to the exact column
# layout the classifier was trained on. Levels unseen in training are dropped;
# training levels absent from the test data become all-zero columns.
clf_train_columns = pd.get_dummies(train1, drop_first=True).columns
X_test = (
    pd.get_dummies(test1)
    .reindex(columns=clf_train_columns, fill_value=0)
    .values
)
print(X_test.shape)

(8745, 20)
(8745, 16)
(8745, 17)
(8745, 46)


In [162]:
# Predict encoded class labels for the test set, then decode them back to
# the original string labels with the fitted LabelEncoder.
y_pred_codes = model_clf_xgb.predict(X_test)
y_pred_clf = le.inverse_transform(y_pred_codes)

## Regression

In [163]:
# Build the regressor's test feature matrix with the same steps as `train2`,
# using the classifier's predicted bias label in place of the true one.
print(test.shape)

test2 = test[[ 'LanguageOfCommunication', 'Age', 'Gender',
       'JobProfileIDApplyingFor', 'HighestDegree', 'DegreeBranch',
       'GraduatingInstitute', 'LatestDegreeCGPA', 'YearsOfExperince',
       'CurrentCTC', 'MartialStatus',
       'EmpScore', 'CurrentDesignation', 'CurrentCompanyType',
       'DepartmentInCompany', 'TotalLeavesTaken']].copy()
print(test2.shape)

# Ordinal encoding of the institute tier: a higher number means a higher tier.
test2['GraduatingInstitute'] = test['GraduatingInstitute'].map({'Tier 1':3,'Tier 2':2, 'Tier 3':1})
# Requested raise as a percentage of the current CTC.
test2['ExpectedHike'] = (test['ExpectedCTC']-test['CurrentCTC'])/test['CurrentCTC']*100
test2['BiasInfluentialFactor'] = y_pred_clf
print(test2.shape)


# The predicted labels are not guaranteed to cover every class seen in
# training. If the classifier never predicts some class, encoding the test
# frame on its own yields fewer (or differently ordered) dummy columns than
# the regressors were trained on. So: encode every level present (no
# `drop_first`, since the "first" level may differ from training), then
# reindex to the exact training column layout. Levels unseen in training are
# dropped; training levels absent here become all-zero columns.
rg_train_columns = pd.get_dummies(train2, drop_first=True).columns
X_test2 = (
    pd.get_dummies(test2)
    .reindex(columns=rg_train_columns, fill_value=0)
    .values
)
print(X_test2.shape)

(8745, 20)
(8745, 16)
(8745, 18)
(8745, 55)


In [164]:
# Score the test set with both regressors.
y_pred_rg_xgb = model_rg_xgb.predict(X_test2)
y_pred_rg_lgbm = model_rg_lgbm.predict(X_test2)

# Equal-weight blend: element-wise mean of the two prediction vectors.
y_pred_rg = np.mean([y_pred_rg_xgb, y_pred_rg_lgbm], axis=0)

## Submission

In [165]:
# Assemble the submission frame: one row per test employee.
# Rows predicted as the 'Unknown' placeholder class get NaN instead of a label.
predicted_unknown = y_pred_clf == 'Unknown'

sub = (
    test[['EmpID']]
    .copy()
    .reset_index(drop=True)
    .assign(
        BiasInfluentialFactor=np.where(predicted_unknown, np.nan, y_pred_clf),
        # Blended fitment prediction, rounded to two decimals.
        FitmentPercent=np.round(y_pred_rg, 2),
    )
)

sub.head()

Unnamed: 0,EmpID,BiasInfluentialFactor,FitmentPercent
0,5664,DegreeBranch,89.29
1,23568,DegreeBranch,92.56
2,21490,MartialStatus,82.56
3,8363,,45.87
4,6165,Gender,80.69


In [166]:
# Write the submission to disk without the positional index column.
sub.to_csv(path_or_buf='submission_file.csv', index=False)