In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
train_data = pd.read_csv('./dataset/train.csv')
test_data = pd.read_csv('./dataset/test.csv')

In [3]:
f'SHAPE - train - {train_data.shape}, test - {test_data.shape}'

'SHAPE - train - (22750, 9), test - (12250, 8)'

In [4]:
print(train_data.describe())
print(train_data.info())

        Designation  Resource Allocation  Mental Fatigue Score     Burn Rate
count  22750.000000         21369.000000          20633.000000  21626.000000
mean       2.178725             4.481398              5.728188      0.452005
std        1.135145             2.047211              1.920839      0.198226
min        0.000000             1.000000              0.000000      0.000000
25%        1.000000             3.000000              4.600000      0.310000
50%        2.000000             4.000000              5.900000      0.450000
75%        3.000000             6.000000              7.100000      0.590000
max        5.000000            10.000000             10.000000      1.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22750 entries, 0 to 22749
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           22750 non-null  object 
 1   Date of Joining       22750 non-null  obj

In [5]:
train_data.head()

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,fffe32003000360033003200,2008-09-30,Female,Service,No,2.0,3.0,3.8,0.16
1,fffe3700360033003500,2008-11-30,Male,Service,Yes,1.0,2.0,5.0,0.36
2,fffe31003300320037003900,2008-03-10,Female,Product,Yes,2.0,,5.8,0.49
3,fffe32003400380032003900,2008-11-03,Male,Service,Yes,1.0,1.0,2.6,0.2
4,fffe31003900340031003600,2008-07-24,Female,Service,No,3.0,7.0,6.9,0.52


In [6]:
def date_into_d_m_y(df):
    df['Date of Joining'] = pd.to_datetime(df['Date of Joining'])
    df['year'] = df['Date of Joining'].dt.year
    df['month'] = df['Date of Joining'].dt.month
    df['day'] = df['Date of Joining'].dt.day
    return df

train_data = date_into_d_m_y(train_data)
test_data = date_into_d_m_y(test_data)

In [7]:
train_data.head()

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate,year,month,day
0,fffe32003000360033003200,2008-09-30,Female,Service,No,2.0,3.0,3.8,0.16,2008,9,30
1,fffe3700360033003500,2008-11-30,Male,Service,Yes,1.0,2.0,5.0,0.36,2008,11,30
2,fffe31003300320037003900,2008-03-10,Female,Product,Yes,2.0,,5.8,0.49,2008,3,10
3,fffe32003400380032003900,2008-11-03,Male,Service,Yes,1.0,1.0,2.6,0.2,2008,11,3
4,fffe31003900340031003600,2008-07-24,Female,Service,No,3.0,7.0,6.9,0.52,2008,7,24


#### Checking number of nan values in train set each columns

In [8]:
pd.DataFrame({'Name': train_data.columns, 'NaN values' : train_data.isna().sum().values, 'Dtype': [train_data[col].dtypes for col in train_data.columns]})

Unnamed: 0,Name,NaN values,Dtype
0,Employee ID,0,object
1,Date of Joining,0,datetime64[ns]
2,Gender,0,object
3,Company Type,0,object
4,WFH Setup Available,0,object
5,Designation,0,float64
6,Resource Allocation,1381,float64
7,Mental Fatigue Score,2117,float64
8,Burn Rate,1124,float64
9,year,0,int64


In [9]:
pd.DataFrame({'Name': test_data.columns, 'NaN values' : test_data.isna().sum().values, 'Dtype': [test_data[col].dtypes for col in test_data.columns]})

Unnamed: 0,Name,NaN values,Dtype
0,Employee ID,0,object
1,Date of Joining,0,datetime64[ns]
2,Gender,0,object
3,Company Type,0,object
4,WFH Setup Available,0,object
5,Designation,0,float64
6,Resource Allocation,0,float64
7,Mental Fatigue Score,0,float64
8,year,0,int64
9,month,0,int64


#### Imputing the values, as we can see that only float values we have to impute so we can replace the values with median values

In [10]:
for col in train_data.columns:
    if train_data[col].dtypes == 'float64':
        train_data[col].fillna(train_data[col].median(), inplace=True)

#### Checking the NaN values, to confirm they have been replaced

In [11]:
pd.DataFrame({'Name': train_data.columns, 'NaN values' : train_data.isna().sum().values, 'Dtype': [train_data[col].dtypes for col in train_data.columns]})

Unnamed: 0,Name,NaN values,Dtype
0,Employee ID,0,object
1,Date of Joining,0,datetime64[ns]
2,Gender,0,object
3,Company Type,0,object
4,WFH Setup Available,0,object
5,Designation,0,float64
6,Resource Allocation,0,float64
7,Mental Fatigue Score,0,float64
8,Burn Rate,0,float64
9,year,0,int64


In [12]:
def replacing_column_index(train_data):
    d = train_data['Burn Rate']
    train_data.drop(columns = ['Burn Rate'], inplace=True)
    train_data['Burn Rate'] = d
    return train_data

train_data = replacing_column_index(train_data)

### We need to change the categorical values to numerical data, checking unique values in each object columns

In [13]:
for col in train_data.columns:
    if train_data[col].dtype == 'object':
        print(f'{col} -> {train_data[col].nunique()}')

Employee ID -> 22750
Gender -> 2
Company Type -> 2
WFH Setup Available -> 2


#### Removing the Employee ID, and encoding the caterical data

In [14]:
from sklearn.preprocessing import LabelBinarizer
lbl = LabelBinarizer()
for col in train_data.columns:
    if train_data[col].dtype == 'object':
        if col != 'Employee ID':
            train_data[col] = lbl.fit_transform(train_data[col].values)
            print(f'{col} -> {train_data[col].nunique()}')
        else:
            train_data.drop(columns=[col], inplace=True)

Gender -> 2
Company Type -> 2
WFH Setup Available -> 2


In [15]:
train_data.head()

Unnamed: 0,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,year,month,day,Burn Rate
0,2008-09-30,0,1,0,2.0,3.0,3.8,2008,9,30,0.16
1,2008-11-30,1,1,1,1.0,2.0,5.0,2008,11,30,0.36
2,2008-03-10,0,0,1,2.0,4.0,5.8,2008,3,10,0.49
3,2008-11-03,1,1,1,1.0,1.0,2.6,2008,11,3,0.2
4,2008-07-24,0,1,0,3.0,7.0,6.9,2008,7,24,0.52


#### Dividing train data into train and test split to check the accuracy of different models

In [16]:
train_data = train_data.drop(columns = ['Date of Joining'])
X = train_data[train_data.columns[:-1]]
y = train_data[train_data.columns[-1]]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

In [17]:
f'train shape - {X_train.shape}, test shape - {X_test.shape}'

'train shape - (15925, 9), test shape - (6825, 9)'

In [18]:
X_train.describe()

Unnamed: 0,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,year,month,day
count,15925.0,15925.0,15925.0,15925.0,15925.0,15925.0,15925.0,15925.0,15925.0
mean,0.477363,0.65394,0.540597,2.172998,4.441633,5.743353,2008.0,6.513972,15.691177
std,0.499503,0.475727,0.498365,1.138224,1.986784,1.831352,0.0,3.432872,8.78594
min,0.0,0.0,0.0,0.0,1.0,0.0,2008.0,1.0,1.0
25%,0.0,0.0,0.0,1.0,3.0,4.7,2008.0,4.0,8.0
50%,0.0,1.0,1.0,2.0,4.0,5.9,2008.0,7.0,16.0
75%,1.0,1.0,1.0,3.0,6.0,6.9,2008.0,9.0,23.0
max,1.0,1.0,1.0,5.0,10.0,10.0,2008.0,12.0,31.0


In [19]:
y_train.describe()

count    15925.000000
mean         0.451359
std          0.193081
min          0.000000
25%          0.320000
50%          0.450000
75%          0.580000
max          1.000000
Name: Burn Rate, dtype: float64

#### As we have done, feature preprocessing. We can run initial models and check which model performs best and later upgrade that to search for better parameters.

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error

In [28]:
model_dict = {
    'lr' : LinearRegression(),
    'rr' : Ridge(),
    'dtr': DecisionTreeRegressor(),
    'etr': ExtraTreesRegressor(),
    'rfr': RandomForestRegressor(),
    'xgb': xgboost.XGBRFRegressor(),
    'abr': AdaBoostRegressor(random_state=0, n_estimators=100),
    'knn': KNeighborsRegressor(n_neighbors=4),
    'en': ElasticNet(),
    'gbr': GradientBoostingRegressor(random_state=0)
}
Dict = {'Model':[], 'RMSE': [], 'r2_score':[]}

In [29]:
for key, model in model_dict.items():
    exp_model = model
    exp_model.fit(X_train,y_train)
    pred = exp_model.predict(X_test)
    print(f'Model Evaluation - {key}')
    Dict['Model'].append(key)
    rmse_score = np.sqrt(mean_squared_error(y_test, pred))
    Dict['RMSE'].append(rmse_score)
    r2_sc = r2_score(y_test, pred)
    print(f'RMSE - {rmse_score}, r2 score - {r2_sc}')
    Dict['r2_score'].append(r2_sc)

Model Evaluation - lr
RMSE - 0.07905942097287587, r2 score - 0.8334039932735589
Model Evaluation - rr
RMSE - 0.07905948544376774, r2 score - 0.833403721564068
Model Evaluation - dtr
RMSE - 0.10685464777404749, r2 score - 0.6956704226425121
Model Evaluation - etr
RMSE - 0.0813739922664417, r2 score - 0.8235065569929321
Model Evaluation - rfr
RMSE - 0.07727847152834955, r2 score - 0.8408251768246814
Model Evaluation - xgb
RMSE - 0.07410040283717895, r2 score - 0.853648065744012
Model Evaluation - abr
RMSE - 0.088622846236836, r2 score - 0.7906616887294093
Model Evaluation - knn
RMSE - 0.0866746262898828, r2 score - 0.7997644042159034
Model Evaluation - en
RMSE - 0.19370490228067058, r2 score - -8.877471619594957e-05
Model Evaluation - gbr
RMSE - 0.07299222400693797, r2 score - 0.8579927486182375


In [30]:
Dict

{'Model': ['lr', 'rr', 'dtr', 'etr', 'rfr', 'xgb', 'abr', 'knn', 'en', 'gbr'],
 'RMSE': [0.07905942097287587,
  0.07905948544376774,
  0.10685464777404749,
  0.0813739922664417,
  0.07727847152834955,
  0.07410040283717895,
  0.088622846236836,
  0.0866746262898828,
  0.19370490228067058,
  0.07299222400693797],
 'r2_score': [0.8334039932735589,
  0.833403721564068,
  0.6956704226425121,
  0.8235065569929321,
  0.8408251768246814,
  0.853648065744012,
  0.7906616887294093,
  0.7997644042159034,
  -8.877471619594957e-05,
  0.8579927486182375]}

In [34]:
f_table = pd.DataFrame(Dict)

In [36]:
f_table

Unnamed: 0,Model,RMSE,r2_score
0,lr,0.079059,0.833404
1,rr,0.079059,0.833404
2,dtr,0.106855,0.69567
3,etr,0.081374,0.823507
4,rfr,0.077278,0.840825
5,xgb,0.0741,0.853648
6,abr,0.088623,0.790662
7,knn,0.086675,0.799764
8,en,0.193705,-8.9e-05
9,gbr,0.072992,0.857993


In [37]:
f_table.sort_values(by=['RMSE'])

Unnamed: 0,Model,RMSE,r2_score
9,gbr,0.072992,0.857993
5,xgb,0.0741,0.853648
4,rfr,0.077278,0.840825
0,lr,0.079059,0.833404
1,rr,0.079059,0.833404
3,etr,0.081374,0.823507
7,knn,0.086675,0.799764
6,abr,0.088623,0.790662
2,dtr,0.106855,0.69567
8,en,0.193705,-8.9e-05


#### Here we can observe that gradient boosting algorithm works best and then extending gradient boosting works second best

In [44]:
gbr = GradientBoostingRegressor(random_state=0)
xgbr = xgboost.XGBRFRegressor()

#### Preprocessing test data

In [39]:
test_data.head()

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,year,month,day
0,fffe31003300390039003000,2008-12-10,Female,Service,No,2.0,5.0,7.7,2008,12,10
1,fffe31003300310037003800,2008-08-14,Female,Product,Yes,1.0,2.0,5.2,2008,8,14
2,fffe33003400380035003900,2008-11-13,Male,Product,Yes,1.0,3.0,5.9,2008,11,13
3,fffe3100370039003200,2008-02-07,Female,Service,No,3.0,6.0,4.6,2008,2,7
4,fffe32003600390036003700,2008-07-17,Female,Product,No,2.0,5.0,6.4,2008,7,17


In [40]:
from sklearn.preprocessing import LabelBinarizer
lbl = LabelBinarizer()
for col in test_data.columns:
    if test_data[col].dtype == 'object':
        if col != 'Employee ID':
            test_data[col] = lbl.fit_transform(test_data[col].values)
            print(f'{col} -> {test_data[col].nunique()}')
        else:
            test_data.drop(columns=[col], inplace=True)

Gender -> 2
Company Type -> 2
WFH Setup Available -> 2


In [42]:
test_data.head()
test_data.drop(columns=['Date of Joining'], inplace=True)

In [45]:
pred_gbr = gbr.fit(X_train, y_train).predict(test_data)
pred_xgbr = xgbr.fit(X_train, y_train).predict(test_data)

In [46]:
test_data.head()

Unnamed: 0,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,year,month,day
0,0,1,0,2.0,5.0,7.7,2008,12,10
1,0,0,1,1.0,2.0,5.2,2008,8,14
2,1,0,1,1.0,3.0,5.9,2008,11,13
3,0,1,0,3.0,6.0,4.6,2008,2,7
4,0,0,0,2.0,5.0,6.4,2008,7,17


In [52]:
test_data1 = pd.read_csv('./dataset/test.csv')
sample_gbr = pd.concat([test_data1['Employee ID'], pd.DataFrame(pred_gbr)], axis=1)
sample_xgbr = pd.concat([test_data1['Employee ID'], pd.DataFrame(pred_xgbr)], axis=1)
sample_gbr.to_csv('sample_gbr.csv', index=False)
sample_xgbr.to_csv('sample_xgbr.csv', index=False)