## Predicting Salary class in the dataset

### Problem Statement

Need to classify the normal salary range of different employees based on information provided by different features in the dataset. The Class variable provides the range of salary into 3 categories that are low, mid and high range.

### Data Ingestion

In [3]:
import numpy as np
import pandas as pd

In [4]:
#Loading the datasets
test=pd.read_csv('H_test.csv')
train=pd.read_csv('H_train.csv')

In [9]:
# Shape and Size of the dataset
print("The shape of the dataset",train.shape)
print("The size of the dataset",train.size)
print("The shape of the dataset",test.shape)
print("The size of the dataset",test.size)

The shape of the dataset (149087, 21)
The size of the dataset 3130827
The shape of the dataset (63895, 20)
The size of the dataset 1277900


In [10]:
#Check the null values in the dataset
train.isnull().sum()

ID                         0
Year Type                  0
Year                       0
Organization Group Code    0
Organization Group         0
Department Code            0
Department                 0
Union Code                 0
Union                      0
Job Family Code            0
Job Family                 0
Job Code                   0
Job                        0
Employee Identifier        0
Overtime                   0
Other Salaries             0
Retirement                 0
Health/Dental              0
Other Benefits             0
Total Benefits             0
Class                      0
dtype: int64

In [11]:
#Check the null values in the dataset
test.isnull().sum()

ID                         0
Year Type                  0
Year                       0
Organization Group Code    0
Organization Group         0
Department Code            0
Department                 0
Union Code                 0
Union                      0
Job Family Code            0
Job Family                 0
Job Code                   0
Job                        0
Employee Identifier        0
Overtime                   0
Other Salaries             0
Retirement                 0
Health/Dental              0
Other Benefits             0
Total Benefits             0
dtype: int64

In [16]:
testid=test['ID']

In [14]:
pd.set_option('display.max_columns', 999)

In [15]:
train.head()

Unnamed: 0,ID,Year Type,Year,Organization Group Code,Organization Group,Department Code,Department,Union Code,Union,Job Family Code,Job Family,Job Code,Job,Employee Identifier,Overtime,Other Salaries,Retirement,Health/Dental,Other Benefits,Total Benefits,Class
0,9248,Fiscal,2017,3,Human Welfare & Neighborhood Development,DSS,HSA Human Services Agency,535,"SEIU - Human Services, Local 1021",2900,Human Services,2905,Senior Eligibility Worker,41351,0.0,240.0,11896.36,13765.55,5248.43,30910.34,2
1,44541,Fiscal,2014,6,General Administration & Finance,ASR,ASR Assessor / Recorder,21,"Prof & Tech Engineers - Miscellaneous, Local 21",4200,Appraisal & Taxation,4222,Sr Personal Property Auditor,41792,0.0,400.0,15429.94,9337.37,5599.01,30366.32,2
2,47031,Fiscal,2014,3,Human Welfare & Neighborhood Development,DSS,HSA Human Services Agency,535,"SEIU - Human Services, Local 1021",2900,Human Services,2910,Social Worker,9357,0.0,1080.0,9682.0,8848.03,3463.92,21993.95,2
3,139416,Fiscal,2014,1,Public Protection,FIR,FIR Fire Department,798,"Firefighters - Miscellaneous, Local 798",H000,Fire Services,H002,Firefighter,28022,25730.46,18414.18,24222.26,13911.13,2416.58,40549.97,3
4,123780,Fiscal,2013,2,"Public Works, Transportation & Commerce",MTA,MTA Municipal Transprtn Agncy,790,"SEIU - Miscellaneous, Local 1021",1600,"Payroll, Billing & Accounting",1224,Pr Payroll & Personnel Clerk,51052,1138.28,2148.11,15437.62,12828.15,7246.54,35512.31,3


In [18]:
train['salary']=train['Overtime']+train['Other Salaries']+train['Total Benefits']
test['salary']=test['Overtime']+test['Other Salaries']+test['Total Benefits']
train= train.drop(['ID','Department','Overtime','Total Benefits', 'Other Salaries','Organization Group','Union','Job Family','Job','Retirement','Health/Dental','Other Benefits','Year Type','Employee Identifier'],axis=1)
test= test.drop(['ID','Department','Overtime','Total Benefits', 'Other Salaries','Organization Group','Union','Job Family','Job','Retirement','Health/Dental','Other Benefits','Year Type','Employee Identifier'],axis=1)

In [22]:
from sklearn.preprocessing import LabelEncoder,StandardScaler

In [23]:
lm=LabelEncoder()
train['Department Code']=lm.fit_transform(train['Department Code'])
train['Job Family Code']=lm.fit_transform(train['Job Family Code'])
train['Job Code'] = lm.fit_transform(train['Job Code'])
test['Department Code']=lm.fit_transform(test['Department Code'])
test['Job Family Code']=lm.fit_transform(test['Job Family Code'])
test['Job Code']= lm.fit_transform(test['Job Code'])

In [24]:
train.head(2)

Unnamed: 0,Year,Organization Group Code,Department Code,Union Code,Job Family Code,Job Code,Class,salary
0,2017,3,22,535,18,373,2,31150.34
1,2014,6,5,21,25,509,2,30766.32


In [25]:
test.head(2)

Unnamed: 0,Year,Organization Group Code,Department Code,Union Code,Job Family Code,Job Code,salary
0,2015,4,20,790,13,261,4217.44
1,2016,4,20,791,12,245,54975.04


### Model Building

In [26]:
X=train.drop('Class',axis=1)
y=train['Class']

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,test_size=0.3,random_state=5)

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_curve,auc,classification_report,confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings('ignore')

### Logistic Regression

In [34]:
logreg_model = LogisticRegression()
logreg_model.fit(Xtrain, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [35]:
print('Training score =', logreg_model.score(Xtrain, ytrain))
print('Test score =', logreg_model.score(Xtest, ytest))

Training score = 0.8008623993867382
Test score = 0.8032061171104702


In [36]:
predictions = logreg_model.predict(Xtest)

In [38]:
print(classification_report(ytest,predictions))

              precision    recall  f1-score   support

           1       0.90      0.96      0.93     14602
           2       0.77      0.60      0.67     14864
           3       0.74      0.86      0.79     15261

    accuracy                           0.80     44727
   macro avg       0.80      0.80      0.80     44727
weighted avg       0.80      0.80      0.80     44727



In [44]:
ypredictions =pd.DataFrame(logreg_model.predict(Xtest))

In [45]:
accuracy_score(ytest,ypredictions)

0.8032061171104702

### Random Forest Classifier

In [40]:
RF=RandomForestClassifier(n_estimators=100)

In [41]:
RF.fit(Xtrain,ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [42]:
ypred=pd.DataFrame(RF.predict(Xtest))

In [43]:
accuracy_score(ytest,ypred)

0.9289243633599392

In [None]:
Test_pred=RF.predict(test)
testdf=pd.DataFrame({'ID':testid,'Class':Test_pred})

In [None]:
testdf.to_csv('submission1.csv',index=False)