In [1]:
#Importing required packages

import os

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from xgboost import XGBRegressor
import catboost as cb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error, accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from matplotlib import pyplot as plt
import optuna

In [2]:
# Load the training set, the test set and the submission template from the working directory.

train_data, test_data, submission_data = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'submission.csv')
)

In [3]:
# Preview the first five training rows.
train_data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,52,Private,98926,HS-grad,9,Never-married,Other-service,Unmarried,White,Male,-3,0,44,United-States,<=50K
1,76,Self-emp-not-inc,132607,Bachelors,12,Married-civ-spouse,Sales,Husband,White,Male,7527,0,44,United-States,>50K
2,40,Private,243258,Bachelors,13,Married-civ-spouse,Transport-moving,Husband,Amer-Indian-Eskimo,Male,19,0,57,United-States,>50K
3,76,State-gov,181259,Bachelors,12,Married-civ-spouse,Prof-specialty,Husband,White,Male,7720,0,44,United-States,>50K
4,36,Self-emp-inc,115379,Masters,13,Divorced,Exec-managerial,Not-in-family,White,Male,12,0,58,United-States,<=50K


In [4]:
# Segregate the columns into categorical features, numeric features and the target.

# The target is declared up front so that `target_col` always exists, instead of
# only being created when the loop happens to meet a numeric 'hours-per-week' column.
target_col = ['hours-per-week']

# Every column except the target is a candidate feature.
feature_cols = [col for col in train_data.columns if col not in target_col]

# Object-dtype columns are treated as categorical; every other feature is numeric.
categorical_cols = [col for col in feature_cols if train_data[col].dtype == 'object']
numeric_cols = [col for col in feature_cols if col not in categorical_cols]

print('The categorical cols are: ', categorical_cols)
print('The numeric cols are:', numeric_cols)
print('The target cols are:', target_col)

The categorical cols are:  ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']
The numeric cols are: ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss']
The target cols are: ['hours-per-week']


In [5]:
# Count how many training columns contain at least one missing value.
num_cols_with_missing = train_data.isna().any().sum()
num_cols_with_missing

0

In [6]:
# Print the frequency of every level for each categorical column.

for column_name, column_values in train_data[categorical_cols].items():
    print('Column:', column_name)
    print(column_values.value_counts())

Column: workclass
 Private             11700
 Self-emp-not-inc     3089
 Local-gov            1159
 Federal-gov           995
 Self-emp-inc          861
 State-gov             819
 Without-pay           258
 Never-worked           63
Name: workclass, dtype: int64
Column: education
 HS-grad         5462
 Some-college    3745
 Bachelors       3337
 11th            1000
 Masters          941
 Assoc-acdm       796
 Assoc-voc        769
 7th-8th          750
 Prof-school      476
 Doctorate        387
 10th             386
 5th-6th          319
 9th              200
 12th             184
 1st-4th          148
 Preschool         44
Name: education, dtype: int64
Column: marital-status
 Married-civ-spouse       8311
 Never-married            5857
 Divorced                 2697
 Separated                 772
 Widowed                   624
 Married-spouse-absent     461
 Married-AF-spouse         222
Name: marital-status, dtype: int64
Column: occupation
 Adm-clerical         3957
 Prof-specialty

In [7]:
# Split the labelled data into training and validation sets; keep the test features separately.

y = train_data[target_col]
X = train_data.drop(columns=target_col)

# errors="ignore" keeps this working when the test file ships without the target
# column; when the column is present it is dropped exactly as before.
X_test = test_data.drop(columns=target_col, errors="ignore")

# 80/20 hold-out split with a fixed seed so the validation score is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

In [8]:
# One-hot encode the categorical features of the train, validation and test splits.


def one_hot_encode_frame(frame, encoder, columns):
    """Return ``frame`` with ``columns`` replaced by their one-hot indicators.

    Parameters
    ----------
    frame : pd.DataFrame
        Feature table that contains every name listed in ``columns``.
    encoder : OneHotEncoder
        An already fitted encoder; it is only used to transform here.
    columns : list of str
        Categorical columns to encode and then remove from ``frame``.

    Returns
    -------
    pd.DataFrame
        The remaining columns of ``frame`` followed by the indicator columns
        (labelled 0..k-1), aligned on the index of ``frame``.
    """
    encoded = encoder.transform(frame[columns])
    # Densify a sparse result here rather than through the encoder's
    # `sparse` / `sparse_output` constructor keyword, whose name differs
    # between scikit-learn releases.
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()
    # The transform output carries no index; restore the caller's index so the
    # concat below aligns row by row.
    encoded_frame = pd.DataFrame(encoded, index=frame.index)
    return pd.concat([frame.drop(columns, axis=1), encoded_frame], axis=1)


# Fit on the training split only, then reuse the fitted encoder for every split.
# handle_unknown='ignore' lets transform() accept categories not seen while fitting.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_encoder.fit(X_train[categorical_cols])

OH_X_train = one_hot_encode_frame(X_train, OH_encoder, categorical_cols)
OH_X_valid = one_hot_encode_frame(X_valid, OH_encoder, categorical_cols)
OH_X_test = one_hot_encode_frame(X_test, OH_encoder, categorical_cols)

In [9]:
# Hyperparameters found with an Optuna search.

params = dict(
    learning_rate=0.029461025206827262,
    reg_lambda=2.1256796771411217,
    reg_alpha=0.00016302671940194723,
    subsample=0.4830761540400466,
    colsample_bytree=0.6822417969114689,
    max_depth=4,
)

In [10]:
# Define the model, fit it on the training split, and report RMSE on the validation split.

model = XGBRegressor(
    random_state=0, 
    # Optional GPU settings, left disabled:
    #tree_method='gpu_hist',
    #gpu_id=0,
    #predictor="gpu_predictor",
    n_estimators=5000,
    **params
)

# Training stops once the validation RMSE has not improved for 300 rounds.
# NOTE(review): newer XGBoost releases deprecate passing `early_stopping_rounds`
# to fit() in favour of the constructor argument; confirm against the installed
# version before upgrading.
model.fit(OH_X_train, y_train, early_stopping_rounds=300, eval_set=[(OH_X_valid, y_valid)], verbose=1000)
preds_valid = model.predict(OH_X_valid)

# RMSE is computed as sqrt(MSE) explicitly instead of via the `squared=False`
# flag, which newer scikit-learn releases no longer accept.
# NOTE(review): the same validation split drives early stopping and this score,
# so the reported number is likely optimistic.
rmse = np.sqrt(mean_squared_error(y_valid, preds_valid))
print(rmse)

[0]	validation_0-rmse:39.79749
[485]	validation_0-rmse:9.55095
9.516934884291608


In [11]:
# Predict the target ('hours-per-week') for the one-hot encoded test features
# using the model fitted above.

final_preds_xgboost = model.predict(OH_X_test)

In [12]:
# Store the test predictions in the submission template and save it as a CSV for upload.

submission_path = "submission_xg_boost_fine_tune_1.csv"

submission_data['hours-per-week'] = final_preds_xgboost
submission_data.to_csv(submission_path, index=False)