In [15]:
import pandas as pd
pd.set_option('display.max_columns', None)
from datetime import timedelta,datetime
import datetime as dt
import pickle
import numpy as np

In [16]:
test_data = pd.read_csv('./Data/test_data.csv')
data=test_data.copy()

test_data.loc[test_data['income']=='<=50K.','income']='<=50K'
test_data.loc[test_data['income']=='>50K.','income']='>50K'

#One Hot Encoding of the Categorical features 
cols_to_one_hot=["workClass","education","marital-status","occupation","relationship","race","sex","native-country"]
one_hot_data=pd.get_dummies(data[cols_to_one_hot])

#Merging one hot encoded features with our dataset 'data' 
data = pd.concat([data,one_hot_data],axis=1)

#Removing categorical features 
data.drop(['workClass','education','marital-status','occupation','relationship','race','sex','native-country','education-num'],axis=1,inplace=True)

#Here our target variable is 'Income' with values as 1 or 0.  
data.loc[data['income'].isin(['<=50K','<=50K.']), 'income']=0
data.loc[data['income'].isin(['>50K','>50K.']) ,'income']=1

#Put income column to the end
data.drop(columns=["income"],inplace=True)

#Add native-country_Holand-Netherlands one hot column
data['native-country_Holand-Netherlands']=0

In [17]:
#Read Model and Make Prediction
model = pickle.load(open("./Model_Data/XGBOOST_model.pickle", "rb"))
cols_in_model=model.get_booster().feature_names
y_pred = model.predict_proba(data[cols_in_model], ntree_limit = model.get_booster().best_iteration)[:,1]
y_pred=pd.DataFrame({ "income_proba":y_pred })

In [18]:
#Concatenate Data Before Preprocessing, Preprocessed Data and Predicted Probabilities
data=pd.concat(
[test_data[['education','education-num',
 'income',
 'marital-status',
 'native-country',
 'occupation',
 'race',
 'relationship',
 'sex',
 'workClass']],data,y_pred],axis=1)

In [19]:
#Assign Predicted Income Levels
data['predicted_income']=0
data.loc[data['income_proba']>=0.21393804,'predicted_income']=1

data.loc[data['predicted_income']==0,'predicted_income_level']='<=50K'
data.loc[data['predicted_income']==1,'predicted_income_level']='>50K'

In [20]:
# Add Made Up Columns for Visualization
data['date']=datetime.today()
for _day in reversed(range(1,8)):
    data.loc[int(data.shape[0]*(7-_day)/7):int(data.shape[0]*(7-_day+1)/7),'date']=data['date']-dt.timedelta(days=_day)

data=data[data['date']>='2020-10-03']

In [21]:
data['date'].sort_values().value_counts()

2020-10-07 17:03:39.757307    2326
2020-10-05 17:03:39.757307    2326
2020-10-03 17:03:39.757307    2326
2020-10-06 17:03:39.757307    2325
2020-10-09 17:03:39.757307    2325
2020-10-04 17:03:39.757307    2325
2020-10-08 17:03:39.757307    2325
Name: date, dtype: int64

In [22]:
#Add Rank Column
data['row_num'] = np.arange(len(data))

In [23]:
#Show Data Ready to Visualize
data.head()

Unnamed: 0,education,education-num,income,marital-status,native-country,occupation,race,relationship,sex,workClass,age,fnlwgt,capital-gain,capital-loss,hours-per-week,workClass_Federal-gov,workClass_Local-gov,workClass_Never-worked,workClass_Private,workClass_Self-emp-inc,workClass_Self-emp-not-inc,workClass_State-gov,workClass_Without-pay,education_10th,education_11th,education_12th,education_1st-4th,education_5th-6th,education_7th-8th,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college,marital-status_Divorced,marital-status_Married-AF-spouse,marital-status_Married-civ-spouse,marital-status_Married-spouse-absent,marital-status_Never-married,marital-status_Separated,marital-status_Widowed,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,native-country_Cambodia,native-country_Canada,native-country_China,native-country_Columbia,native-country_Cuba,native-country_Dominican-Republic,native-country_Ecuador,native-country_El-Salvador,native-country_England,native-country_France,native-country_Germany,native-country_Greece,native-country_Guatemala,native-country_Haiti,native-country_Honduras,native-country_Hong,native-country_Hungary,native-country_India,native-country_Iran,native-country_Ireland,native-country_Italy,native-country_Jamaica,native-country_Japan,native-country_Laos,native-country_Mexico,native-country_Nicaragua,native-country_Outlying-US(Guam-USVI-etc),native-country_Peru,native-country_Philippines,native-country_Poland,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,native-country_Holand-Netherlands,income_proba,predicted_income,predicted_income_level,date,row_num
0,11th,7,<=50K,Never-married,United-States,Machine-op-inspct,Black,Own-child,Male,Private,25,226802,0,0,40,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.001765,0,<=50K,2020-10-03 17:03:39.757307,0
1,HS-grad,9,<=50K,Married-civ-spouse,United-States,Farming-fishing,White,Husband,Male,Private,38,89814,0,0,50,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.245173,1,>50K,2020-10-03 17:03:39.757307,1
2,Assoc-acdm,12,>50K,Married-civ-spouse,United-States,Protective-serv,White,Husband,Male,Local-gov,28,336951,0,0,40,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.460227,1,>50K,2020-10-03 17:03:39.757307,2
3,Some-college,10,>50K,Married-civ-spouse,United-States,Machine-op-inspct,Black,Husband,Male,Private,44,160323,7688,0,40,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.995759,1,>50K,2020-10-03 17:03:39.757307,3
4,Some-college,10,<=50K,Never-married,United-States,,White,Own-child,Female,,18,103497,0,0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.000886,0,<=50K,2020-10-03 17:03:39.757307,4


In [24]:
data.to_csv("./Data_to_Visualize/Income_Data_to_Visualize.csv")

End of Data Preparation