# Data

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt



In [2]:
df_adult=pd.read_csv('adult.csv')

In [7]:
df_adult=df_adult.replace('?', np.nan)

In [8]:
df_adult.to_csv('adult2.csv',index=False)

In [10]:
# pd.read_csv('adult2.csv')

# Training Script

In [16]:
# LIBRARY
#Basic Operation
import pandas as pd
import numpy as np

#ML models
from sklearn.linear_model import LogisticRegression

#Feature Engineering
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import category_encoders as ce

# Evaluation
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import f1_score

# Saving models
import pickle
import joblib

# DATA
adult= pd.read_csv('adult2.csv')

#Preprocessing
binary_encoder_pipe=Pipeline([
    ('imputer',SimpleImputer(strategy='constant', fill_value='NC')),
    ('binary encoding',ce.BinaryEncoder())
])

transformer= ColumnTransformer([
    ('one hot encoder', OneHotEncoder(drop= 'first'), ['relationship','race','sex']),
    ('binary encoder', binary_encoder_pipe,['workclass','marital.status','occupation','native.country'])
], remainder='passthrough')

#data splitting
x=adult.drop(columns=['fnlwgt','income','education'])
y=np.where(adult['income']=='>50K',1,0)

# Model Selection

model= LogisticRegression(solver='liblinear', random_state=2020)
estimator=Pipeline([
    ('preprocess', transformer),
    ('estimator', model)
])

hyperparam_space= {
    'estimator__C':[100,10,1,0.1,0.001],
    'estimator__solver':['liblinear','newton-cg']
}

skfold= StratifiedKFold(n_splits=5)

grid_search= GridSearchCV(
    estimator,
    param_grid=hyperparam_space,
    cv=skfold,
    scoring='f1',
    n_jobs=-1
)

grid_search.fit(x,y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('one '
                                                                         'hot '
                                                                         'encoder',
                                                                         OneHotEncoder(drop='first'),
                                                                         ['relationship',
                                                                          'race',
                                                                          'sex']),
                                                                        ('binary '
                                                                         'encoder',
                

## Pickle
### Saving Model

In [21]:
# Model Picling
best_model= grid_search.best_estimator_.fit(x,y) #final model
file_name='Model Final.sav'
pickle.dump(best_model,open(file_name,'wb'))

### Predict with Saved Model

In [22]:
# load model
file_name='Model Final.sav'
loaded_model=pickle.load(open(file_name,'rb'))

In [58]:
data1=df_adult.loc[0:5].drop(columns=['fnlwgt','income','education'])

In [59]:
loaded_model.predict(data1)

array([0, 1, 0, 0, 0, 0])

In [60]:
y[0:5]

array([0, 0, 0, 0, 0])

In [62]:
x.columns

Index(['age', 'workclass', 'education.num', 'marital.status', 'occupation',
       'relationship', 'race', 'sex', 'capital.gain', 'capital.loss',
       'hours.per.week', 'native.country'],
      dtype='object')

In [67]:
df_predict=pd.DataFrame({
    'age':[43],
    'workclass':['Self-emp-not-inc'],
    'education.num':[13],
    'marital.status':['Separated'],
    'occupation':['Craft-repair'],
    'relationship':['Unmarried'],
    'race': ['White'],
    'sex': ['Male'],
    'capital.gain':[0],
    'capital.loss':[0],
    'hours.per.week':[35],
    'native.country':['United-States']
})
df_predict

Unnamed: 0,age,workclass,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,43,Self-emp-not-inc,13,Separated,Craft-repair,Unmarried,White,Male,0,0,35,United-States


In [68]:
loaded_model.predict(df_predict)

array([0])

In [70]:
loaded_model.predict_proba(df_predict)

array([[0.87066118, 0.12933882]])

## Joblib

### Saving Model

In [74]:
best_model= grid_search.best_estimator_.fit(x,y) #final model
joblib.dump(best_model,'model_joblib')

['model_joblib']

### Predict with Saved Model

In [72]:
df_predict

Unnamed: 0,age,workclass,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,43,Self-emp-not-inc,13,Separated,Craft-repair,Unmarried,White,Male,0,0,35,United-States


In [75]:
model_joblib=joblib.load('model_joblib')

In [76]:
model_joblib.predict(df_predict)

array([0])

In [77]:
model_joblib.predict_proba(df_predict)

array([[0.87066118, 0.12933882]])