## 1) Importing libraries

In [1]:
import sys
import os
import lime
import lime.lime_tabular
import dill as pickle
#sys.path.append(os.path.abspath("../../"))
from ml_classification import *
from generic_preprocessing import *
from IPython.display import HTML

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



## 2) Importing data

In [2]:
data = np.genfromtxt('adult.data.txt', delimiter=', ', dtype=str)
feature_names = [
    "Age", "Workclass", "fnlwgt", "Education", "Education-Num",
    "Marital Status", "Occupation", "Relationship", "Race", "Sex",
    "Capital Gain", "Capital Loss", "Hours per week", "Country", 'income'
]
data = pd.DataFrame(data)
data.columns = feature_names
data.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
## Separating train and test data
test_indices = np.random.randint(0,data.shape[0],size = int(data.shape[0]/5))
test_data = data.loc[test_indices,:].reset_index(drop=True)
train_data = data.loc[set(range(data.shape[0]))-set(test_indices),:].reset_index(drop=True)

In [4]:
print_dim(test_data)
print_dim(train_data)

Data size: Rows-6512 Columns-15
Data size: Rows-26665 Columns-15


## 3) Data processing

In [5]:
## Defining numerical and categorical variables
numerical_features = list(data.columns[[0,2,4,10,11,12]])
categorical_features = list(data.columns[[1,3,5,6,7,8,9,13]])

In [6]:
## Converting numerical feature data type to float
for i in numerical_features:
    train_data.loc[:,i] = train_data.loc[:,i].astype(float)

## Checking new datatype
print_dataunique(train_data.loc[:,numerical_features])

0 Age <class 'numpy.float64'> 73 [39. 50. 38. 53. 28.]
1 fnlwgt <class 'numpy.float64'> 18698 [ 77516.  83311. 215646. 234721. 338409.]
2 Education-Num <class 'numpy.float64'> 16 [13.  9.  7. 14.  5.]
3 Capital Gain <class 'numpy.float64'> 116 [ 2174.     0.  2407. 14344.  7688.]
4 Capital Loss <class 'numpy.float64'> 91 [   0. 2042. 1408. 1902. 1573.]
5 Hours per week <class 'numpy.float64'> 93 [40. 13. 16. 45. 30.]


In [7]:
## creating a target variable
y_label = list(train_data.income.apply(lambda x: 1 if x == '>50K' else 0))
train_data.drop(['income'],axis = 1, inplace=True)

In [26]:
## One hot encoding categorical features
data_processed, le_dict = label_encoder(train_data,categorical_features)
data_processed.head()

Label encoding column - Workclass
Label encoding column - Education
Label encoding column - Marital Status
Label encoding column - Occupation
Label encoding column - Relationship
Label encoding column - Race
Label encoding column - Sex
Label encoding column - Country


Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country
0,28.0,4,338409.0,9,13.0,2,10,5,2,0,0.0,0.0,40.0,5
1,37.0,4,284582.0,12,14.0,2,4,5,4,0,0.0,0.0,40.0,39
2,31.0,4,45781.0,12,14.0,4,10,1,4,0,14084.0,0.0,50.0,39
3,37.0,4,280464.0,15,10.0,2,4,0,2,1,0.0,0.0,80.0,39
4,30.0,7,141297.0,9,13.0,2,10,0,1,1,0.0,0.0,40.0,19


In [27]:
## Creating hold-out CV
X_train, X_test, y_train, y_test = holdout_cv(data_processed,y_label,size =0.3)
X_train = X_train.reset_index(drop='index')
X_test = X_test.reset_index(drop='index')

## 4) Model Building - Binary Classification

In [31]:
## Running an Xgboost model fo model building and model explaination
pred_y_test, loss,_,rf_model = runRF(X_train.values, y_train, X_test.values, y_test,rounds=187)

Train and Test loss :  0.9369329013864193 0.9131041971553713


## 5) Serializing essential files

In [32]:
## Numerical column names
pickle.dump(numerical_features,open('../model/numerical_list.p','wb'))
pickle.dump(xgb_model,open('../model/rf_model.p','wb'))
pickle.dump(le_dict,open('../model/le_dict.p','wb'))

## 6) Hitting the  API

In [8]:
## Importing libraries  
import json ##for converting data to json format
import requests ## Hitting the api with data

In [9]:
## Converting test data to JSON format
"""
Setting the headers to send and accept json responses
"""
header = {'Content-Type': 'application/json', \
                  'Accept': 'application/json'}

"""Converting Pandas Dataframe to json
"""
data = test_data.drop(['income'],axis=1).to_json(orient='records')

### Run "python main.py" in flask_example folder before running the code below

In [59]:
"""
POST <url>/predict
"""
resp = requests.post("http://localhost:5000/predict", \
                    data = json.dumps(data),\
                    headers= header)

In [60]:
## Printing predictions
print("Response code:{0}".format(resp.status_code))
print(pd.DataFrame(eval(resp.json()['predictions'])).head())

Response code:200
   index  predictions
0      0     0.113134
1      1     0.047628
2      2     0.075332
3      3     0.171609
4      4     0.268776


### Checking api running from docker

- Go to "docker folder"
- Open docker quickstart terminal and go to this "docker" foler
- To create a "docker image run docker build -t flaskexample" , where flaskexample will be the name of docker image
- After image is created run this command - "docker run -p 4000:80 flaskexample" to run a container with our app
- Run the code below to check if API is running properly within docker 

In [63]:
"""
POST <url>/predict
** NOTE-IP address changes depending on the OS the docker is running, for windows below link will work 
"""
resp = requests.post("http://192.168.99.100:4000/predict", \
                    data = json.dumps(data),\
                    headers= header)

In [64]:
## Printing predictions
print("Response code:{0}".format(resp.status_code))
print(pd.DataFrame(eval(resp.json()['predictions'])).head())

Response code:200
   index  predictions
0      0     0.113134
1      1     0.047628
2      2     0.075332
3      3     0.171609
4      4     0.268776
