# Should you question an invoice sent by a supplier?


For updates on how SageMaker or AWS behavior has changed relative to the code in this notebook, please refer to https://livebook.manning.com/#!/book/machine-learning-for-business/chapter-5/v-5/137

## Part 1: Load and examine the data

To run the code in the notebook cell below, change the value of `data_bucket` from 'ml4-business' to the name of the S3 bucket holding your data, then click into the cell and press Ctrl+Enter.

In [1]:
# Location of the input data. The load cell reads
# s3://<data_bucket>/<subfolder>/<dataset>, and the training cell reuses
# data_bucket and subfolder for its data and output locations.
data_bucket = 'ml4-business'  # S3 bucket name; change this to your own bucket
subfolder = 'ch05'  # folder (key prefix) inside the bucket
dataset = 'activities.csv'  # CSV file loaded into the DataFrame

In [2]:
import pandas as pd
import boto3
import s3fs
import sagemaker
from sklearn.model_selection import train_test_split
import json
import csv
from time import sleep

# Execution role of this notebook; it is passed as `role=` to the
# RandomCutForest estimator in Part 4.
role = sagemaker.get_execution_role()
# S3 filesystem handle created with anon=False (i.e. not anonymous access).
# NOTE(review): `s3` is not referenced by any other cell visible in this
# notebook - confirm whether it is still needed.
s3 = s3fs.S3FileSystem(anon=False)

In [3]:
# Read the CSV straight from S3 into a DataFrame.
csv_url = f's3://{data_bucket}/{subfolder}/{dataset}'
df = pd.read_csv(csv_url)

# Preview three rows (positions 5, 6 and 7) to get a feel for the columns.
display(df.iloc[5:8])

Unnamed: 0,Matter Number,Firm Name,Matter Type,Resource,Activity,Minutes,Fee,Total,Error
5,0,Cox Group,Antitrust,Paralegal,Attend Court,110,50,91.67,False
6,0,Cox Group,Antitrust,Junior,Attend Court,505,150,1262.5,True
7,0,Cox Group,Antitrust,Paralegal,Attend Meeting,60,50,50.0,False


In [4]:
# Tally the lines by their 'Error' flag (False = no error, True = error) to
# see how many lines in the dataset are marked as errors.
df['Error'].value_counts()

False    103935
True       2030
Name: Error, dtype: int64

In [5]:
# Summarise the dataset: the total row count first, then how often each value
# occurs in the three categorical columns.
print(f'Number of rows in dataset: {df.shape[0]}')

summaries = (
    ('Matter types:', 'Matter Type'),
    ('Resources:', 'Resource'),
    ('Activities:', 'Activity'),
)
for heading, column in summaries:
    print()  # blank line between sections
    print(heading)
    print(df[column].value_counts())

Number of rows in dataset: 105965

Matter types:
Antitrust                 23922
Insolvency                16499
IPO                       14236
Commercial arbitration    12927
Project finance           11776
M&A                        6460
Structured finance         5498
Asset recovery             4913
Tax planning               4871
Securities litigation      4863
Name: Matter Type, dtype: int64

Resources:
Partner      26587
Junior       26543
Paralegal    26519
Senior       26316
Name: Resource, dtype: int64

Activities:
Prepare Opinion    26605
Phone Call         26586
Attend Court       26405
Attend Meeting     26369
Name: Activity, dtype: int64


## Part 2: Get the data into the right shape

In [6]:
# One-hot encode the three categorical columns: each distinct value becomes its
# own 0/1 indicator column named '<column>_<value>'.
#
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version. From pandas 2.0 the default indicator dtype is bool, and bool columns
# mixed with the numeric columns would no longer give the plain numeric matrix
# that `.values` is expected to hand to the training and prediction cells.
encoded_df = pd.get_dummies(
    df,
    columns=['Matter Type', 'Resource', 'Activity'],
    dtype='uint8',
)
encoded_df.head(3)

Unnamed: 0,Matter Number,Firm Name,Minutes,Fee,Total,Error,Matter Type_Antitrust,Matter Type_Asset recovery,Matter Type_Commercial arbitration,Matter Type_IPO,...,Matter Type_Structured finance,Matter Type_Tax planning,Resource_Junior,Resource_Paralegal,Resource_Partner,Resource_Senior,Activity_Attend Court,Activity_Attend Meeting,Activity_Phone Call,Activity_Prepare Opinion
0,0,Cox Group,85,70,99.17,False,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,0,Cox Group,505,150,1262.5,False,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,0,Cox Group,100,180,300.0,False,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1


## Part 3: Create training and validation datasets

In [7]:
# Columns that must not be fed to the model: 'Error' is the known outcome and
# 'Firm Name' is free text.
non_feature_columns = ['Error', 'Firm Name']

# Hold out 20% of the rows for validation. random_state is fixed so the split
# is reproducible. The two label outputs of the split are discarded because
# the full frames (which still contain 'Error') are kept instead.
train_df, val_df, _, _ = train_test_split(
    encoded_df,
    encoded_df['Error'],
    test_size=0.2,
    random_state=0,
)

# Feature-only copies used for training and for scoring.
train_df_no_result = train_df.drop(columns=non_feature_columns)
val_df_no_result = val_df.drop(columns=non_feature_columns)

print(f'{train_df.shape[0]} rows in training data')
print(f'{val_df.shape[0]} rows in validation data')

84772 rows in training data
21193 rows in validation data


## Part 4: Train the model



In [8]:
from sagemaker import RandomCutForest

# Session object; later cells use it to delete the endpoint.
session = sagemaker.Session()

# S3 locations handed to the estimator, both under the configured bucket and
# subfolder.
training_data_location = f's3://{data_bucket}/{subfolder}/'
model_output_location = f's3://{data_bucket}/{subfolder}/output'

rcf = RandomCutForest(
    role=role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    data_location=training_data_location,
    output_path=model_output_location,
    num_samples_per_tree=100,
    num_trees=50,
)

# Turn the feature matrix into a record set (this uploads the training data to
# S3), then run the training job on it.
training_records = rcf.record_set(train_df_no_result.values)
rcf.fit(training_records)

2020-04-06 17:53:14 Starting - Starting the training job...
2020-04-06 17:53:15 Starting - Launching requested ML instances......
2020-04-06 17:54:19 Starting - Preparing the instances for training...
2020-04-06 17:55:13 Downloading - Downloading input data...
2020-04-06 17:55:28 Training - Downloading the training image....[34mDocker entrypoint called with argument(s): train[0m
  from numpy.testing.nosetester import import_nose[0m
  from numpy.testing.decorators import setastest[0m
[34m[04/06/2020 17:56:20 INFO 140404825405248] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'_ftp_port': 8999, u'num_samples_per_tree': 256, u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'_log_level': u'info', u'_kvstore': u'dist_async', u'force_dense': u'true', u'epochs': 1, u'num_trees': 100, u'eval_metrics': [u'accuracy', u'precision_recall_fscore'], u'_num_kv_servers': u'auto', u'mini_batch_size': 1000}[0m
[34m[04/


2020-04-06 17:56:29 Uploading - Uploading generated training model
2020-04-06 17:56:29 Completed - Training job completed
Training seconds: 76
Billable seconds: 76


## Part 5: Host the model

In [9]:
endpoint_name = 'suspicious-lines'

# Best-effort clean-up: remove any endpoint left over from a previous run so
# the deploy cell can reuse the same name.
try:
    session.delete_endpoint(endpoint_name)
except Exception:
    # Deliberately ignored: presumably there was simply no endpoint with this
    # name to delete. `Exception` (rather than a bare `except:`) is caught so
    # that KeyboardInterrupt / SystemExit still stop the cell.
    pass
else:
    # Only reached when the deletion call succeeded. Kept outside the `try`
    # so that interrupting the wait is not silently swallowed.
    print('Warning: Existing endpoint deleted to make way for your new endpoint.')
    sleep(30)  # presumably gives AWS time to finish the deletion before redeploying

In [10]:
# Host the trained model: deploy it to an endpoint named `endpoint_name`,
# requesting one ml.m4.xlarge instance. The returned object is configured in
# the next cell and used for predictions in Part 6.
rcf_endpoint = rcf.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge', 
    endpoint_name=endpoint_name
)

---------------!

In [11]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Requests: send the input rows as CSV, labelled with the text/csv content type.
rcf_endpoint.content_type = 'text/csv'
rcf_endpoint.serializer = csv_serializer
# Responses: ask for JSON and deserialize it, so the result of predict() can be
# indexed by key (Part 6 reads its 'scores' entry).
rcf_endpoint.accept = 'application/json'
rcf_endpoint.deserializer = json_deserializer

## Part 6: Test the model

In [None]:
# Score every validation row with the hosted model and collect the returned
# 'scores' entry in its own DataFrame.
results = rcf_endpoint.predict(val_df_no_result.values)
scores_df = pd.DataFrame(results['scores'])
# Renumber the validation rows from 0 so they line up with scores_df when the
# two frames are placed side by side.
# NOTE(review): this assumes the scores come back in the same order as the
# rows that were sent - confirm.
val_df = val_df.reset_index(drop=True)
results_df = pd.concat([val_df, scores_df], axis=1)
# Count the known error / non-error lines in the validation results.
results_df['Error'].value_counts()

In [None]:
# Use the median score of the lines known to be errors as the cutoff.
error_scores = results_df.loc[results_df['Error'] == True, 'score']
score_cutoff = error_scores.median()
print(f'Score cutoff: {score_cutoff}')

# Keep only the lines scoring above the cutoff and count how many of them are
# real errors versus non-errors.
above_cutoff = results_df['score'] > score_cutoff
results_above_cutoff = results_df[above_cutoff]
results_above_cutoff['Error'].value_counts()

In [None]:
# Mark a line as a predicted error when its score is strictly above the cutoff.
results_df['Prediction'] = results_df['score'].gt(score_cutoff)
results_df.head()

## Remove the Endpoint (optional)

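Comment out this cell if you want the endpoint to keep running after you choose "Run All". Bear in mind that an endpoint left running continues to incur AWS charges until it is deleted.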

In [None]:
# Delete the hosted endpoint named `endpoint_name` (the one deployed in Part 5).
session.delete_endpoint(endpoint_name)