## Job parameters and secrets

In [2]:
# Widget defaults for interactive runs -- uncomment when no job supplies/overrides the values
#dbutils.widgets.text("model_version", "1.0.0")
#dbutils.widgets.text("storage_account", "modelreg")
#dbutils.widgets.text("model_name", "batcomputer")

# Parameters passed into the job, read back from the notebook widgets.
# The "model_name" widget value is used as the blob storage container name.
MODEL_VERSION = dbutils.widgets.get("model_version")
STORAGE_CONTAINER = dbutils.widgets.get("model_name")
STORAGE_ACCOUNT = dbutils.widgets.get("storage_account")

# The storage key is not a job parameter: it is read from the "ai-deploy-secrets"
# secret scope (kept in Azure Key Vault, according to the original note here)
STORAGE_KEY = dbutils.secrets.get("ai-deploy-secrets", "storage-key")

## Pandas - Extracting data

In [15]:
import pandas as pd
import numpy as np

# Load the raw outcomes table from CSV into the DataFrame `df`.
# low_memory=False makes pandas parse the file in one pass instead of in chunks,
# which avoids mixed-dtype inference between chunks at the cost of more memory.
df = pd.read_csv('../data/prc-outcomes-open-data-aprjun2018-tables.csv', low_memory=False)

## Pandas - Encoding & Extending Data

In [28]:
import pprint
# Shared pretty-printer (4-space indent) used later in this cell to dump a sample row and the mapping dict
pp = pprint.PrettyPrinter(indent=4)

# Derive the binary 'Prosecuted_E' value from an 'Outcome Type' code.
# NOTE(review): the earlier comment here said only Outcome Type == 1 is a
# prosecution, but the code counts every type <= 3 as prosecuted -- confirm
# which of the two is intended.
def mapProsecuted(val):
    """Return 1 when the outcome type `val` is <= 3, otherwise 0."""
    return 1 if val <= 3 else 0
# Add the binary target column: mapProsecuted() applied to every 'Outcome Type'
df['Prosecuted_E'] = df['Outcome Type'].map(mapProsecuted).astype(int)

from sklearn.preprocessing import LabelEncoder

# Integer-encode the offence subgroup and keep the label -> code lookup.
# Codes follow the position of each label in enc.classes_.
enc = LabelEncoder()
df['Offence Subgroup_E'] = enc.fit_transform(df['Offence Subgroup'])
offence_mapping = {label: code for code, label in enumerate(enc.classes_)}

# Same treatment for the police force name. A fresh encoder is used, so after
# this cell `enc` refers to the 'Force Name' encoder.
# ('Offence Group' is not encoded; that step was disabled in this notebook.)
enc = LabelEncoder()
df['Force Name_E'] = enc.fit_transform(df['Force Name'])
force_mapping = {label: code for code, label in enumerate(enc.classes_)}

# df.info() writes its report to stdout and returns None, so it is called bare;
# wrapping it in print() added a stray "None" line after the report.
df.info()

# Spot-check one row by position: columns 2, 5 and 10 are raw fields, 13-15 are
# the three encoded columns added above (assumes the CSV itself has 13 columns,
# as the "total 16 columns" report suggests -- TODO confirm)
pp.pprint(df.iloc[1003, [2,5,10,13,14,15]])
pp.pprint(offence_mapping)

# Get our training data in NumPy format
train_data = df.values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 621280 entries, 0 to 621279
Data columns (total 16 columns):
Financial Year                                     621280 non-null object
Financial Quarter                                  621280 non-null int64
Force Name                                         621280 non-null object
Offence Description                                621280 non-null object
Offence Group                                      621280 non-null object
Offence Subgroup                                   621280 non-null object
Offence Code                                       621280 non-null object
Offence code expired                               140800 non-null object
Outcome Description                                621280 non-null object
Outcome Group                                      621280 non-null object
Outcome Type                                       621280 non-null int64
Force outcomes for offences recorded in quarter    621280 non-null object
For

## Scikit-learn - Training the model

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Features: every column from position 14 onward; target: column 13.
# Given the order in which the derived columns were appended to df, these should
# be ['Offence Subgroup_E', 'Force Name_E'] and 'Prosecuted_E' respectively
# (assumes the source CSV has 13 columns -- TODO confirm).
X = train_data[:, 14:].astype(int)
y = train_data[:, 13].astype(int)

# Train a 100-tree random forest. random_state is pinned so that re-running the
# notebook reproduces the same model; without it every run fits a different forest.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model = model.fit(X, y)

## Test the trained model

In [33]:
# One ad-hoc prediction for the feature vector [1, 23]
answer = model.predict_proba([[1, 23]])

# Print the class probabilities for every offence subgroup code, with the second
# feature held at 20 throughout (presumably a 'Force Name_E' code -- verify)
for offence_label in offence_mapping:
    offence_code = offence_mapping[offence_label]
    print(offence_label, offence_code)
    print(model.predict_proba([[offence_code, 20]]))

Arson 0
[[0.80677101 0.19322899]]
Bicycle theft 1
[[0.8118852 0.1881148]]
Criminal damage 2
[[0.82576959 0.17423041]]
Domestic burglary 3
[[0.8122679 0.1877321]]
Fraud offences to 2012/13 4
[[0.84987477 0.15012523]]
Harassment 5
[[0.80970685 0.19029315]]
Homicide 6
[[0.80771689 0.19228311]]
Miscellaneous crimes 7
[[0.82224214 0.17775786]]
Non-domestic burglary 8
[[0.80818312 0.19181688]]
Other sexual offences 9
[[0.81100828 0.18899172]]
Other theft offences 10
[[0.8092016 0.1907984]]
Possession of drugs 11
[[0.80916807 0.19083193]]
Possession of weapons offences 12
[[0.80875372 0.19124628]]
Public order offences 13
[[0.82879847 0.17120153]]
Rape 14
[[0.80950269 0.19049731]]
Robbery 15
[[0.80241129 0.19758871]]
Shoplifting 16
[[0.8045839 0.1954161]]
Theft from a vehicle 17
[[0.81033375 0.18966625]]
Theft from the person 18
[[0.80998363 0.19001637]]
Theft of a motor vehicle 19
[[0.80775018 0.19224982]]
Trafficking of drugs 20
[[0.80973842 0.19026158]]
Vehicle interference 21
[[0.81715827

## Pickle model and other mapping files

In [12]:
# Create pickles and data lookup
from collections import OrderedDict
import pickle

# Lookup describing the model inputs, one entry per feature, each holding the
# label -> integer code mapping produced by the encoders.
# ORDER IS IMPORTANT! Entries are added one by one in the order of the columns
# the model was trained on (offence subgroup code first, then force code --
# assuming the positional slicing of train_data picks those two columns).
# The previous entries (Pclass, Age, SibSp, Parch, Fare, Gender, Port) described
# a different model's inputs and did not match this model's two features.
# NOTE(review): the key names below are taken from the source column names;
# confirm them against whatever consumes lookup.pkl.
lookup = OrderedDict()
lookup["Offence Subgroup"] = {str(label): int(code) for label, code in offence_mapping.items()}
lookup["Force Name"] = {str(label): int(code) for label, code in force_mapping.items()}

# Create output lookup: labels for the two predict_proba columns.
# The target is 0 = not prosecuted, 1 = prosecuted (see mapProsecuted).
flags = ["not_prosecuted_proba", "prosecuted_proba"]

# Pickle the model, lookup and flags to the working directory. The `with` block
# closes each file on exit, so no explicit close() call is needed.
for pickle_name, payload in (("model.pkl", model), ("lookup.pkl", lookup), ("flags.pkl", flags)):
    with open(pickle_name, 'wb') as file:
        pickle.dump(payload, file)

In [13]:
#!pip install azure-storage
from azure.storage.blob import BlockBlobService

# Client used to call the Blob service of the target storage account
block_blob_service = BlockBlobService(account_key=STORAGE_KEY, account_name=STORAGE_ACCOUNT)

# Create the container that receives the model artefacts
block_blob_service.create_container(STORAGE_CONTAINER)

# Upload each pickle from the working directory; the blob name is the local
# file name prefixed with "<MODEL_VERSION>/"
for local_file_name in ("model.pkl", "lookup.pkl", "flags.pkl"):
    block_blob_service.create_blob_from_path(
        STORAGE_CONTAINER, MODEL_VERSION + "/" + local_file_name, local_file_name
    )