## Job parameters and secrets

In [2]:
# Widget defaults (currently disabled); the values are expected to be passed in by the job
#dbutils.widgets.text("model_version", "1.0.0")
#dbutils.widgets.text("storage_account", "modelreg")
#dbutils.widgets.text("model_name", "batcomputer")

# Job parameters, read from the notebook widgets
MODEL_VERSION     = dbutils.widgets.get("model_version")
STORAGE_ACCOUNT   = dbutils.widgets.get("storage_account")
STORAGE_CONTAINER = dbutils.widgets.get("model_name")  # the model name is used as the container name

# Storage key is kept in Azure Key Vault and read through the "ai-deploy-secrets" secret scope
STORAGE_KEY       = dbutils.secrets.get("ai-deploy-secrets", "storage-key")

## Pandas - Extracting data

In [33]:
import pandas as pd
import numpy as np

# Read the outcomes CSV into a DataFrame.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
df = pd.read_csv(
    '../data/prc-outcomes-open-data-aprjun2018-tables.csv',
    low_memory=False,
)

## Pandas - Encoding & Extending Data

In [71]:
# Extend Prosecuted_E column with mapping of 'Outcome Type'
# Outcome Type == 1 is a prosecution, other types we count as "not prosecuted"
def mapProsecuted(val):
    """Encode an 'Outcome Type' value as a binary prosecuted flag.

    Returns 1 when ``val`` equals 1 and 0 for every other value.
    """
    return 1 if val == 1 else 0

from sklearn.preprocessing import LabelEncoder

# Binary target column: 1 where 'Outcome Type' is 1, otherwise 0
df['Prosecuted_E'] = df['Outcome Type'].map(mapProsecuted)

# Integer-encode the 'Offence Subgroup' labels into a new column
enc = LabelEncoder()
df['Offence Subgroup_E'] = enc.fit_transform(df['Offence Subgroup'])

# Label -> integer code lookup, following the order of the encoder's classes
offence_mapping = {label: code for code, label in enumerate(enc.classes_)}

# Debug helpers, uncomment to inspect the encoded rows / the mapping:
#df.iloc[1930:1990, [2,5,13,14]]
#print(offence_mapping)

# Get our training data in NumPy format
# NOTE(review): `data` is not defined in any cell visible in this notebook (only `df` is).
# Unless `data` is created elsewhere, this line raises NameError on a fresh kernel —
# confirm which frame and which columns are meant to feed the model.
train_data = data.values

{'Arson': 0, 'Bicycle theft': 1, 'Criminal damage': 2, 'Domestic burglary': 3, 'Fraud offences to 2012/13': 4, 'Harassment': 5, 'Homicide': 6, 'Miscellaneous crimes': 7, 'Non-domestic burglary': 8, 'Other sexual offences': 9, 'Other theft offences': 10, 'Possession of drugs': 11, 'Possession of weapons offences': 12, 'Public order offences': 13, 'Rape': 14, 'Robbery': 15, 'Shoplifting': 16, 'Theft from a vehicle': 17, 'Theft from the person': 18, 'Theft of a motor vehicle': 19, 'Trafficking of drugs': 20, 'Vehicle interference': 21, 'Violence with injury': 22, 'Violence without injury': 23}


## Scikit-learn - Training the model

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so that re-running the notebook produces the same forest
RANDOM_SEED = 42

# Train a 100-tree RandomForestClassifier.
# Column 0 of train_data is used as the label and columns 2 onwards as the features
# (column 1 is not used for training).
model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
model = model.fit(train_data[:, 2:], train_data[:, 0])

## Test - Sample prediction

In [10]:
# Class probabilities for a single hand-written sample of seven feature values
sample = [[3, 42, 0, 0, 2, 1, 1]]
answer = model.predict_proba(sample)

# Show the probability row of the only sample
print(answer[0])

## Pickle model and other mapping files

In [12]:
# Create pickles and data lookup 
from collections import OrderedDict
import pickle

# Feature lookup that is stored next to the model.
# ORDER IS IMPORTANT! An OrderedDict built from a list of pairs keeps exactly this order.
# Entries are either 0 or a dict mapping a label to its numeric code
# (presumably 0 means "no label mapping for this feature" — confirm with the consumer).
# NOTE(review): these feature names and the output flags below do not match the
# 'Outcome Type' / 'Offence Subgroup' columns prepared earlier in this notebook —
# confirm they describe the features the model is actually trained on.
lookup = OrderedDict([
    ("Pclass", 0),
    ("Age", 0),
    ("SibSp", 0),
    ("Parch", 0),
    ("Fare", 0),
    ("Gender", {"male": 1, "female": 0}),
    ("Port", {"Cherbourg": 1, "Southampton": 2, "Queenstown": 3}),
])

# Create output lookup: names for the predicted probabilities
flags = ["died_proba", "survived_proba"]

# Write each artifact to its own pickle file.
# The `with` block closes the file on exit, so no explicit close() is needed.
for pickle_path, artifact in (
    ("model.pkl", model),
    ("lookup.pkl", lookup),
    ("flags.pkl", flags),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(artifact, pickle_file)

In [13]:
#!pip install azure-storage
from azure.storage.blob import BlockBlobService

# Create the BlockBlobService client used to call the Blob service of the storage account
block_blob_service = BlockBlobService(account_name=STORAGE_ACCOUNT, account_key=STORAGE_KEY)

# Create the container that receives the artifacts
block_blob_service.create_container(STORAGE_CONTAINER)

# Upload each local pickle; the blob name is the file name under a "<MODEL_VERSION>/" prefix
for local_file_name in ("model.pkl", "lookup.pkl", "flags.pkl"):
    block_blob_service.create_blob_from_path(
        STORAGE_CONTAINER,
        MODEL_VERSION + "/" + local_file_name,
        local_file_name,
    )