## Pandas - Extracting data

In [2]:
import pandas as pd
import numpy as np
import glob

# Load every CSV found under the 2017-* directories into one dataframe.
# glob.glob returns its matches in arbitrary (filesystem-dependent) order, so the paths
# are sorted to keep the row order of the combined frame reproducible between runs.
# Note: pd.concat raises ValueError when the pattern matches no files at all.
df = pd.concat(
    [pd.read_csv(csv_path) for csv_path in sorted(glob.glob('/dbfs/police-data/2017-*/*.csv'))],
    ignore_index=True,
)

## Pandas - Encoding & Extending Data

In [4]:
from sklearn.preprocessing import LabelEncoder
# A single LabelEncoder is reused for each categorical column; every fit_transform call
# refits it, so the label -> code mapping has to be captured right after each fit.
enc = LabelEncoder()

# Remove junk: drop the 'Context' column when it is present. errors='ignore' turns a
# missing column into a no-op, which replaces the previous bare `except: pass` that
# would also have hidden any unrelated failure. Then discard every row that still
# contains a missing value.
df = df.drop(['Context'], axis=1, errors='ignore')
df = df.dropna()

# Encode the outcome text as integer codes and keep the label -> code lookup
df['Last outcome category_e'] = enc.fit_transform(df['Last outcome category'])
Outcome_mapping = dict(zip(enc.classes_, range(len(enc.classes_))))

# Collapse an encoded outcome into the binary "Caught" flag (1 or 0)
def mapOutcome(val):
    """Return 1 when `val` is one of the outcome codes counted as "caught", else 0.

    The codes are the integers produced by the LabelEncoder for
    'Last outcome category', so they are only meaningful for the set of outcome
    labels the encoder was fitted on (see Outcome_mapping).

    NOTE(review): the original list contained 19 twice - possibly a typo for
    another code (10?). Confirm against Outcome_mapping.
    """
    caught_codes = (0, 1, 5, 9, 11, 15, 16, 17, 18, 19, 20, 21, 23)
    return int(val in caught_codes)
# Binary training target derived from the encoded outcome
df['Caught'] = df['Last outcome category_e'].map(mapOutcome).astype(int)

# Police force -> integer code; keep the label -> code lookup for later use
df['Falls within_e'] = enc.fit_transform(df['Falls within'])
FallsWithin_mapping = {label: code for code, label in enumerate(enc.classes_)}

# Crime type -> integer code, same pattern as above
df['Crime type_e'] = enc.fit_transform(df['Crime type'])
CrimeType_mapping = {label: code for code, label in enumerate(enc.classes_)}

# Month number: everything after the first five characters of 'Month', parsed as an int
df['Month_e'] = df['Month'].str[5:].astype(int)

# Show the crime-type codes so they can be looked up when querying the model
print(CrimeType_mapping)

# Get our training data in NumPy format (the whole frame, all columns)
train_data = df.values

## Scikit-learn - Training the model

In [6]:
#train_data[0:,14]

from sklearn.ensemble import RandomForestClassifier

# Train a RandomForestClassifier to predict 'Caught' from the encoded force, crime type
# and month.
#
# Features and target are selected by column name rather than by position in
# train_data, so the selection cannot silently shift if a column is added or removed
# upstream.
# NOTE(review): the positional slices used before were [:, 13:] for X and [:, 12] for y;
# confirm on the real data that these names correspond to that layout.
feature_columns = ['Falls within_e', 'Crime type_e', 'Month_e']
X = df[feature_columns].values.astype(int)
y = df['Caught'].values.astype(int)

# random_state fixes the forest's internal randomness so that re-running the notebook
# on the same data produces the same model.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model = model.fit(X, y)

## Test

In [8]:
# Smoke test: for every police force, print the first predict_proba column for the
# feature row [force code, 3, 1] (presumably crime code 3 and month 1).
# NOTE(review): the original comment described this as "crime 6 = possession of a weapon",
# but the row passes 3 - check CrimeType_mapping to confirm which code is intended.
for force_name, force_code in FallsWithin_mapping.items():
    class_probabilities = model.predict_proba([[force_code, 3, 1]])[0]
    print(force_name, class_probabilities[0])

## Pickle model and store in Azure storage

In [10]:
#
# Widgets are how we get values passed from a DataBricks job
#
# Model version, name & storage-account are passed into the job, and the storage key is
# kept in Azure Key Vault. All four values are fetched before any global is assigned, so a
# failure part-way through cannot leave STORAGE_ACCOUNT defined without the others.
#
# To define the widgets by hand while experimenting:
#   dbutils.widgets.text("model_version", "1.0.0")
#   dbutils.widgets.text("storage_account", "modelreg")
#   dbutils.widgets.text("model_name", "batcomputer")
try:
  _job_settings = (
    dbutils.secrets.get("ai-deploy-secrets", "storage-key"),
    dbutils.widgets.get("storage_account"),
    dbutils.widgets.get("model_version"),
    dbutils.widgets.get("model_name"),
  )
except Exception as exc:
  # Best effort: this is the normal path when the notebook is not run as a job
  # (dbutils or the widgets are unavailable). Exception instead of a bare except lets
  # KeyboardInterrupt/SystemExit propagate; only the error type is printed.
  print("Job parameters not available, STORAGE_ACCOUNT left unset:", type(exc).__name__)
else:
  STORAGE_KEY, STORAGE_ACCOUNT, MODEL_VERSION, STORAGE_CONTAINER = _job_settings
    
#
# STORAGE_ACCOUNT value should only be set when this Notebook is invoked via a job
# So we only pickle and store in Azure blobs when running as a job
#
if 'STORAGE_ACCOUNT' in vars():
  print("Saving pickles to:", STORAGE_ACCOUNT, " / ", STORAGE_CONTAINER)

  # Create pickles and data lookup
  from collections import OrderedDict
  import pickle

  # ORDER IS IMPORTANT! This is why we use OrderedDict and create entries one by one
  lookup = OrderedDict()
  lookup["Force"] = FallsWithin_mapping
  lookup["Crime"] = CrimeType_mapping

  # Create output lookup
  # NOTE(review): presumably index 0/1 line up with the model's class 0/1 probabilities;
  # confirm against whatever consumes flags.pkl.
  flags = ["getaway_proba", "busted_proba"]

  # (local file name, object to serialise) for everything that gets published
  artifacts = [
    ("model.pkl", model),
    ("lookup.pkl", lookup),
    ("flags.pkl", flags),
  ]

  # Pickle each artifact; the with-block closes the file, so no explicit close() is needed
  for file_name, payload in artifacts:
    with open(file_name, 'wb') as file:
      pickle.dump(payload, file)

  from azure.storage.blob import BlockBlobService

  # Create the BlockBlobService that is used to call the Blob service for the storage account
  block_blob_service = BlockBlobService(account_name=STORAGE_ACCOUNT, account_key=STORAGE_KEY)

  # Create a container
  block_blob_service.create_container(STORAGE_CONTAINER)

  # Upload each pickle as "<MODEL_VERSION>/<file name>" in the container
  for file_name, _ in artifacts:
    block_blob_service.create_blob_from_path(STORAGE_CONTAINER, MODEL_VERSION + "/" + file_name, file_name)