# Capstone Model Development with AWS Sagemaker Built-in Algorithms: Model Productionization to Scheduled Notebook

In [None]:
# This notebook assumes the raw data was pulled from its original source: https://data.austintexas.gov/Health-and-Community-Services/Austin-Animal-Center-Intakes/wter-evkm

In [None]:
#import dependencies

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Rename the downloaded file to 'data.csv' so that the pre-processing cells below can load it.

In [7]:
# Load the raw records; the path is resolved relative to the kernel's working directory.
raw_csv_path = 'data.csv'
data = pd.read_csv(raw_csv_path)

In [None]:
#convert outcome types to binary classification: 'adopted' vs. 'not adopted'

In [None]:
# Collapse the raw outcome labels into the two target classes.
# The mapping is applied to `data` (the frame loaded above) in a single pass;
# labels that are not listed here (including missing values) are left unchanged.
outcome_to_binary = {
    'Adoption': 'Adopted',
    'Rto-Adopt': 'Adopted',
    'Return to Owner': 'Not Adopted',
    'Transfer': 'Not Adopted',
    'Euthanasia': 'Not Adopted',
    'Died': 'Not Adopted',
    'Missing': 'Not Adopted',
    'Disposal': 'Not Adopted',
    'Relocate': 'Not Adopted',
}
data['outcome_type'] = data['outcome_type'].replace(outcome_to_binary)

In [None]:
#convert categorical data to 'category' data types

In [9]:
# Columns holding categorical values; each is cast to the pandas 'category' dtype.
categorical_columns = [
    'outcome_type',
    'animal_type',
    'breed',
    'sex_upon_outcome',
    'sex_upon_intake',
    'color',
    'intake_type',
    'outcome_weekday',
    'intake_weekday',
]
# Cast from `data` itself (the frame loaded above) so the cell runs on a fresh kernel;
# astype raises KeyError if any listed column is absent from the frame.
data = data.astype({column: 'category' for column in categorical_columns})

In [None]:
#drop columns that are irrelevant for the purposes of this model

In [None]:
# Columns removed from the frame before modelling, dropped one at a time in this order.
irrelevant_columns = [
    'outcome_subtype',
    'intake_monthyear',
    'count',
    'dob_monthyear',
    'outcome_monthyear',
    'time_in_shelter',
    'Unnamed: 0',
]
for column_name in irrelevant_columns:
    # In-place drop; raises KeyError if the column is not present (e.g. on a second run of this cell).
    data.drop(column_name, axis=1, inplace=True)

# If the outcome characteristics are not currently documented, also drop the outcome columns by uncommenting the lines below:

#data.drop(labels='outcome_month', axis=1, inplace=True)
#data.drop(labels='outcome_year', axis=1, inplace=True)
#data.drop(labels='outcome_weekday', axis=1, inplace=True)
#data.drop(labels='outcome_hour', axis=1, inplace=True)

In [None]:
# Sanitize the column names (keep only letters, digits and underscores) so that the built-in algorithms are able to interpret them.

In [10]:
import re

# Remove every run of characters other than letters, digits and underscores from each column label.
disallowed_chars = re.compile('[^A-Za-z0-9_]+')
data = data.rename(columns=lambda label: disallowed_chars.sub('', label))

In [None]:
#split data for training and testing

In [12]:
# Separate the predictors from the target column, then hold out 20% of the rows for validation.
features = data.drop('outcome_type', axis=1)
target = data['outcome_type']
x_train, x_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

In [None]:
# Encode the outcome labels as integers for binary classification.

In [14]:
# Encoder used to turn the outcome labels into integer class codes.
le = LabelEncoder()

In [15]:
# Learn the label -> integer mapping from the training labels only, and replace them with their codes.
# LabelEncoder numbers the classes in sorted label order; inspect le.classes_ to see which label got which code.
y_train = le.fit_transform(y_train)

In [16]:
# Encode the validation labels with the mapping learned from the training labels.
# transform() raises ValueError if a validation label was not seen during fitting.
y_val = le.transform(y_val)

In [None]:
# Install and import the best model for this problem: LightGBM.

In [17]:
!pip install lightgbm

Collecting lightgbm
  Using cached lightgbm-3.3.5-py3-none-manylinux1_x86_64.whl (2.0 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.5
[0m

In [18]:
import lightgbm as lgb

In [None]:
#train model on data input and monitor losses

In [19]:
# Hyperparameters taken from the best result of the hyperparameter-optimization (HPO) run.
best_hpo_params = {
    'boosting_type': 'dart',
    'num_leaves': 180,
    'learning_rate': 0.1,
    'subsample': 1.0,
    'colsample_bytree': 0.5,
    'min_child_weight': 0.1,
    'reg_lambda': 1,
}
# Fixed seed so repeated fits give the same model.
model = lgb.LGBMClassifier(random_state=42, **best_hpo_params)

In [20]:
# Train on the training split and track log-loss on both the validation and training data.
# Progress is logged every 20 boosting rounds through the log_evaluation callback; the
# `verbose` argument of fit() is deprecated in LightGBM 3.3 and removed in 4.0.
model.fit(
    x_train,
    y_train,
    eval_set=[(x_val, y_val), (x_train, y_train)],
    eval_metric='logloss',
    callbacks=[lgb.log_evaluation(period=20)],
)



[20]	training's binary_logloss: 0.43959	valid_0's binary_logloss: 0.452905
[40]	training's binary_logloss: 0.403812	valid_0's binary_logloss: 0.42421
[60]	training's binary_logloss: 0.368745	valid_0's binary_logloss: 0.392918
[80]	training's binary_logloss: 0.355996	valid_0's binary_logloss: 0.382596
[100]	training's binary_logloss: 0.338522	valid_0's binary_logloss: 0.369246


LGBMClassifier(boosting_type='dart', class_weight=None, colsample_bytree=0.5,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.1, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=180, objective=None,
               random_state=42, reg_alpha=0.0, reg_lambda=1, silent='warn',
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [None]:
#use the model to predict for a file or specific animal

In [None]:
# Path of the records to score. It must be filled in before running: an empty path makes read_csv fail.
# Note that this overwrites the validation features held in x_val from the train/validation split.
# NOTE(review): the new records presumably need the same pre-processing as the training data
# (dropped columns, category dtypes, sanitized column names) before predicting - confirm.
new_records_csv = ''  # insert new subset of data or specific animal record as 'csv' filetype
x_val = pd.read_csv(new_records_csv)

In [None]:
# Predict a class for each row of x_val. The model was fitted on the integer codes produced by `le`,
# so the predictions are those codes; le.inverse_transform(y_pred) turns them back into label strings.
y_pred = model.predict(x_val)

In [None]:
# Show the predicted class codes.
print(y_pred)