In [1]:
! pip install xgboost

import xgboost as xgb
from xgboost import XGBClassifier, Booster, DMatrix
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

import boto3
import sagemaker
import pandas as pd
import numpy as np
from datetime import datetime
import sys
import seaborn as sns
import matplotlib.pyplot as plt

import yaml
import os

sys.path.extend(['../src/preprocess', '../config'])
import helpers.instance as ins
import helpers.s3 as s3_helper
import helpers.utils as ut
import helpers.athena as at

[0m

In [2]:
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
session = sagemaker.Session()

def read_config():
    with open("config.yaml", 'r') as stream:
        cfg = yaml.safe_load(stream)
    return cfg
cfg = ins.read_config('../config/config.yaml') 

In [3]:
#from sklearn import preprocessing
#from sklearn import LabelBinarizer
def generate_initial_model_data(train_path = 'trained_data'):
    train = pd.read_csv(train_path)
    columns = ['unit_num', 'LOCATION', 'AREA', 'REGION', 
              'DISTRICT', 'UNIT_CATEGORY', 'UNIT_SIZE', 'PRODUCT_LINE', 'UNIT_MAKE_CODE', 'UNIT_FUEL_TYPE', 'UNIT_BODY_TYPE', 'UNIT_SOLD_DATE', 'unit_sold_date']
    for i in columns:
        train[i] = train[i].astype('str')
    X_train, y_train = train.iloc[:,1:],train.iloc[:,0]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    return dtrain

In [4]:
'''from sklearn import preprocessing
lbl = preprocessing.LabelEncoder()
def generate_initial_model_data(train_path = 'trained_data'):
    train = pd.read_csv(train_path)
    columns = ['unit_num', 'LOCATION', 'AREA', 'REGION', 
              'DISTRICT', 'UNIT_CATEGORY', 'UNIT_SIZE', 'PRODUCT_LINE', 'UNIT_MAKE_CODE', 'UNIT_FUEL_TYPE', 'UNIT_BODY_TYPE', 'UNIT_SOLD_DATE', 'unit_sold_date']
    encode_columns = train[columns].astype('str')
    encode_columns = encode_columns.apply(lbl.fit_transform)
    droptest = train.drop(columns, axis = 1)
    final = pd.concat([droptest, encode_columns], axis = 1)
    X_train, y_train = train.iloc[:,1:],train.iloc[:,0]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    return dtrain
    '''

"from sklearn import preprocessing\nlbl = preprocessing.LabelEncoder()\ndef generate_initial_model_data(train_path = 'trained_data'):\n    train = pd.read_csv(train_path)\n    columns = ['unit_num', 'LOCATION', 'AREA', 'REGION', \n              'DISTRICT', 'UNIT_CATEGORY', 'UNIT_SIZE', 'PRODUCT_LINE', 'UNIT_MAKE_CODE', 'UNIT_FUEL_TYPE', 'UNIT_BODY_TYPE', 'UNIT_SOLD_DATE', 'unit_sold_date']\n    encode_columns = train[columns].astype('str')\n    encode_columns = encode_columns.apply(lbl.fit_transform)\n    droptest = train.drop(columns, axis = 1)\n    final = pd.concat([droptest, encode_columns], axis = 1)\n    X_train, y_train = train.iloc[:,1:],train.iloc[:,0]\n    dtrain = xgb.DMatrix(X_train, label=y_train)\n    return dtrain\n    "

In [23]:
'''from sklearn.preprocessing import OneHotEncoder
sys.path.extend(['.../notebooks/', '../config'])
train = pd.read_csv('trained_data')
encoder = OneHotEncoder(sparse = False)
onehot = encoder.fit_transform(train)
print(onehot)
'''
#! pip install category_encoders
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
columns = ['unit_num', 'LOCATION', 'AREA', 'REGION', 
              'DISTRICT', 'UNIT_CATEGORY', 'UNIT_SIZE', 'PRODUCT_LINE', 'UNIT_MAKE_CODE', 'UNIT_FUEL_TYPE', 'UNIT_BODY_TYPE', 'UNIT_SOLD_DATE', 'unit_sold_date']
encoder = ce.one_hot.OneHotEncoder(cols = columns)
encoder = OneHotEncoder(sparse = False)
onehot = encoder.fit_transform(train[columns])
cleaned = encoder.transform(train[columns])
print(cleaned)

ValueError: Input contains NaN

In [5]:
params = {
    'objective': 'multi:softmax',
    'colsample_bytree': 0.3,
    'num_class': 3,
    'verbosity': 1,
    'eval_metric': 'merror'
}
# general cross validation madel to use a base metric
dtrain = generate_initial_model_data()
cv_results = xgb.cv(params,
                    dtrain,
                    num_boost_round = 999,
                    early_stopping_rounds = 10,
                    nfold = 4,
                    seed = 42          
)
print(f'minimum test merror value: {cv_results["test-merror-mean"].min()}')
cv_results

ValueError: DataFrame.dtypes for data must be int, float, bool or category.  When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`. Invalid columns:unit_num, LOCATION, AREA, REGION, DISTRICT, UNIT_CATEGORY, UNIT_SIZE, PRODUCT_LINE, UNIT_MAKE_CODE, UNIT_FUEL_TYPE, UNIT_BODY_TYPE, UNIT_SOLD_DATE, unit_sold_date

In [None]:
path = ['s3://pske-stg-advanalytics/Projects/Unit_Sale_Risk_Interns/Data/Processed/']
s3_helper.persist_file_to_path(df=df,
                       path=path[0],
                       filetype='parquet')
model_train = s3_helper.read_parquet_from_path(path=path[0])
model_train.head()

In [None]:
'''
role = sagemaker.get_execution_role()
region = sagemaker.Session().boto_region_name

bucket = boto3.Session().client(service_name='sagemaker',region_name=region)
prefix = "../src/train/xgboost_template.py"
'''

In [None]:
# Get a SageMaker-compatible role used by this Notebook Instance.
import helpers.notebook_helpers as nobo
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
session = sagemaker.Session()

# Read config.YAML and store relevant paths
cfg = nobo.read_config() 