In [3]:
%load_ext autoreload
%autoreload 2
import sys, os
# Put the parent of the first sys.path entry on the import path
# (presumably so the project-local `autocat` package resolves — confirm).
parent_dir = os.path.join(sys.path[0], '..')
sys.path.insert(1, parent_dir)

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

# Pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, f1_score, classification_report, confusion_matrix
from sklearn.externals import joblib
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split

# Algorithms
from sklearn.neighbors import KNeighborsClassifier

# Internal
from autocat.data.datasets import get_training_data, get_totals_data
from autocat.data.features import CombinedFeatureAdder, feature_transactions_per_day, PandasDataFrameTransformer
from autocat.data.filters import FeatureFilters, no_null_StdUnitsShipped_StdNetAmount
from autocat.models import GridSearchModel, Model
from autocat.models.evaluation import plot_learning_curve, get_scorer, financial_loss_scorer
from autocat.models.pipelines import get_scaled_pipeline_v1

## Pipeline

In [5]:
# Path to the processed training set
TRAINING_DATA = '../data/processed/train_2018-08-24.csv'

# Create feature matrix and label vector, handing the loader the filters to
# apply and asking it to drop NA values (drop_na=True)
training_filters = [no_null_StdUnitsShipped_StdNetAmount]
X, y = get_training_data(TRAINING_DATA, training_filters, drop_na=True)

In [6]:
# Get the scaled pipeline; X is handed to the factory that assembles it
pipeline = get_scaled_pipeline_v1(X)
# Print the assembled steps so the preprocessing configuration is recorded in
# the notebook (the output lists a CombinedFeatureAdder, a StandardScaler and a
# PandasDataFrameTransformer step)
print(pipeline)

Pipeline(memory=None,
     steps=[('feaure_transactions_per_day', CombinedFeatureAdder(attribute_fn=<function feature_transactions_per_day at 0x111262d90>)), ('std_scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('data_frame', PandasDataFrameTransformer(columns=['AvgUnitsShipped', 'StdUnitsShipped', 'MinUnitsShipped', 'MaxUnitsShipped', 'AvgNetAmount', 'StdNetAmount', 'MinNetAmount', 'MaxNetAmount', 'NumberOfTransactions', 'NumberOfTransactionDays', 'TransactionsPerDay']))])


In [7]:
# Report the size of the training set, i.e. len(X)
print('Training instances:', len(X))

Training instances: 5610


## Scorer

In [8]:
# Load the totals that are forwarded to the financial-loss metric as `totals`
totals_data = get_totals_data()

# Wrap the metric as a scikit-learn scorer. greater_is_better=False declares it
# a loss: scikit-learn sign-flips the value so model selection can maximise it.
scorer = make_scorer(
    financial_loss_scorer,
    totals=totals_data,
    greater_is_better=False,
)

## Model

See if we can improve on the original model (see notes in the [2.0.1 notebook](2.0.1-knn-model-financial-loss-new-1.ipynb)).

In [68]:
# Create the parameter grid to optimize (keys use the 'knn__' step prefix).
# The neighbour counts are five evenly spaced floats between 25 and 30,
# truncated to integers: this evaluates to 25, 26, 27, 28 and 30, so 29 is
# not part of the search.
neighbor_counts = np.linspace(25, 30, 5).astype(int)

param_grid = [
    {
        'knn__n_neighbors': neighbor_counts,
        'knn__metric': ['minkowski'],
        'knn__weights': ['uniform', 'distance'],
        # Minkowski power parameter: 1 = Manhattan, 2 = Euclidean
        'knn__p': [1, 2, 3],
    },
]

In [69]:
# Train a model using grid search over param_grid, with the KNN classifier
# placed behind the preprocessing pipeline
model = GridSearchModel(
    param_grid,
    name='knn',
    model=KNeighborsClassifier(),
    pipeline=pipeline,
)
# NOTE(review): the positional 10 is presumably the number of cross-validation
# folds — confirm against GridSearchModel.train
model.train(X, y, 10, scorer)

In [70]:
# Display the estimator stored on the trained model
# (presumably the best one found by the grid search — confirm)
model.model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=25, p=1,
           weights='distance')

No difference from the original model.