In [1]:
# Plotting and data-handling libraries.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# NOTE(review): this rebinds `plt` from matplotlib.pyplot to matplotlib.pylab;
# one of the two imports is presumably redundant -- confirm which is intended.
import matplotlib.pylab as plt
import seaborn as sns
# tsfresh feature extraction / selection helpers.
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18 and
# removed in 0.20; on newer versions train_test_split lives in
# sklearn.model_selection. Confirm the scikit-learn version this notebook targets.
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, confusion_matrix



In [2]:
# Load the `sql` IPython extension so the %sql / %%sql magics are available,
# then open the database connection used by the SQL cells in this notebook.
# NOTE(review): the connection URL below embeds the database password in plain
# text; consider supplying it from the environment instead -- TODO confirm how
# credentials should be provided for this deployment.
%load_ext sql
%sql postgresql://airflow_user:airflow@172.16.223.128/airflow_test

'Connected: airflow_user@airflow_test'

## Create tsfresh features 

In [13]:
%%sql
-- Rebuild exp.ts_features_tab from scratch.
-- Each trajectory is collapsed into parallel arrays (id, time, distance,
-- interval, speed). All arrays use the same ORDER BY so positions line up.
-- The arrays are passed to tsfresh_features() and its result is expanded
-- into columns with the .* notation.
DROP TABLE IF EXISTS exp.ts_features_tab;
CREATE TABLE exp.ts_features_tab AS
WITH per_trajectory AS (
    SELECT
        trajectory_id,
        array_agg(trajectory_id  ORDER BY trajectory_id, time DESC) AS id,
        array_agg(time           ORDER BY trajectory_id, time DESC) AS ttime,
        array_agg(distance_miles ORDER BY trajectory_id, time DESC) AS dm,
        array_agg(interval_hour  ORDER BY trajectory_id, time DESC) AS ih,
        array_agg(speed          ORDER BY trajectory_id, time DESC) AS s
    FROM geolife.geolife_trajectory_speed_walk
    -- optional filter for a quick run on two trajectories
    -- WHERE trajectory_id IN ('2008-10-12_11', '2008-10-12_17')
    GROUP BY trajectory_id
)
SELECT (tsfresh_features(id, ttime, dm, ih, s)).*
FROM per_trajectory

 * postgresql://airflow_user:***@172.16.223.128/airflow_test
Done.
167478 rows affected.


[]

## Pivot the ts_features_tab table from long form to wide form

In [15]:
%%sql
-- Reshape the long feature table into wide form, one row per id.
-- The _dictionary table is dropped as well, presumably because madlib.pivot
-- may create it alongside the output table (confirm against the MADlib docs).
DROP TABLE IF EXISTS exp.ts_features_pvt, exp.ts_features_pvt_dictionary;
SELECT madlib.pivot(
    'exp.ts_features_tab',  -- source table (long form)
    'exp.ts_features_pvt',  -- output table (wide form)
    'id',                   -- index column
    'feature_name',         -- pivot column
    'value'                 -- pivot value column
)

 * postgresql://airflow_user:***@172.16.223.128/airflow_test
Done.
Done.
1 rows affected.


pivot


In [23]:
%%sql
-- Attach the label of each trajectory to its row of pivoted features.
DROP TABLE IF EXISTS exp.features_all;
CREATE TABLE exp.features_all AS
WITH labels AS (
    -- one row per distinct (trajectory_id, label) pair
    SELECT DISTINCT
        trajectory_id AS id,
        label
    FROM geolife.geolife_trajectory_speed_walk
)
SELECT *
FROM exp.ts_features_pvt AS f
INNER JOIN labels USING (id)

 * postgresql://airflow_user:***@172.16.223.128/airflow_test
Done.
314 rows affected.


[]

## Test/train split

In [24]:
%%sql
-- Split exp.features_all into a train table and a test table. Later cells read
-- exp.features_train and exp.features_test, so the output name acts as a prefix.
-- NOTE(review) no seed is set in this cell, so the split presumably differs
-- between runs -- confirm whether reproducibility is required.
DROP TABLE IF EXISTS exp.features_test, exp.features_train;
SELECT madlib.train_test_split(
                                'exp.features_all',    -- Source table
                                'exp.features',     -- Output table name (prefix for the _train and _test tables)
                                0.8,       -- Train proportion
                                0.2,       -- Test proportion
                                NULL, -- Strata definition (none)
                                NULL, -- Columns to output (NULL, presumably all columns -- confirm)
                                FALSE,     -- Sample without replacement
                                TRUE);    -- Separate output tables (train and test)

 * postgresql://airflow_user:***@172.16.223.128/airflow_test
Done.
1 rows affected.


train_test_split


## Run random forest model

In [33]:
%%sql
-- Train a random forest on the training split. The model is written to
-- exp.rf_output. The _group and _summary tables dropped here are presumably
-- companion outputs of forest_train (confirm against the MADlib docs).
DROP TABLE IF EXISTS exp.rf_output, exp.rf_output_group, exp.rf_output_summary;
SELECT madlib.forest_train(
    'exp.features_train',  -- source table
    'exp.rf_output',       -- output model table
    'id',                  -- id column
    'label',               -- response column
    '*',                   -- features (all columns)
    NULL,                  -- columns to exclude
    NULL,                  -- grouping columns
    20::integer,           -- number of trees
    2::integer,            -- number of random features
    TRUE::boolean,         -- compute variable importance
    1::integer,            -- number of permutations
    8::integer,            -- maximum tree depth
    3::integer,            -- min split
    1::integer,            -- min bucket
    10::integer            -- number of splits per continuous variable
);

 * postgresql://airflow_user:***@172.16.223.128/airflow_test
Done.
1 rows affected.


forest_train


## Model evaluation

In [36]:
%%sql
-- Score the test split with the trained forest and store the predictions in
-- exp.rf_results. The optional fourth argument is left out, and the preview of
-- exp.rf_results further down shows an estimated_label column.
DROP TABLE IF EXISTS exp.rf_results;
SELECT madlib.forest_predict(
    'exp.rf_output',      -- tree model
    'exp.features_test',  -- new data table
    'exp.rf_results'      -- output table
    -- , 'prob'           -- optional, show probability
);

 * postgresql://airflow_user:***@172.16.223.128/airflow_test
Done.
1 rows affected.


forest_predict


In [37]:
%%sql
-- Preview five predictions. The ORDER BY makes the preview reproducible,
-- because LIMIT without an ordering may return any five rows.
SELECT *
FROM exp.rf_results
ORDER BY id
LIMIT 5

 * postgresql://airflow_user:***@172.16.223.128/airflow_test
5 rows affected.


id,estimated_label
2007-04-15_3,False
2007-04-17_5,True
2007-04-18_4,True
2007-04-18_8,False
2007-04-19_1,True


In [41]:
%%sql
-- Build exp.result with one row per test id, holding the observed label (obs)
-- and the predicted label (pred), each encoded as 1.0 or 0.0. This table is the
-- input of the metric cells that follow.
DROP TABLE IF EXISTS exp.result;
CREATE TABLE exp.result AS
WITH observed AS (
    SELECT
        id,
        CASE WHEN label = TRUE THEN 1.0 ELSE 0.0 END AS obs
    FROM exp.features_test
)
SELECT
    id,
    obs,
    CASE WHEN estimated_label = TRUE THEN 1.0 ELSE 0.0 END AS pred
FROM exp.rf_results AS r
INNER JOIN observed USING (id)

 * postgresql://airflow_user:***@172.16.223.128/airflow_test
Done.
63 rows affected.


[]

In [43]:
%%sql
-- Area under the ROC curve of pred against obs, written to exp.auc.
-- NOTE(review) pred is a hard 0/1 value derived from estimated_label rather
-- than a probability, so the curve has a single operating point. Confirm
-- whether class probabilities were intended as the prediction column.
DROP TABLE IF EXISTS exp.auc;
SELECT madlib.area_under_roc(
    'exp.result',  -- input table
    'exp.auc',     -- output table
    'pred',        -- prediction column
    'obs'          -- observed column
);
SELECT * FROM exp.auc;

 * postgresql://airflow_user:***@172.16.223.128/airflow_test
Done.
1 rows affected.
1 rows affected.


area_under_roc
0.976190476190476


In [44]:
%%sql
-- Confusion matrix of pred against obs, written to exp.cm and listed by class.
DROP TABLE IF EXISTS exp.cm;
SELECT madlib.confusion_matrix(
    'exp.result',  -- input table
    'exp.cm',      -- output table
    'pred',        -- prediction column
    'obs'          -- observed column
);
SELECT * FROM exp.cm ORDER BY class;

 * postgresql://airflow_user:***@172.16.223.128/airflow_test
Done.
1 rows affected.
2 rows affected.


row_id,class,confusion_arr
1,0.0,"[Decimal('42'), Decimal('0')]"
2,1.0,"[Decimal('1'), Decimal('20')]"
