In [1]:
import pandas as pd
import numpy as np 
import pickle5
import matplotlib.pyplot as plt
import datetime
from pysurvival.models.semi_parametric import CoxPHModel
from pysurvival.utils.display import display_loss_values
from sklearn.preprocessing import StandardScaler
from pysurvival.utils.display import compare_to_actual
from pysurvival.utils import save_model, load_model
from pysurvival.utils.sklearn_adapter import sklearn_adapter
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import auc, r2_score
from sklearn import metrics
from sklearn.utils import resample
from lifelines.utils import concordance_index

In [3]:
with open("../data/ut_poiV6.pkl", "rb") as fh:
    df = pickle5.load(fh)
    
df.head(3)

Unnamed: 0,life_time,datetime_start,datetime_end,latitude,longitude,distance_center_km,Station_Name,weekday_b,weekday_b_name,hour_b,...,near_inactivity_6H,near_charge_events_6H,service,entertainment,food,childcare,medical,education,parking,waste-management
0,928.5,2018-01-31 19:51:00,2018-02-01 11:19:30,40.018482,-105.281066,0.152203,COMM VITALITY / 1104 SPRUCE1,2,Wednesday,19,...,0.0,0,0.057343,0.109796,0.073649,3.517121,0.466518,0.631771,0.021832,1.145959
1,363.5,2018-02-01 14:03:00,2018-02-01 20:06:30,40.018482,-105.281066,0.152203,COMM VITALITY / 1104 SPRUCE1,3,Thursday,14,...,0.0,0,0.057343,0.109796,0.073649,3.517121,0.466518,0.631771,0.021832,1.145959
2,6828.5,2018-02-01 21:15:00,2018-02-06 15:03:30,40.018482,-105.281066,0.152203,COMM VITALITY / 1104 SPRUCE1,3,Thursday,21,...,0.0,0,0.057343,0.109796,0.073649,3.517121,0.466518,0.631771,0.021832,1.145959


### Remove outliers & nan

In [None]:
# remove rows
df = df.dropna()

In [8]:
# remove outliers
drop_index = [] # list of indexes to keep
names = df['Station_Name'].unique()

for name in names:
    temp = df[df['Station_Name'] == name]
    # Get interquantile ranges
    Q1, Q3 = temp.life_time.quantile([0.25, 0.75])
    IQR = Q3-Q1
    minimum = Q1 - 1.5*IQR
    maximum = Q3 + 1.5*IQR
    # Define observations which should be removed
    temp2 = df[(df['life_time'] < minimum) | (df['life_time'] > maximum) &
                  (df['Station_Name'] == name)]
    print("{n}: {s} ({p} %)".format(n=name, 
                                    s=temp2.shape[0], 
                                    p=round((temp2.shape[0]/temp.shape[0])*100,2)))
    # Add the indexes which should be dropped
    drop_index.extend(list(temp2.index))
    
print("\nThe total amount of lost events: {n} ({p} %)".format(n=len(drop_index), 
                                                              p=round(len(drop_index)/df.shape[0]*100,2)))

df = df.drop(drop_index)
df = df.reset_index(drop=True)

COMM VITALITY / 1104 SPRUCE1: 57 (4.4 %)
COMM VITALITY / 1000WALNUT: 48 (3.32 %)
BOULDER / REC CENTER: 40 (3.51 %)
BOULDER / BASELINE ST1: 42 (3.53 %)
BOULDER / ATRIUM ST1: 65 (6.58 %)
BOULDER / ALPINE ST1: 32 (8.96 %)
COMM VITALITY / 1400 WALNUT1: 38 (7.29 %)
BOULDER / FACILITIES ST1: 84 (16.28 %)
COMM VITALITY / 1500PEARL: 41 (4.5 %)
BOULDER / JUNCTION ST1: 35 (9.97 %)
COMM VITALITY / BOULDER JCTN: 40 (5.28 %)
COMM VITALITY / 1100WALNUT1: 39 (3.51 %)
BOULDER / N BOULDER REC 1: 29 (2.07 %)
BOULDER / BOULDER PARK: 8 (4.55 %)
COMM VITALITY / 2200 BROADWAY1: 0 (0.0 %)
BOULDER / EAST REC: 23 (5.42 %)
BOULDERJUNCTION / JUNCTION ST1: 3 (3.0 %)

The total amount of lost events: 624 (4.91 %)


In [9]:
df.shape

(12075, 71)

### Focus stations

In [10]:
station_subset = ['BOULDER / N BOULDER REC 1', 'COMM VITALITY / 1000WALNUT', 
        'COMM VITALITY / 1104 SPRUCE1', 'BOULDER / BASELINE ST1']

df = df[df['Station_Name'].isin(station_subset)]

### Prepare data for SA

As we are doing SA we need a column which specifies if an event occured at the given time. As all data observations in the dataset are events it is a very simple procedure

In [11]:
df['event'] = np.ones(len(df))

In [12]:
df['tod'] = df['hour_b'].apply(hour_label)

In [13]:
df = df[df['tod'] != 'Night']

### Adding dummies
We define `X`, the features, `T`, the time column and `E` the event columns. First though we need to dummify the categorical variables.

In [14]:
# Categorical features
columns_categorical = ['weekday_b_name','tod','Station_Name']

In [15]:
df = pd.get_dummies(df, columns=columns_categorical, drop_first=True)

## Modeling features

To make coding easier the names of the models are changed:
- M1: Baseline
- M5: Full

In [16]:
time_column = 'life_time'
event_column = 'event'

## M1
features1 = ['weekday_b_name_Monday', 'weekday_b_name_Saturday',
             'weekday_b_name_Sunday', 'weekday_b_name_Thursday',
             'weekday_b_name_Tuesday', 'weekday_b_name_Wednesday',
             'tod_Evening','tod_Midday', 'tod_Morning',
             'Station_Name_BOULDER / N BOULDER REC 1',
             'Station_Name_COMM VITALITY / 1000WALNUT',
             'Station_Name_COMM VITALITY / 1104 SPRUCE1'] # dow + tod

## M5
features5_cat = ['weekday_b_name_Monday', 'weekday_b_name_Saturday',
                 'weekday_b_name_Sunday', 'weekday_b_name_Thursday',
                 'weekday_b_name_Tuesday', 'weekday_b_name_Wednesday',
                 'tod_Evening','tod_Midday', 'tod_Morning',
                 'Station_Name_BOULDER / N BOULDER REC 1',
                 'Station_Name_COMM VITALITY / 1000WALNUT',
                 'Station_Name_COMM VITALITY / 1104 SPRUCE1']
features5_con = ['lag1', 'lag2', 'lag3','near_charge_time_4H', 'near_charge_energy_4H',
                 'charge_time_4H', 'charge_energy_4H','service', 'entertainment', 'food', 
                 'childcare', 'medical', 'education','parking', 'waste-management'] # dow + agg. tod + lag + activ.

In [17]:
## Get point predictions
def point_pred(model, X_test, X_train):
    T_pred = []
    T_pred_train = []
    # Get survival curves
    cph_pred = model.predict_survival(X_test)
    cph_pred_train = model.predict_survival(X_train)
    # get times of survival prediction
    time = model.times
    # test
    for i in range(0,len(cph_pred)):
        T_pred.append(auc(time,cph_pred[i]))
    # train
    for i in range(0,len(cph_pred_train)):
        T_pred_train.append(auc(time,cph_pred_train[i]))
    
    return T_pred, T_pred_train

In [18]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true))*100

In [19]:
## To test model
def test_model(y_test, y_pred):
    MAE = metrics.mean_absolute_error(y_test, y_pred)
    #print('MAE (Mean Absolute Error):', MAE)
    MSE = metrics.mean_squared_error(y_test, y_pred)
    #print('MSE (Mean Squared Error):', MSE)
    RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    #print('RMSE (Root Mean Squared Error):', RMSE)
    MAPE = mean_absolute_percentage_error(y_test, y_pred)
    
    NRMSE = RMSE/np.mean(y_test)
    return MAE, RMSE, MSE, MAPE, NRMSE

In [20]:
# dictionary to save results in 
results = {}

#data split
split = 0.8

learning_rates = [0.001, 0.01, 0.1]
regularization = [0.01, 0.1, 1.0]

# order data
df = df.sort_values(by=['datetime_start'])
df = df.reset_index(drop=True)

#### CPH - Model 1

In [23]:
results['M1'] = {}
# define data
X = df[features1]
X_train, X_test = np.split(X, [int(split * len(df))])
T_train, T_test = np.split(df[time_column], [int(split * len(df))])
E_train, E_test = np.split(df[event_column], [int(split * len(df))])

##############################################################################
### Find most optimal parameters for every station
best_c = -1
best_lear = None
best_regu = None
best_model = None
fails = {}

total_it = len(learning_rates)*len(regularization)
i=0

# loop over parameters
for lear in learning_rates:
    for regu in regularization:
        cv_res = []
    #try
        kf = KFold(n_splits=3, random_state=42, shuffle=False)
        for train_index, test_index in kf.split(X_train):
            cph = CoxPHModel()
            cph.fit(X_train.loc[train_index], T_train.loc[train_index], E_train.loc[train_index],
                    lr=lear, l2_reg = regu, init_method='zeros', max_iter=800, verbose=False)

            #point pred
            point_predictions = point_pred_single(cph,X_train.loc[test_index])
            # c-index
            c_temp = concordance_index(T_train.loc[test_index], point_predictions)

            cv_res.append(c_temp)
        if np.mean(c_temp) >= best_c:
            print("NEW BEST!!", i)
            best_lear = lear
            best_regu = regu
            best_c = np.mean(c_temp)
    #except:
        print("FAIL, lr:", lear, " l2:", regu)   
        i+=1
        print(round(i/total_it,2), "  ", end='')
print("Parameter tuning done.")
print("Params: learning: {} regularization: {}".format(best_lear,best_regu))

##############################################################################        
### Get predictions
reg = CoxPHModel()
reg.fit(X_train, T_train, E_train, lr=best_lear, l2_reg=best_regu,
        init_method='zeros', max_iter=800, verbose=False)
print("Model fitted")   
print("Best param. lr: {} l2_reg: {}".format(best_lear, best_regu))

## Point predictions
T_test_pred, T_train_pred = point_pred(reg, X_test, X_train)

## Test model
c_test = concordance_index(T_test, T_test_pred)
c_train = concordance_index(T_train, T_train_pred)
print(c_test)
print("Model tested")

## Point predictions
T_test_pred, T_train_pred = point_pred(reg, X_test, X_train)

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(T_test,T_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(T_train,T_train_pred)    

## Save results
results['M1'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                  'MAE_train': MAE_train,'MAE_test':MAE_test,
                  'R2_test':r2_score(T_test,T_test_pred), 
                  'R2_train':r2_score(T_train,T_train_pred),
                  'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                  'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train,
                  'c-train':c_train,'c-test':c_test}

Model fitted
Best param. lr: 0.1 l2_reg: 0.01
0.6333332053197331
Model tested


#### CPH - M5

In [24]:
results['M5'] = {}

# define data
X = df[features5_cat + features5_con]
X_train, X_test = np.split(X, [int(split * len(df))])
T_train, T_test = np.split(df[time_column], [int(split * len(df))])
E_train, E_test = np.split(df[event_column], [int(split * len(df))])

scaler = StandardScaler()
X_train[features5_con] = scaler.fit_transform(X_train[features5_con])
X_test[features5_con] = scaler.fit_transform(X_test[features5_con])


##############################################################################
### Find most optimal parameters for every station
best_c = -1
best_lear = None
best_regu = None
best_model = None
fails = {}

total_it = len(learning_rates)*len(regularization)
i=0

# loop over parameters
for lear in learning_rates:
    for regu in regularization:
        cv_res = []
        try:
            kf = KFold(n_splits=3, random_state=42, shuffle=False)
            for train_index, test_index in kf.split(X_train):
                cph = CoxPHModel()
                cph.fit(X_train.loc[train_index], T_train.loc[train_index], E_train.loc[train_index],
                        lr=lear, l2_reg = regu, init_method='zeros', max_iter=800, verbose=False)
                #point pred
                point_predictions = point_pred_single(cph,X_train.loc[test_index])
                # c-index
                c_temp = concordance_index(T_train.loc[test_index], point_predictions)
                cv_res.append(c_temp)
                
            if np.mean(c_temp) >= best_c:
                print("NEW BEST!!", i)
                best_lear = lear
                best_regu = regu
                best_c = np.mean(c_temp)
        except:
            print("FAIL, lr:", lear, " l2:", regu)   
        i+=1
        print(round(i/total_it,2), "  ", end='')
print("Parameter tuning done.")
print("Params: learning: {} regularization: {}".format(best_lear,best_regu))

##############################################################################        
### Get predictions
reg = CoxPHModel()
reg.fit(X_train, T_train, E_train, lr=best_lear, l2_reg=best_regu,
        init_method='zeros', max_iter=800, verbose=False)
print("Model fitted")   
print("Best param. lr: {} l2_reg: {}".format(best_lear, best_regu))

## Point predictions
T_test_pred, T_train_pred = point_pred(reg, X_test, X_train)

## Test model
c_test = concordance_index(T_test, T_test_pred)
c_train = concordance_index(T_train, T_train_pred)
print(c_test)
print("Model tested")

## Get metrics
MAE_test, RMSE_test, MSE_test, MAPE_test, NRMSE_test = test_model(T_test,T_test_pred)
MAE_train, RMSE_train, MSE_train, MAPE_train, NRMSE_train = test_model(T_train,T_train_pred)    

## Save results
results['M5'] = {'RMSE_train':RMSE_train,'RMSE_test':RMSE_test,
                  'MAE_train': MAE_train,'MAE_test':MAE_test,
                  'R2_test':r2_score(T_test,T_test_pred), 
                  'R2_train':r2_score(T_train,T_train_pred),
                  'MAPE_test':MAPE_test,'MAPE_train':MAPE_train,
                  'NRMSE_test':NRMSE_test,'NRMSE_train':NRMSE_train,
                  'c-train':c_train,'c-test':c_test}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


NEW BEST!! 0
0.11   



NEW BEST!! 1
0.22   



NEW BEST!! 2
0.33   



NEW BEST!! 3
0.44   



NEW BEST!! 4
0.56   



0.67   



NEW BEST!! 6
0.78   



NEW BEST!! 7
0.89   



1.0   Parameter tuning done.
Params: learning: 0.1 regularization: 0.1
Model fitted
Best param. lr: 0.1 l2_reg: 0.1
0.6286690297977258
Model tested


### Results

In [28]:
for key in results.keys():
    r = results[key]
    print("------- {} -------".format(key))
    print("Test:  C-index: {}  R2: {} RMSE: {} MAE: {} MAPE: {}".format(round(r['c-test'],2),
                                                                        round(r['R2_test'],2),
                                                                        round(r['RMSE_test'],0),
                                                                        round(r['MAE_test'],0),
                                                                        round(r['MAPE_test'],0)))
    
    print("Train:  C-index: {}  R2: {} RMSE: {} MAE: {} MAPE: {}".format(round(r['c-train'],2),
                                                                        round(r['R2_train'],2),
                                                                        round(r['RMSE_train'],0),
                                                                        round(r['MAE_train'],0),
                                                                        round(r['MAPE_train'],0)))

------- M1 -------
Test:  C-index: 0.63  R2: 0.2 RMSE: 427.0 MAE: 349.0 MAPE: 1920.0
Train:  C-index: 0.63  R2: 0.18 RMSE: 456.0 MAE: 356.0 MAPE: 1412.0
------- M5 -------
Test:  C-index: 0.63  R2: 0.19 RMSE: 431.0 MAE: 350.0 MAPE: 1869.0
Train:  C-index: 0.64  R2: 0.19 RMSE: 454.0 MAE: 355.0 MAPE: 1416.0
