In [25]:
import pandas as pd
from datetime import timedelta

%matplotlib inline

In [26]:
# load a station
station_id = '39.csv'
url = 'data_output/'
data = pd.read_csv(url+str(station_id), index_col='date_time', parse_dates=True)
data['minutes_empty'] = (data.minutes_empty > 0).astype(int)


In [27]:
# mark the first positive observation in a series
previous = 0
list = []
for i in data.minutes_empty:
    current = i + previous * i
    previous = i
    list.append(current)

In [28]:
# mark the first positive ovservation in a series as 1 for y
test = pd.DataFrame(list, index=data.index, columns=['y'])
test['y'] = (test.y == 1).astype(int)
data = pd.concat([data,test], axis=1)
data = data[['y','mean_count']]

In [93]:
# self join data set offset on hours
for i in range(4,20):
    feature = pd.read_csv(url+str(station_id), index_col='date_time', parse_dates=True)
    feature = feature[['mean_count']]
    feature.index = feature.index + timedelta(minutes=(i*30))
    columns = str(i*30)+'_'+ feature.columns
    feature.columns = columns
    data = pd.concat([data,feature], axis=1, join='inner')



In [21]:
# build a data frame of each positive event and the proceeding 6 hours
events = data[data.y == 1].index
result = pd.DataFrame()
for index, i in enumerate(events):
    iteration = index
    set = data[(data.index <= events[iteration]) & (data.index > events[iteration] - timedelta(minutes=(60*6))) ]
    set['rep'] = iteration
    result = pd.concat([result, set], axis=0)
    
result['events'] = 'event'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [22]:
# build a data frame of a random sample of observations and proceeding 6 hours
import random
events = data[data.y != 1].index
events = random.sample(events,500)
result_sample = pd.DataFrame()

for index, i in enumerate(events):
    iteration = index
    set = data[(data.index <= events[iteration]) & (data.index > events[iteration] - timedelta(minutes=(60*6))) ]
    set['rep'] = iteration
    result_sample = pd.concat([result_sample, set], axis=0)
result_sample['events'] = 'not event'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [23]:
final = pd.concat([result, result_sample], axis=0)

In [24]:
final.to_csv('data_decisiontree/timeseries_test.csv', index=True)

In [95]:
# dummy hours
hours = data.index.hour
hours = pd.DataFrame(hours, index=data.index, columns=['hour'])
dummies = pd.get_dummies(hours.hour, prefix='hour')
dummies = dummies[dummies.columns[1:]]
data = pd.concat([data, dummies], axis=1)

In [143]:
features = [i for i in X.columns if 'hour' in i]

In [144]:
X = data[features]
y = data.y

In [145]:
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier


In [151]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)

In [154]:
# tune n_estimators then max_features
rfclass = RandomForestClassifier(n_estimators=150, max_features=3, random_state=3)
rfclass.fit(X_train, y_train)
y_pred = rfclass.predict(X_test)

In [155]:
confusion = metrics.confusion_matrix(y_test, y_pred)
confusion

array([[5240,    0],
       [  26,    0]])

In [156]:
feature_cols = X.columns
feature_importance = pd.DataFrame({'feature':feature_cols, 'importance':rfclass.feature_importances_})
feature_importance.sort_values(by='importance',ascending=False ).head(10)

Unnamed: 0,feature,importance
16,hour_17,0.442535
22,hour_23,0.169384
18,hour_19,0.13279
20,hour_21,0.022402
21,hour_22,0.019177
19,hour_20,0.018597
15,hour_16,0.018341
2,hour_3,0.015715
12,hour_13,0.01568
4,hour_5,0.014855
