In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("./data/pen-collected-items.csv")
print df.shape

(3599735, 5)


In [3]:
# extract creation date, convert to appropriate timestamp to EST timezone and extract the day of the week
days = {0:'Mon',1:'Tues',2:'Weds',3:'Thurs',4:'Fri',5:'Sat',6:'Sun'}
df['created_dateformat'] = pd.to_datetime(df.created, unit='s')
df['created_date'] = df['created_dateformat'].apply(pd.datetools.normalize_date)
df['created_dateformat_utc'] = df['created_dateformat'].dt.tz_localize('UTC')
df['created_dateformat_est'] = df['created_dateformat_utc'].dt.tz_convert('US/Eastern')
df['created_datedayofweek']  = df['created_dateformat'].apply(lambda x: days[x.dayofweek])
df.head()

Unnamed: 0,id,tool_id,bundle_id,refers_to_object_id,created,created_dateformat,created_date,created_dateformat_utc,created_dateformat_est,created_datedayofweek
0,0,0,54Sa,18356743,1414081555,2014-10-23 16:25:55,2014-10-23,2014-10-23 16:25:55+00:00,2014-10-23 12:25:55-04:00,Thurs
1,1,0,54ZB,18316213,1414432504,2014-10-27 17:55:04,2014-10-27,2014-10-27 17:55:04+00:00,2014-10-27 13:55:04-04:00,Mon
2,2,0,55ap,18637615,1414526581,2014-10-28 20:03:01,2014-10-28,2014-10-28 20:03:01+00:00,2014-10-28 16:03:01-04:00,Tues
3,3,0,55ap,18643273,1414526584,2014-10-28 20:03:04,2014-10-28,2014-10-28 20:03:04+00:00,2014-10-28 16:03:04-04:00,Tues
4,4,0,55bK,18448597,1414529537,2014-10-28 20:52:17,2014-10-28,2014-10-28 20:52:17+00:00,2014-10-28 16:52:17-04:00,Tues


### Tagging after closing hours

In [4]:
# are objects being tagged after the closing time (closing time is 9pm on Sat, else 6pm)
def tagged_after_closing(df_in):    
    closing_time = ' 18:00:00'
    if df_in.created_datedayofweek == 'Sat': closing_time = ' 21:00:00'
    ts_close = pd.Timestamp(str(df_in.created_date).split()[0] + closing_time)
    ts_close = ts_close.tz_localize('US/Eastern')
    ts_tag_error = df_in.created_dateformat_est > ts_close
    #print df_in.created_dateformat_est, ts_close
    #print ts_tag_error
    return int(ts_tag_error)

# perform analysis on a subset of the data
data_slice = 10000
df_query = df[:data_slice].copy()
# looks like there are some objects being tagged after closing time!!!
df_query['tagged_after_close'] = df_query.apply(tagged_after_closing, axis=1)
df_tagged_ts_err = df_query[df_query.tagged_after_close == 1]
print "Num Observations in Slice={0}, Num Errors in Slice:{1}".format(df_query.shape[0], df_tagged_ts_err.shape[0])
df_tagged_ts_err.head(10)

Num Observations in Slice=10000, Num Errors in Slice:110


Unnamed: 0,id,tool_id,bundle_id,refers_to_object_id,created,created_dateformat,created_date,created_dateformat_utc,created_dateformat_est,created_datedayofweek,tagged_after_close
7,7,0,55gr,18481649,1414536158,2014-10-28 22:42:38,2014-10-28,2014-10-28 22:42:38+00:00,2014-10-28 18:42:38-04:00,Tues,1
8,8,0,55gr,18575263,1414536160,2014-10-28 22:42:40,2014-10-28,2014-10-28 22:42:40+00:00,2014-10-28 18:42:40-04:00,Tues,1
11,11,0,55qn,18382947,1414625562,2014-10-29 23:32:42,2014-10-29,2014-10-29 23:32:42+00:00,2014-10-29 19:32:42-04:00,Weds,1
12,12,0,55qn,18609951,1414625565,2014-10-29 23:32:45,2014-10-29,2014-10-29 23:32:45+00:00,2014-10-29 19:32:45-04:00,Weds,1
13,13,0,55qX,18114821,1414626217,2014-10-29 23:43:37,2014-10-29,2014-10-29 23:43:37+00:00,2014-10-29 19:43:37-04:00,Weds,1
14,14,0,55qX,18222401,1414626220,2014-10-29 23:43:40,2014-10-29,2014-10-29 23:43:40+00:00,2014-10-29 19:43:40-04:00,Weds,1
34,34,0,56ca,18297899,1415143099,2014-11-04 23:18:19,2014-11-04,2014-11-04 23:18:19+00:00,2014-11-04 18:18:19-05:00,Tues,1
39,39,0,56ca,18468369,1415143087,2014-11-04 23:18:07,2014-11-04,2014-11-04 23:18:07+00:00,2014-11-04 18:18:07-05:00,Tues,1
43,43,0,56ca,18616669,1415144225,2014-11-04 23:37:05,2014-11-04,2014-11-04 23:37:05+00:00,2014-11-04 18:37:05-05:00,Tues,1
44,44,0,56ca,18620731,1415143074,2014-11-04 23:17:54,2014-11-04,2014-11-04 23:17:54+00:00,2014-11-04 18:17:54-05:00,Tues,1


### Conditions where the refers_to_object_id == 0 and tool_id == 0

In [5]:
df_objectlabel_err = df[(df.tool_id == 0) & (df.refers_to_object_id == 0)]
print "Num Observations in Error: {0}".format(df_objectlabel_err.shape[0])
df_objectlabel_err.head()

Num Observations in Error: 239


Unnamed: 0,id,tool_id,bundle_id,refers_to_object_id,created,created_dateformat,created_date,created_dateformat_utc,created_dateformat_est,created_datedayofweek
1365637,1365637,0,8Fj1T,0,1438540207,2015-08-02 18:30:07,2015-08-02,2015-08-02 18:30:07+00:00,2015-08-02 14:30:07-04:00,Sun
1443703,1443703,0,8GCU4,0,1439239149,2015-08-10 20:39:09,2015-08-10,2015-08-10 20:39:09+00:00,2015-08-10 16:39:09-04:00,Mon
2797705,2797705,0,95TW8,0,1451147746,2015-12-26 16:35:46,2015-12-26,2015-12-26 16:35:46+00:00,2015-12-26 11:35:46-05:00,Sat
2802207,2802207,0,95VZX,0,1451350688,2015-12-29 00:58:08,2015-12-29,2015-12-29 00:58:08+00:00,2015-12-28 19:58:08-05:00,Tues
2817837,2817837,0,968uM,0,1451169457,2015-12-26 22:37:37,2015-12-26,2015-12-26 22:37:37+00:00,2015-12-26 17:37:37-05:00,Sat
