## Trip Lab notebook PREP V8
Prepare dataset with numberic values, boolean values, duration features from dates, and date attributes.
Saturate the outlier y values to +/- 30 minutes. Create one-hot encoded features for some date features.
Reduce one-hot date features. Make date attributes integers.

Based on what is known before the driver starts driving, predict when the driver will arrive at the pick up location relative to the expected start time. What factors are most predictive of lateness?

In [1]:
import pandas as pd
import numpy as np

In [2]:
dateColNames=['canceled_at_pdt', 'claimed_at_pdt', 'departs_for_trip_at_pdt',
       'created_at_pdt', 'scheduled_starts_at_pdt', 'scheduled_ends_at_pdt',
       'origin_arrived_at_pdt', 'origin_departed_at_pdt',
       'destination_arrived_at_pdt', 'lead_organizer_created_at_pdt',
       'driver_created_at_pdt']

In [3]:
df=pd.read_csv('/Users/bob 2/Projects/TripsData2017.txt', sep='\t', parse_dates=dateColNames)

In [4]:
df.shape

(217348, 74)

In [5]:
list(df)

['id',
 'trip_state',
 'canceled_by',
 'canceled_at_pdt',
 'canceled_before_scheduled_start',
 'claimed_at_pdt',
 'driver_id',
 'lead_organizer_id',
 'creator_id',
 'carpool',
 'is_repeating_ride',
 'shuttle',
 'trip_template_id',
 'departs_for_trip_at_pdt',
 'time_anchor',
 'created_at_pdt',
 'scheduled_starts_at_pdt',
 'scheduled_ends_at_pdt',
 'origin_arrived_at_pdt',
 'origin_departed_at_pdt',
 'destination_arrived_at_pdt',
 'origin_location_id',
 'destination_location_id',
 'origin_region_id',
 'origin_analysis_metro_name',
 'destination_region_id',
 'destination_region_name',
 'origin_region_name',
 'origin_metro_area_name',
 'destination_metro_area_name',
 'destination_analysis_metro_name',
 'origin_lat',
 'origin_lon',
 'destination_lat',
 'destination_lon',
 'passengers_ids',
 'organizers_count',
 'passengers_count',
 'driver_home_lat',
 'driver_home_lon',
 'organizer_home_lat',
 'organizer_home_lon',
 'rating',
 'canceled_by_id',
 'route_legs_count',
 'start_waypoints_zipcode

In [6]:
df.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217348 entries, 0 to 217347
Columns: 74 entries, id to driver_has_driven_this_route_before
dtypes: bool(8), datetime64[ns](11), float64(29), int64(5), object(21)
memory usage: 111.1+ MB


In [7]:
df.info

<bound method DataFrame.info of             id trip_state canceled_by     canceled_at_pdt  \
0       367322   canceled    Employee 2017-03-08 13:18:55   
1       492666   canceled    Employee 2017-09-11 09:09:43   
2       527985   canceled    Employee 2017-09-27 14:02:52   
3       415557   canceled    Employee 2017-05-22 13:47:32   
4       317210   canceled    Employee 2017-01-18 10:14:22   
5       338150   complete         NaN                 NaT   
6       310958   complete         NaN                 NaT   
7       338135   complete         NaN                 NaT   
8       324615   complete         NaN                 NaT   
9       247127   complete         NaN                 NaT   
10      306833   complete         NaN                 NaT   
11      436184   complete         NaN                 NaT   
12      473473   complete         NaN                 NaT   
13      354050   complete         NaN                 NaT   
14      415507   complete         NaN                

#### columns_from_the_future must not be used in predictions but some are needed to label the data

In [8]:
columns_from_the_future = ['trip_state', 'canceled_by', 'canceled_at_pdt', 'canceled_before_scheduled_start', 'departs_for_trip_at_pdt', 'origin_arrived_at_pdt', 'origin_departed_at_pdt', 'destination_arrived_at_pdt', 'rating', 'trip_paid_time', 'driver_fare_multiplier', 'events', 'has_cd_unclaimed_event', 'is_unfilled_canceled', 'is_cd_cancel', 'canceled_by_id', 'commute_distance', 'predicted_driver_fare_with_multiplier']
print(columns_from_the_future)
print(len(columns_from_the_future))

['trip_state', 'canceled_by', 'canceled_at_pdt', 'canceled_before_scheduled_start', 'departs_for_trip_at_pdt', 'origin_arrived_at_pdt', 'origin_departed_at_pdt', 'destination_arrived_at_pdt', 'rating', 'trip_paid_time', 'driver_fare_multiplier', 'events', 'has_cd_unclaimed_event', 'is_unfilled_canceled', 'is_cd_cancel', 'canceled_by_id', 'commute_distance', 'predicted_driver_fare_with_multiplier']
18


In [9]:
list(df.select_dtypes(['object']).columns)

['trip_state',
 'canceled_by',
 'time_anchor',
 'origin_analysis_metro_name',
 'destination_region_name',
 'origin_region_name',
 'origin_metro_area_name',
 'destination_metro_area_name',
 'destination_analysis_metro_name',
 'passengers_ids',
 'rating',
 'start_waypoints_zipcodes',
 'end_waypoints_zipcodes',
 'lead_organizer_platform',
 'lead_organizer_app_version',
 'driver_platform',
 'driver_app_version',
 'driver_gender',
 'coupon',
 'events',
 'has_cd_unclaimed_event']

In [10]:
df.dtypes

id                                                  int64
trip_state                                         object
canceled_by                                        object
canceled_at_pdt                            datetime64[ns]
canceled_before_scheduled_start                   float64
claimed_at_pdt                             datetime64[ns]
driver_id                                         float64
lead_organizer_id                                   int64
creator_id                                          int64
carpool                                              bool
is_repeating_ride                                    bool
shuttle                                              bool
trip_template_id                                  float64
departs_for_trip_at_pdt                    datetime64[ns]
time_anchor                                        object
created_at_pdt                             datetime64[ns]
scheduled_starts_at_pdt                    datetime64[ns]
scheduled_ends

#### pick features to use as input to prediction. here just numerics, booleans, durations, date attributes and not those from the future.
todo: object features as categorical dummies, saturate outliers in data set

In [11]:
# compute duration features
durationTuples=[('claimed_before_trip_start_secs', 'claimed_at_pdt', 'scheduled_starts_at_pdt'), 
                ('trip_created_before_trip_start_secs', 'created_at_pdt', 'scheduled_starts_at_pdt'),
                ('trip_est_duration_secs', 'scheduled_starts_at_pdt', 'scheduled_ends_at_pdt'),
                ('org_signup_before_trip_start_secs', 'lead_organizer_created_at_pdt', 'scheduled_starts_at_pdt'),
                ('driver_signup_before_trip_start_secs', 'driver_created_at_pdt', 'scheduled_starts_at_pdt'),
                ('trip_claimed_after_trip_created', 'claimed_at_pdt', 'created_at_pdt'),
                ('driver_signup_before_trip_claimed_secs', 'driver_created_at_pdt', 'claimed_at_pdt'),
                ('org_signup_before_trip_created_secs', 'lead_organizer_created_at_pdt', 'created_at_pdt'),
                ('driver_signup_before_trip_created_secs', 'driver_created_at_pdt', 'created_at_pdt'),
                ('driver_signup_before_org_signup_secs', 'driver_created_at_pdt', 'lead_organizer_created_at_pdt')
                ]

#duration_col_name, from_time, to_time = durationTuples[1]
#print (duration_col_name, from_time, to_time)
#print (df.iloc[0][[from_time, to_time]])
#print((df.iloc[0][to_time] - df.iloc[0][from_time]).total_seconds())
#df[duration_col_name] = (df[to_time] - df[from_time]).dt.total_seconds()
#df[duration_col_name].head()

for duration_col_name, from_time, to_time in durationTuples:
    print("creating %s from %s - %s" % (duration_col_name, to_time, from_time))
    df[duration_col_name] = (df[to_time] - df[from_time]).dt.total_seconds()

creating claimed_before_trip_start_secs from scheduled_starts_at_pdt - claimed_at_pdt
creating trip_created_before_trip_start_secs from scheduled_starts_at_pdt - created_at_pdt
creating trip_est_duration_secs from scheduled_ends_at_pdt - scheduled_starts_at_pdt
creating org_signup_before_trip_start_secs from scheduled_starts_at_pdt - lead_organizer_created_at_pdt
creating driver_signup_before_trip_start_secs from scheduled_starts_at_pdt - driver_created_at_pdt
creating trip_claimed_after_trip_created from created_at_pdt - claimed_at_pdt
creating driver_signup_before_trip_claimed_secs from claimed_at_pdt - driver_created_at_pdt
creating org_signup_before_trip_created_secs from created_at_pdt - lead_organizer_created_at_pdt
creating driver_signup_before_trip_created_secs from created_at_pdt - driver_created_at_pdt
creating driver_signup_before_org_signup_secs from lead_organizer_created_at_pdt - driver_created_at_pdt


In [12]:
# compute date attributes
# which dates are not from the future
# make any missing values the lowest number
date_columns_to_process=list(set(df.select_dtypes(include=['datetime64[ns]']).columns) - set(columns_from_the_future))
print(date_columns_to_process)
for col_name in date_columns_to_process:
    print("creating date attributes from %s" % col_name)
    df[col_name + "_quarter"]    = df[col_name].dt.quarter.replace([np.inf, -np.inf, np.nan], 1).astype(int)
    df[col_name + "_month"]      = df[col_name].dt.month.replace([np.inf, -np.inf, np.nan], 1).astype(int)
    df[col_name + "_day"]        = df[col_name].dt.day.replace([np.inf, -np.inf, np.nan], 1).astype(int)
    df[col_name + "_hour"]       = df[col_name].dt.hour.replace([np.inf, -np.inf, np.nan], 1).astype(int)
    df[col_name + "_weekday"]    = df[col_name].dt.weekday.replace([np.inf, -np.inf, np.nan], 0).astype(int)
    df[col_name + "_weekofyear"] = df[col_name].dt.weekofyear.replace([np.inf, -np.inf, np.nan], 1).astype(int)
    df[col_name + "_dayofyear"]  = df[col_name].dt.dayofyear.replace([np.inf, -np.inf, np.nan], 1).astype(int)

['scheduled_starts_at_pdt', 'driver_created_at_pdt', 'scheduled_ends_at_pdt', 'created_at_pdt', 'claimed_at_pdt', 'lead_organizer_created_at_pdt']
creating date attributes from scheduled_starts_at_pdt
creating date attributes from driver_created_at_pdt
creating date attributes from scheduled_ends_at_pdt
creating date attributes from created_at_pdt
creating date attributes from claimed_at_pdt
creating date attributes from lead_organizer_created_at_pdt


In [13]:
# select the date columns to make one-hot columns from
one_hot_source_columns=['scheduled_starts_at_pdt_quarter','scheduled_starts_at_pdt_month',
                        'scheduled_starts_at_pdt_hour','scheduled_starts_at_pdt_weekday',
                        'claimed_at_pdt_quarter','claimed_at_pdt_month',
                        'claimed_at_pdt_hour','claimed_at_pdt_weekday'
                       ]
print(one_hot_source_columns)

['scheduled_starts_at_pdt_quarter', 'scheduled_starts_at_pdt_month', 'scheduled_starts_at_pdt_hour', 'scheduled_starts_at_pdt_weekday', 'claimed_at_pdt_quarter', 'claimed_at_pdt_month', 'claimed_at_pdt_hour', 'claimed_at_pdt_weekday']


In [14]:
# create date one-hot features from those columns
# added drop_first=True to avoid co-linearity
date_dummies = pd.get_dummies(df[one_hot_source_columns],
                              columns=one_hot_source_columns,
                              prefix=one_hot_source_columns,
                              drop_first=True)
date_dummies.head()

Unnamed: 0,scheduled_starts_at_pdt_quarter_2,scheduled_starts_at_pdt_quarter_3,scheduled_starts_at_pdt_quarter_4,scheduled_starts_at_pdt_month_2,scheduled_starts_at_pdt_month_3,scheduled_starts_at_pdt_month_4,scheduled_starts_at_pdt_month_5,scheduled_starts_at_pdt_month_6,scheduled_starts_at_pdt_month_7,scheduled_starts_at_pdt_month_8,...,claimed_at_pdt_hour_20,claimed_at_pdt_hour_21,claimed_at_pdt_hour_22,claimed_at_pdt_hour_23,claimed_at_pdt_weekday_1,claimed_at_pdt_weekday_2,claimed_at_pdt_weekday_3,claimed_at_pdt_weekday_4,claimed_at_pdt_weekday_5,claimed_at_pdt_weekday_6
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [15]:
date_dummies.columns

Index(['scheduled_starts_at_pdt_quarter_2',
       'scheduled_starts_at_pdt_quarter_3',
       'scheduled_starts_at_pdt_quarter_4', 'scheduled_starts_at_pdt_month_2',
       'scheduled_starts_at_pdt_month_3', 'scheduled_starts_at_pdt_month_4',
       'scheduled_starts_at_pdt_month_5', 'scheduled_starts_at_pdt_month_6',
       'scheduled_starts_at_pdt_month_7', 'scheduled_starts_at_pdt_month_8',
       'scheduled_starts_at_pdt_month_9', 'scheduled_starts_at_pdt_month_10',
       'scheduled_starts_at_pdt_month_11', 'scheduled_starts_at_pdt_month_12',
       'scheduled_starts_at_pdt_hour_1', 'scheduled_starts_at_pdt_hour_2',
       'scheduled_starts_at_pdt_hour_3', 'scheduled_starts_at_pdt_hour_4',
       'scheduled_starts_at_pdt_hour_5', 'scheduled_starts_at_pdt_hour_6',
       'scheduled_starts_at_pdt_hour_7', 'scheduled_starts_at_pdt_hour_8',
       'scheduled_starts_at_pdt_hour_9', 'scheduled_starts_at_pdt_hour_10',
       'scheduled_starts_at_pdt_hour_11', 'scheduled_starts_at_pdt_ho

In [16]:
df = pd.concat([df, date_dummies], axis=1)
df.head()

Unnamed: 0,id,trip_state,canceled_by,canceled_at_pdt,canceled_before_scheduled_start,claimed_at_pdt,driver_id,lead_organizer_id,creator_id,carpool,...,claimed_at_pdt_hour_20,claimed_at_pdt_hour_21,claimed_at_pdt_hour_22,claimed_at_pdt_hour_23,claimed_at_pdt_weekday_1,claimed_at_pdt_weekday_2,claimed_at_pdt_weekday_3,claimed_at_pdt_weekday_4,claimed_at_pdt_weekday_5,claimed_at_pdt_weekday_6
0,367322,canceled,Employee,2017-03-08 13:18:55,18.68,2017-03-08 13:16:39,62908.0,87303,87303,False,...,0,0,0,0,0,1,0,0,0,0
1,492666,canceled,Employee,2017-09-11 09:09:43,0.84,2017-08-31 12:33:50,62930.0,78527,78527,False,...,0,0,0,0,0,0,1,0,0,0
2,527985,canceled,Employee,2017-09-27 14:02:52,1.07,2017-09-27 13:59:44,62930.0,78527,78527,False,...,0,0,0,0,0,1,0,0,0,0
3,415557,canceled,Employee,2017-05-22 13:47:32,7.47,2017-05-22 13:18:52,62930.0,131139,131139,False,...,0,0,0,0,0,0,0,0,0,0
4,317210,canceled,Employee,2017-01-18 10:14:22,95.76,2017-01-18 10:12:03,62966.0,101326,101326,False,...,0,0,0,0,0,1,0,0,0,0


In [17]:
# confirm added 275 more features
df.shape

(217348, 212)

In [18]:
df.dtypes

id                                          int64
trip_state                                 object
canceled_by                                object
canceled_at_pdt                    datetime64[ns]
canceled_before_scheduled_start           float64
claimed_at_pdt                     datetime64[ns]
driver_id                                 float64
lead_organizer_id                           int64
creator_id                                  int64
carpool                                      bool
is_repeating_ride                            bool
shuttle                                      bool
trip_template_id                          float64
departs_for_trip_at_pdt            datetime64[ns]
time_anchor                                object
created_at_pdt                     datetime64[ns]
scheduled_starts_at_pdt            datetime64[ns]
scheduled_ends_at_pdt              datetime64[ns]
origin_arrived_at_pdt              datetime64[ns]
origin_departed_at_pdt             datetime64[ns]


In [19]:
# which booleans are not from the future
bool_columns_to_convert=list(set(df.select_dtypes(include=['bool']).columns) - set(columns_from_the_future))
print(bool_columns_to_convert)

['is_affiliate', 'is_same_day_ride', 'shuttle', 'is_repeating_ride', 'driver_has_driven_this_route_before', 'carpool']


In [20]:
##df[bool_columns_to_convert].astype(int).head()
bool_columns_as_int_names=[s + "_int" for s in bool_columns_to_convert]
# create new int columns containing 1,0 from bool columns which are True,False
df[bool_columns_as_int_names] = df[bool_columns_to_convert].astype(int)
df[bool_columns_as_int_names].head()

Unnamed: 0,is_affiliate_int,is_same_day_ride_int,shuttle_int,is_repeating_ride_int,driver_has_driven_this_route_before_int,carpool_int
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,1,0,0,0,0
3,0,1,0,0,0,0
4,0,0,0,0,0,0


In [21]:
feature_columns_to_use=list(set(df.select_dtypes(include=['number']).columns) - set(columns_from_the_future))
print(feature_columns_to_use)

['scheduled_ends_at_pdt_weekday', 'scheduled_starts_at_pdt_month_7', 'claimed_at_pdt_weekday_3', 'trip_predicted_raw_fare', 'scheduled_starts_at_pdt_hour_7', 'total_predicted_distance_miles', 'driver_signup_before_trip_created_secs', 'scheduled_starts_at_pdt_hour_2', 'scheduled_starts_at_pdt_day', 'scheduled_ends_at_pdt_hour', 'organizers_count', 'claimed_at_pdt_month_4', 'driver_created_at_pdt_hour', 'claimed_at_pdt_quarter_4', 'driver_created_at_pdt_weekofyear', 'claimed_at_pdt_hour_22', 'is_same_day_ride_int', 'origin_lat', 'route_legs_count', 'scheduled_starts_at_pdt_month_3', 'lead_organizer_created_at_pdt_weekofyear', 'driver_signup_before_org_signup_secs', 'scheduled_starts_at_pdt_month_12', 'claimed_at_pdt_hour', 'claimed_at_pdt_weekday_2', 'lead_organizer_id', 'scheduled_starts_at_pdt_hour_21', 'claimed_at_pdt_month_12', 'claimed_before_trip_start_secs', 'scheduled_starts_at_pdt_month_4', 'scheduled_starts_at_pdt_month_6', 'driver_created_at_pdt_day', 'claimed_at_pdt_hour_23',

#### compute arrived_late label, ah should be 5 min late. At 0 seconds, 9% of trips are "late"!

In [22]:
# note this is just for testing, the real calc is below see "materialize the prediction target"
arrival_time_df=df[['id','scheduled_starts_at_pdt','origin_arrived_at_pdt']]
arrival_time_df.shape
arrival_time_df=arrival_time_df.assign(
    arrived_seconds_after_scheduled_start=lambda x:
        (x['origin_arrived_at_pdt'] - x['scheduled_starts_at_pdt']).dt.total_seconds().clip(-1800,1800)
)

# show number of nulls
arrival_time_df.isnull().sum()



id                                           0
scheduled_starts_at_pdt                      0
origin_arrived_at_pdt                    91673
arrived_seconds_after_scheduled_start    91673
dtype: int64

In [23]:
arrival_time_df.dropna().shape


(125675, 4)

In [24]:
arrival_time_df.dropna().hist(column='arrived_seconds_after_scheduled_start',bins=np.linspace(-1800,1800,60),grid=False)


array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1100139e8>]],
      dtype=object)

In [25]:
# arrives 7 minutes early on average partially due to saturation
arrival_time_df[['arrived_seconds_after_scheduled_start']].dropna().mean()/60

arrived_seconds_after_scheduled_start   -6.93286
dtype: float64

In [26]:
# most often arrives 30 minutes early because of saturation!
arrival_time_df[['arrived_seconds_after_scheduled_start']].dropna().mode()/60

Unnamed: 0,arrived_seconds_after_scheduled_start
0,-30.0


In [27]:
# show rows and label to verify logic
# arrived_late_df[arrived_late_df['label']].head # version where label is TRUE
arrival_time_df.head()

Unnamed: 0,id,scheduled_starts_at_pdt,origin_arrived_at_pdt,arrived_seconds_after_scheduled_start
0,367322,2017-03-09 08:00:00,NaT,
1,492666,2017-09-11 10:00:00,NaT,
2,527985,2017-09-27 15:07:00,NaT,
3,415557,2017-05-22 21:16:00,NaT,
4,317210,2017-01-22 10:00:00,NaT,


In [28]:
# features look ok, contain some NaNs
df[feature_columns_to_use].head()

Unnamed: 0,scheduled_ends_at_pdt_weekday,scheduled_starts_at_pdt_month_7,claimed_at_pdt_weekday_3,trip_predicted_raw_fare,scheduled_starts_at_pdt_hour_7,total_predicted_distance_miles,driver_signup_before_trip_created_secs,scheduled_starts_at_pdt_hour_2,scheduled_starts_at_pdt_day,scheduled_ends_at_pdt_hour,...,claimed_at_pdt_weekofyear,lead_organizer_created_at_pdt_hour,scheduled_starts_at_pdt_hour_10,claimed_at_pdt_hour_20,claimed_at_pdt_hour_12,destination_region_id,scheduled_starts_at_pdt_month_10,total_predicted_duration,claimed_at_pdt_quarter_2,claimed_at_pdt_hour_2
0,3,0,0,,0,1.0,21333580.0,0,9,8,...,10,20,0,0,0,25.0,0,13.75,0,0
1,0,0,1,16.0,0,0.0,36535721.0,0,11,10,...,35,19,1,0,1,33.0,0,6.72,0,0
2,2,0,0,16.0,0,0.0,38875396.0,0,27,15,...,39,19,0,0,0,33.0,0,4.67,0,0
3,0,0,0,22.11,0,9.0,27813694.0,0,22,21,...,21,14,0,0,0,34.0,0,18.9,1,0
4,6,0,0,16.0,0,2.0,17088930.0,0,22,10,...,3,14,1,0,0,36.0,0,15.82,0,0


In [29]:
# hmmm too many NaNs
df[feature_columns_to_use].dropna().shape

(64, 172)

In [30]:
# replace NaNs with 0s, although another solution for missing latlons is probably better
df[feature_columns_to_use].fillna(0).shape

(217348, 172)

In [31]:
# eliminate rows where on-time arrival cannot be computed (canceled before arrival, possibly filled).
df.loc[df['origin_arrived_at_pdt'].notnull()].shape

(125675, 218)

In [32]:
# how many observations of each final state do we have left where on-time arrival can be labeled
df.loc[df['origin_arrived_at_pdt'].notnull()].groupby(['trip_state']).size()

trip_state
canceled      1529
complete    124146
dtype: int64

In [33]:
# how many are filled; all should be but 1 wierd trip arrived with no driver! 
# chaotic race condition when scheduled start time approaching, cancel by organizer, removing driver, and driver arriving 
# It will be filled with 0 which is ok.
df.loc[df['origin_arrived_at_pdt'].notnull()].groupby(['trip_state',df.driver_id.notnull()]).size()

trip_state  driver_id
canceled    False             1
            True           1528
complete    True         124146
dtype: int64

In [34]:
# materialize the useful dataset for training and testing
df_X=df.loc[df['origin_arrived_at_pdt'].notnull()][feature_columns_to_use].fillna(0)
df_X.shape

(125675, 172)

In [35]:
# materialize the prediction target (arrived_seconds_after_scheduled_start)
y=(df.loc[df['origin_arrived_at_pdt'].notnull()].origin_arrived_at_pdt 
   - df.loc[df['origin_arrived_at_pdt'].notnull()].scheduled_starts_at_pdt).dt.total_seconds().clip(-1800,1800)

# describe and convert from scientific notation
y.describe().apply(lambda x: '%.0f' % x)

count    125675
mean       -416
std         509
min       -1800
25%        -657
50%        -324
75%         -96
max        1800
dtype: object

In [36]:
# write out features (df_X) and targets (y) into files
fileprefix='./lateness_PREP_V8_'
df_X.to_csv(fileprefix+'X.csv', sep='\t')


In [37]:
y.to_csv(path=fileprefix+'y.csv', sep='\t')

In [38]:
y.shape

(125675,)

In [39]:
y.head()

5   -497.0
6      5.0
7   -313.0
8   -296.0
9   -166.0
dtype: float64