In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#Import statsmodels package for training a linear regression model.
import statsmodels.formula.api as sm

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

ImportError: No module named 'sklearn.model_selection'

# Section 1 - Business/Problem Understanding


# Section 2 - Data Understanding

In [15]:
#Read in the data from a csv file using Pandas
df = pd.read_csv("bus_data/cleaned_data/line15.csv", low_memory=False, header=None)
df.columns = ["Timestamp", "LineID", "Direction", "JourneyPatternID", "TimeFrame", 
              "VehicleJourneyID", "Operator", "Congestion", "Lon", "Lat", 
              "Delay", "BlockID", "VehicleID", "StopID", "AtStop"]

In [16]:
#Select all columns of type 'object'
object_columns = df.select_dtypes(['object']).columns

In [17]:
#Convert selected columns to type 'category'
for column in object_columns:
    df[column] = df[column].astype('category')

In [18]:
# Convert Unix timestamp to datetime

# Take uneccesary trailing zeroes off of unix timestamp. 
df['Timestamp'] = df['Timestamp'].apply(lambda x: x//1000000)

# Convert the timestamp
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')

In [19]:
# Add day of week column
df['Day'] = df['Timestamp'].dt.dayofweek

In [20]:
# Add hour of day column
df['Hour'] = df['Timestamp'].dt.hour

In [21]:
# Convert some features to categorical
for column in ['LineID', 'Direction','VehicleJourneyID', 'Congestion', 'BlockID', 'VehicleID', 'AtStop','Day','Hour']:
    df[column] = df[column].astype('category')

In [22]:
df.dtypes

Timestamp           datetime64[ns]
LineID                    category
Direction                 category
JourneyPatternID          category
TimeFrame                 category
VehicleJourneyID          category
Operator                  category
Congestion                category
Lon                        float64
Lat                        float64
Delay                        int64
BlockID                   category
VehicleID                 category
StopID                    category
AtStop                    category
Day                       category
Hour                      category
dtype: object

### Continuous Columns

In [23]:
# Data preparation - Select columns containing continuous data
continuous_columns = df[['Lon','Lat','Delay']].columns

In [24]:
#Drop the duplicates, but keep the first occurance of when they occur
df = df.drop_duplicates(keep='first')

In [33]:
df = df.loc[df['JourneyPatternID'] == '00150001']
df

Unnamed: 0,Timestamp,LineID,Direction,JourneyPatternID,TimeFrame,VehicleJourneyID,Operator,Congestion,Lon,Lat,Delay,BlockID,VehicleID,StopID,AtStop,Day,Hour
0,2012-11-06 00:00:00,15,0,00150001,2012-11-05,5826,RD,0,-6.258584,53.340099,-361,15013,33210,4870,0,1,0
3,2012-11-06 00:00:10,15,0,00150001,2012-11-05,5843,RD,0,-6.323327,53.277756,-463,15021,33254,4869,0,1,0
6,2012-11-06 00:00:19,15,0,00150001,2012-11-05,5826,RD,0,-6.257967,53.342365,-361,15013,33210,4870,0,1,0
9,2012-11-06 00:00:31,15,0,00150001,2012-11-05,5843,RD,0,-6.327923,53.276974,-490,15021,33254,4869,0,1,0
13,2012-11-06 00:00:39,15,0,00150001,2012-11-05,5826,RD,0,-6.257433,53.342899,-361,15013,33210,4870,0,1,0
18,2012-11-06 00:00:49,15,0,00150001,2012-11-05,5843,RD,0,-6.331139,53.276196,-505,15021,33254,4870,0,1,0
21,2012-11-06 00:00:59,15,0,00150001,2012-11-05,5826,RD,0,-6.254167,53.342182,-361,15013,33210,4870,0,1,0
23,2012-11-06 00:01:09,15,0,00150001,2012-11-05,5843,RD,0,-6.331205,53.276012,-487,15021,33254,3007,0,1,0
26,2012-11-06 00:01:19,15,0,00150001,2012-11-05,5826,RD,0,-6.251433,53.342201,-361,15013,33210,4870,0,1,0
28,2012-11-06 00:01:30,15,0,00150001,2012-11-05,5843,RD,0,-6.331040,53.274563,-487,15021,33254,3007,0,1,0


In [None]:
morning = df[(df.VehicleJourneyID) & (df.Timef == 0) & (df.TimeFrame == "2012-11-20")]

In [58]:
duration = start-end
duration.head()

VehicleJourneyID
2564   -5 days +23:19:19
2565   -5 days +23:22:01
2566   -5 days +22:50:31
2567   -5 days +23:08:51
2568   -5 days +23:08:56
Name: Timestamp, dtype: timedelta64[ns]

In [29]:
Trips = df.groupby(['TimeFrame'])

In [30]:
Trips.VehicleJourneyID.value_counts()

TimeFrame   VehicleJourneyID
2012-11-05  5843                 97
            5929                 70
            3339                 64
            3130                 58
            5582                 51
            5826                 17
            3118                  4
2012-11-06  3282                494
            5912                493
            5833                394
            5799                385
            5816                378
            5900                378
            3280                377
            5921                372
            5903                371
            5806                362
            5790                360
            5823                360
            5887                359
            5936                356
            5893                354
            3257                353
            5838                353
            5782                350
            5890                350
            5951                348

Transform Time Frame to exclude the night before and night after the journey's main day of operation

In [83]:
morning = df[(df.VehicleJourneyID == 2564) & (df.AtStop == 0) & (df.TimeFrame == "2012-11-20")]
morning

Unnamed: 0,Timestamp,JourneyPatternID,TimeFrame,VehicleJourneyID,Congestion,Lon,Lat,Delay,VehicleID,StopID,AtStop,Day,Hour
1245329,2012-11-20 06:24:04,,2012-11-20,2564,0,-6.334267,53.307049,0,33484,1105,1,1,6
1245348,2012-11-20 06:24:24,015A1001,2012-11-20,2564,0,-6.334267,53.307049,0,33484,1105,1,1,6
1245364,2012-11-20 06:24:42,015A1001,2012-11-20,2564,0,-6.334267,53.307049,0,33484,1105,1,1,6
1245457,2012-11-20 06:26:04,015A1001,2012-11-20,2564,0,-6.332076,53.30439,-21,33484,1107,1,1,6
1245520,2012-11-20 06:27:03,015A1001,2012-11-20,2564,0,-6.325751,53.304428,-21,33484,1109,1,1,6
1245589,2012-11-20 06:28:05,015A1001,2012-11-20,2564,0,-6.322103,53.305256,-21,33484,1111,1,1,6
1245656,2012-11-20 06:29:04,015A1001,2012-11-20,2564,0,-6.318578,53.307621,-58,33484,1112,1,1,6
1245721,2012-11-20 06:30:03,015A1001,2012-11-20,2564,0,-6.310774,53.310291,-58,33484,1114,1,1,6
1245826,2012-11-20 06:31:23,015A1001,2012-11-20,2564,0,-6.30252,53.312592,-37,33484,2437,1,1,6
1245975,2012-11-20 06:33:23,015A1001,2012-11-20,2564,0,-6.28799,53.310123,-83,33484,1120,1,1,6


In [94]:
duration = morning.Timestamp.max() - morning.Timestamp.min()
duration

Timedelta('0 days 00:36:39')

In [99]:
def hours_minutes(td):
    return td.seconds//3600, (td.seconds//60)%60

In [100]:
print(hours_minutes(duration))

(0, 36)


### Data Quality plan

## Data Quality Plan

| Feature                 | Data Quality Issue   | Handling Strategy             |
|-------------------------|----------------------|-------------------------------|
| Timestamp               | None                 | Retain Feature                |



### Data Relationships

# Section 4 -  Predictive Modeling

# Section 5 - Evaluation