In [1]:
# general 
import datetime

# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#Import statsmodels package for training a linear regression model.
import statsmodels.formula.api as sm

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Importing and Cleaning Data

In [2]:
#Read in the data from a csv file using Pandas
df = pd.read_csv("bus_data/cleaned_data/line15.csv", low_memory=False, header=None)
df.columns = ["Timestamp", "LineID", "Direction", "JourneyPatternID", "TimeFrame", 
              "VehicleJourneyID", "Operator", "Congestion", "Lon", "Lat", 
              "Delay", "BlockID", "VehicleID", "StopID", "AtStop"]

In [3]:
#Select all columns of type 'object'
object_columns = df.select_dtypes(['object']).columns

In [4]:
#Convert selected columns to type 'category'
for column in object_columns:
    df[column] = df[column].astype('category')   

In [5]:
# Convert Unix timestamp to datetime

# Take uneccesary trailing zeroes off of unix timestamp. 
df['Timestamp'] = df['Timestamp'].apply(lambda x: x//1000000)

# Convert the timestamp
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')

In [6]:
# Add day of week column
df['Day'] = df['Timestamp'].dt.dayofweek

In [7]:
# Add hour of day column
df['Hour'] = df['Timestamp'].dt.hour

In [8]:
# Convert some features to categorical
for column in ['LineID', 'Direction','VehicleJourneyID', 'Congestion', 'BlockID', 'VehicleID', 'AtStop','Day','Hour']:
    df[column] = df[column].astype('category')

In [9]:
# Remove irrelevant features
df = df.drop('BlockID', 1)
df = df.drop('Operator', 1)
df = df.drop('Delay', 1)
df = df.drop('Congestion', 1)

In [10]:
# Reduce df to journey pattern we are interested in
df = df.loc[df['JourneyPatternID'] == '00150001']

## Creating Basetable

In [22]:
# Making list of dictionaries, to be turned into a dataframe

rows = []

for date in df.TimeFrame.unique():    
    temp_df = df[df.TimeFrame == date]
    
    for journey in temp_df.VehicleJourneyID.unique():
        
        inner_df = temp_df[temp_df.VehicleJourneyID == journey]
        
        row = {}
    
        duration = inner_df.Timestamp.max()-inner_df.Timestamp.min()
        
        if (duration > datetime.timedelta(minutes=30)):
    
            row['date'] = inner_df.Timestamp.min().date()
            row['hour'] = inner_df.Timestamp.min().hour
            row['day'] = inner_df.Timestamp.min().dayofweek
            row['duration'] = duration.seconds//60

            rows.append(row)

In [52]:
# Creating dataframe from the list we made in the last cell
basetable = pd.DataFrame(rows)

In [53]:
basetable.head(10)

Unnamed: 0,date,day,duration,hour
0,2012-11-06,1,105,6
1,2012-11-06,1,92,6
2,2012-11-06,1,93,6
3,2012-11-06,1,94,6
4,2012-11-06,1,148,6
5,2012-11-06,1,96,6
6,2012-11-06,1,108,6
7,2012-11-06,1,107,7
8,2012-11-06,1,107,7
9,2012-11-06,1,103,7


In [54]:
# Monday is 0 and Sunday is 6.
basetable['weekday'] = basetable['day'] < 5
basetable.weekday = basetable.weekday.astype(int)

In [55]:
basetable.head(5)

Unnamed: 0,date,day,duration,hour,weekday
0,2012-11-06,1,105,6,1
1,2012-11-06,1,92,6,1
2,2012-11-06,1,93,6,1
3,2012-11-06,1,94,6,1
4,2012-11-06,1,148,6,1


In [57]:
weather = pd.read_csv("bus_data/rain.csv", low_memory=False, header=None, sep = '\t')
#Read in the data from a csv file using Pandas
weather.columns = ['datetime','rain']
weather.head()

Unnamed: 0,datetime,rain
0,1/11/2012 0:00,0.0
1,1/11/2012 1:00,0.1
2,1/11/2012 2:00,0.0
3,1/11/2012 3:00,0.0
4,1/11/2012 4:00,0.0


In [58]:
weather.dtypes

datetime     object
rain        float64
dtype: object

In [59]:
type(weather.datetime[0])

str

In [60]:
weather.datetime = pd.to_datetime(weather['datetime'])

In [61]:
weather['hour'] = weather['datetime'].dt.hour
weather['date'] = weather['datetime'].dt.date

In [62]:
weather['day'] = weather['datetime'].dt.dayofweek

In [63]:
weather.dtypes

datetime    datetime64[ns]
rain               float64
hour                 int64
date                object
day                  int64
dtype: object

In [64]:
# frames = [basetable, weather]
# new_df = pd.concat(frames, axis=1)

In [69]:
new_df = pd.merge(basetable, weather, how='left' , on=['date','hour'])

In [70]:
new_df.head()

Unnamed: 0,date,day_x,duration,hour,weekday,datetime,rain,day_y
0,2012-11-06,1,105,6,1,NaT,,
1,2012-11-06,1,92,6,1,NaT,,
2,2012-11-06,1,93,6,1,NaT,,
3,2012-11-06,1,94,6,1,NaT,,
4,2012-11-06,1,148,6,1,NaT,,


In [71]:
new_df.dtypes

date                object
day_x                int64
duration             int64
hour                 int64
weekday              int32
datetime    datetime64[ns]
rain               float64
day_y              float64
dtype: object

In [73]:
#Zeros showing as nan
new_df.rain[0]

nan

In [74]:
new_df.describe()

Unnamed: 0,day_x,duration,hour,weekday,rain,day_y
count,4353.0,4353.0,4353.0,4353.0,2326.0,2326.0
mean,2.584654,99.937285,13.271307,0.839191,0.131083,2.567068
std,1.769784,11.663321,4.627051,0.367397,0.416221,1.802154
min,0.0,30.0,6.0,0.0,0.0,0.0
25%,1.0,95.0,9.0,1.0,0.0,1.0
50%,3.0,100.0,13.0,1.0,0.0,2.0
75%,4.0,105.0,17.0,1.0,0.0,4.0
max,6.0,254.0,23.0,1.0,4.0,6.0


In [80]:
new_df.fillna(0, inplace=True)

In [83]:
new_df.head()

Unnamed: 0,date,day_x,duration,hour,weekday,datetime,rain,day_y
0,2012-11-06,1,105,6,1,1970-01-01,0.0,0.0
1,2012-11-06,1,92,6,1,1970-01-01,0.0,0.0
2,2012-11-06,1,93,6,1,1970-01-01,0.0,0.0
3,2012-11-06,1,94,6,1,1970-01-01,0.0,0.0
4,2012-11-06,1,148,6,1,1970-01-01,0.0,0.0


In [84]:
new_df = new_df.drop('date', 1)
new_df = new_df.drop('datetime', 1)
new_df = new_df.drop('day_y', 1)

In [87]:
new_df.head()

Unnamed: 0,day,duration,hour,weekday,rain
0,1,105,6,1,0.0
1,1,92,6,1,0.0
2,1,93,6,1,0.0
3,1,94,6,1,0.0
4,1,148,6,1,0.0


In [86]:
new_df.columns = ['day','duration','hour','weekday','rain']


In [90]:
new_df.to_csv('00150001.rain.csv', encoding='utf-8')