In [73]:
# general 
import datetime

# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline




# Importing and Cleaning Data

In [115]:
line = "15"

In [116]:
# Read in our data
df = pd.read_csv("bus_data/clean_data3/" + line + ".csv", low_memory=False, header=None)
df.columns = ["Timestamp", "LineID", "JourneyPatternID", "TimeFrame", 
              "VehicleJourneyID", "Lon", "Lat", "VehicleID", "StopID", 
              "AtStop", "HumanTime", "Day", "Hour", "JourneyGroup", "Runtime"]

In [117]:
df.head()

Unnamed: 0,Timestamp,LineID,JourneyPatternID,TimeFrame,VehicleJourneyID,Lon,Lat,VehicleID,StopID,AtStop,HumanTime,Day,Hour,JourneyGroup,Runtime
0,1352182204000000,15,1,2012-11-06,5899,-6.151132,53.402328,33498,6318,0,2012-11-06 06:10:04,1,6,2012-11-065899,0
1,1352192390000000,15,1,2012-11-06,5828,-6.150883,53.402351,33523,6318,1,2012-11-06 08:59:50,1,8,2012-11-065828,0
2,1352192858000000,15,1,2012-11-06,5835,-6.150987,53.402309,33254,6318,1,2012-11-06 09:07:38,1,9,2012-11-065835,0
3,1352193393000000,15,1,2012-11-06,5853,-6.150987,53.402309,33209,6318,1,2012-11-06 09:16:33,1,9,2012-11-065853,0
4,1352194244000000,15,1,2012-11-06,5878,-6.151132,53.402328,33020,6318,0,2012-11-06 09:30:44,1,9,2012-11-065878,0


In [118]:
# read in weather data

weather = pd.read_csv("bus_data/hourly_weather_data.csv", low_memory=False)
#Read in the data from a csv file using Pandas
weather.tail()

Unnamed: 0,date,ind,rain,ind.1,temp,ind.2,wetb,dewpt,vappr,rhum,msl
86611,30-jun-2017 20:00,0,0.0,0,13.4,0,10.7,8.0,10.7,69,1016.5
86612,30-jun-2017 21:00,0,0.0,0,11.4,0,10.0,8.5,11.1,82,1017.2
86613,30-jun-2017 22:00,0,0.0,0,9.7,0,8.8,7.7,10.5,87,1017.6
86614,30-jun-2017 23:00,0,0.0,0,8.2,0,7.8,7.2,10.1,93,1017.9
86615,01-jul-2017 00:00,0,0.0,0,8.2,0,7.8,7.2,10.2,93,1018.2


### Weather Info

Station Name: PhoenixPark
Station Height: 48 M 
Latitude:53.358  ,Longitude: -6.342


date:  -  Date and Time (utc)
rain:  -  Precipitation Amount (mm)	  
temp:  -  Air Temperature (C)	
wetb:  -  Wet Bulb Temperature (C)
dewpt: -  Dew Point Temperature (C)
vappr: -  Vapour Pressure (hPa)		                 
rhum:  -  Relative Humidity (%) 
msl:   -  Mean Sea Level Pressure (hPa)
ind:   -  Indicator

In [119]:
weather['datetime'] = pd.to_datetime(weather['date'])

In [120]:
weather.head()

Unnamed: 0,date,ind,rain,ind.1,temp,ind.2,wetb,dewpt,vappr,rhum,msl,datetime
0,31-may-2007 01:00,-1,,4,,4,,,,,,2007-05-31 01:00:00
1,31-may-2007 02:00,-1,,4,,4,,,,,,2007-05-31 02:00:00
2,31-may-2007 03:00,-1,,4,,4,,,,,,2007-05-31 03:00:00
3,31-may-2007 04:00,-1,,4,,4,,,,,,2007-05-31 04:00:00
4,31-may-2007 05:00,-1,,4,,4,,,,,,2007-05-31 05:00:00


In [121]:
weather['Hour'] = weather['datetime'].dt.hour
weather['Date'] = weather['datetime'].dt.date
weather = weather.drop(['datetime'], axis=1)

In [122]:
# Dropping everything we don't want from weather 

weather = weather[['Date', 'Hour', 'rain']]

In [123]:
weather.head()

Unnamed: 0,Date,Hour,rain
0,2007-05-31,1,
1,2007-05-31,2,
2,2007-05-31,3,
3,2007-05-31,4,
4,2007-05-31,5,


## Merging data & weather

In [124]:
df['Date'] = pd.to_datetime(df['Timestamp'], unit='us')
df['Date'] = df['Date'].dt.date

In [134]:
# replacing empty cells with 0, converting to float
weather['rain'] = pd.to_numeric(weather['rain'], errors='coerce')
weather['rain'].fillna(0, inplace=True)

In [150]:
# Merging rain info with bus data
new_df = pd.merge(df, weather, how='left' , on=['Date','Hour'])

# renaming rain to Rain
new_df = new_df.rename(columns={'rain': 'Rain'})

In [155]:
new_df['Rain'] = new_df['Rain'].apply(lambda x: 1 if (x >= 1) else 0)

In [93]:
new_df.dtypes

Timestamp             int64
LineID                int64
JourneyPatternID      int64
TimeFrame            object
VehicleJourneyID      int64
Lon                 float64
Lat                 float64
VehicleID             int64
StopID                int64
AtStop                int64
HumanTime            object
Day                   int64
Hour                  int64
JourneyGroup         object
Runtime               int64
Date                 object
rain                 object
dtype: object

In [73]:
#Zeros showing as nan
new_df.rain[0]

nan

In [74]:
new_df.describe()

Unnamed: 0,day_x,duration,hour,weekday,rain,day_y
count,4353.0,4353.0,4353.0,4353.0,2326.0,2326.0
mean,2.584654,99.937285,13.271307,0.839191,0.131083,2.567068
std,1.769784,11.663321,4.627051,0.367397,0.416221,1.802154
min,0.0,30.0,6.0,0.0,0.0,0.0
25%,1.0,95.0,9.0,1.0,0.0,1.0
50%,3.0,100.0,13.0,1.0,0.0,2.0
75%,4.0,105.0,17.0,1.0,0.0,4.0
max,6.0,254.0,23.0,1.0,4.0,6.0


In [80]:
# new_df.fillna(0, inplace=True)

In [83]:
new_df.head()

Unnamed: 0,date,day_x,duration,hour,weekday,datetime,rain,day_y
0,2012-11-06,1,105,6,1,1970-01-01,0.0,0.0
1,2012-11-06,1,92,6,1,1970-01-01,0.0,0.0
2,2012-11-06,1,93,6,1,1970-01-01,0.0,0.0
3,2012-11-06,1,94,6,1,1970-01-01,0.0,0.0
4,2012-11-06,1,148,6,1,1970-01-01,0.0,0.0


In [84]:
new_df = new_df.drop('date', 1)
new_df = new_df.drop('datetime', 1)
new_df = new_df.drop('day_y', 1)

In [87]:
new_df.head()

Unnamed: 0,day,duration,hour,weekday,rain
0,1,105,6,1,0.0
1,1,92,6,1,0.0
2,1,93,6,1,0.0
3,1,94,6,1,0.0
4,1,148,6,1,0.0


In [86]:
new_df.columns = ['day','duration','hour','weekday','rain']


In [90]:
new_df.to_csv('00150001.rain.csv', encoding='utf-8')