In [50]:
# general 
import datetime
import os

# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline




# Importing and Cleaning Data

In [51]:
def read_data(data_file):
    """Load one header-less bus CSV and label its columns.

    Parameters
    ----------
    data_file : str or path-like
        CSV file to read. It is parsed with ``header=None``, so its first
        row is treated as data rather than as column names.

    Returns
    -------
    pandas.DataFrame
        The file contents with the fifteen column names listed below.
    """
    column_names = [
        "Timestamp", "LineID", "JourneyPatternID", "TimeFrame", "VehicleJourneyID",
        "Lon", "Lat", "VehicleID", "StopID", "AtStop",
        "HumanTime", "Day", "Hour", "JourneyGroup", "Runtime",
    ]

    frame = pd.read_csv(data_file, header=None, low_memory=False)
    frame.columns = column_names
    return frame


In [52]:
def read_weather(weather_file):
    """Read the weather CSV at ``weather_file`` into a DataFrame.

    The file's first row is used as the header (pandas default).
    """
    return pd.read_csv(weather_file, low_memory=False)


In [53]:
def clean_weather(weather, rain_threshold=1):
    """Reduce raw hourly weather records to a (Date, Hour, Rain) lookup table.

    The input frame is left untouched; a new frame is built and returned.

    Parameters
    ----------
    weather : pandas.DataFrame
        Must contain a ``date`` column that ``pd.to_datetime`` can parse and
        a ``rain`` column. Any other columns are ignored.
    rain_threshold : float, optional
        Smallest ``rain`` value that is flagged as rain. Defaults to 1,
        the cut-off this function has always used.

    Returns
    -------
    pandas.DataFrame
        Same index as ``weather`` with three columns:
        ``Date`` (``datetime.date`` objects), ``Hour`` (hour of day) and
        ``Rain`` (1 when ``rain >= rain_threshold``, otherwise 0).
    """
    timestamps = pd.to_datetime(weather['date'])

    # Empty or unparseable cells become NaN and are then counted as no rain.
    # Working on a fresh Series (instead of an inplace fillna on a column
    # selected from the frame) keeps this correct under pandas Copy-on-Write.
    rain_amount = pd.to_numeric(weather['rain'], errors='coerce').fillna(0)

    return pd.DataFrame({
        'Date': timestamps.dt.date,
        'Hour': timestamps.dt.hour,
        # Vectorised comparison replaces the row-by-row apply
        'Rain': (rain_amount >= rain_threshold).astype('int64'),
    })
    
    

In [54]:
def clean_df(df):
    """Add a ``Date`` column computed from the ``Timestamp`` column.

    ``Timestamp`` is interpreted as microseconds since the Unix epoch and
    reduced to its calendar date. The frame is modified in place and the
    same object is returned.
    """
    df['Date'] = pd.to_datetime(df['Timestamp'], unit='us').dt.date
    return df

In [55]:
def merge_save(df, weather, write_file):
    """Left-join ``weather`` onto ``df`` by date and hour and write the result as CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Bus records; must contain ``Date`` and ``Hour`` columns.
    weather : pandas.DataFrame
        Weather table keyed by ``Date`` and ``Hour``.
    write_file : str or path-like
        Destination passed to ``DataFrame.to_csv``.

    Raises
    ------
    ValueError
        If the joined frame does not have the same number of rows as ``df``.
        Nothing is written in that case.
    """
    merged = df.merge(weather, how='left', on=['Date', 'Hour'])

    # The join is expected to keep exactly one output row per bus record
    if len(merged) != len(df):
        raise ValueError("Imperfect mapping")

    merged.to_csv(write_file)

In [60]:
def main(read_directory, write_directory, weather_file):
    """Attach the rain flag to every bus CSV in a directory.

    Each ``*.csv`` file in ``read_directory`` is read, given a ``Date``
    column, merged with the cleaned weather table and written under the same
    file name into ``write_directory``.

    Parameters
    ----------
    read_directory : str
        Directory holding the per-line bus CSV files.
    write_directory : str
        Directory receiving the merged files; created if it does not exist.
    weather_file : str
        Path of the hourly weather CSV.
    """
    print("Cleaning weather data...")
    weather = clean_weather(read_weather(weather_file))

    # to_csv does not create missing directories, so make sure the target exists
    os.makedirs(write_directory, exist_ok=True)

    # os.listdir returns names in arbitrary order; sorting keeps the log reproducible
    for read_file in sorted(os.listdir(read_directory)):
        if not read_file.endswith(".csv"):
            continue

        data_file = os.path.join(read_directory, read_file)
        print(data_file)

        df = clean_df(read_data(data_file))

        try:
            merge_save(df, weather, os.path.join(write_directory, read_file))
        except ValueError as e:
            # Best-effort batch: report the failed file and carry on with the rest
            print(e)
        else:
            # Only claim success when the merged file was actually written
            print("Finished", read_file)
        print()
    print("Finished main!")

In [61]:
# Locations for the batch run: bus CSVs in, merged CSVs out, plus the weather source
read_directory = "bus_data/clean_data3"
write_directory = "bus_data/clean_data4"
weather_file = "bus_data/hourly_weather_data.csv"

main(
    read_directory=read_directory,
    write_directory=write_directory,
    weather_file=weather_file,
)

Cleaning weather data...
bus_data/clean_data3/1.csv
Finished 1.csv

bus_data/clean_data3/104.csv
Finished 104.csv

bus_data/clean_data3/11.csv
Finished 11.csv

bus_data/clean_data3/111.csv
Finished 111.csv

bus_data/clean_data3/114.csv
Finished 114.csv

bus_data/clean_data3/116.csv
Finished 116.csv

bus_data/clean_data3/118.csv
Finished 118.csv

bus_data/clean_data3/120.csv
Finished 120.csv

bus_data/clean_data3/123.csv
Finished 123.csv

bus_data/clean_data3/13.csv
Finished 13.csv

bus_data/clean_data3/130.csv
Finished 130.csv

bus_data/clean_data3/14.csv
Finished 14.csv

bus_data/clean_data3/140.csv
Finished 140.csv

bus_data/clean_data3/142.csv
Finished 142.csv

bus_data/clean_data3/145.csv
Finished 145.csv

bus_data/clean_data3/14C.csv
Finished 14C.csv

bus_data/clean_data3/15.csv
Finished 15.csv

bus_data/clean_data3/150.csv
Finished 150.csv

bus_data/clean_data3/151.csv
Finished 151.csv

bus_data/clean_data3/15A.csv
Finished 15A.csv

bus_data/clean_data3/15B.csv
Finished 15B.csv



In [115]:
# Line identifier used to build the path of the CSV that is read next
line = "15"

In [116]:
# Read the bus records for the selected line. The file is parsed with
# header=None (first row is data), so the column names are assigned by hand.
df = pd.read_csv("bus_data/clean_data3/" + line + ".csv", low_memory=False, header=None)
df.columns = ["Timestamp", "LineID", "JourneyPatternID", "TimeFrame", 
              "VehicleJourneyID", "Lon", "Lat", "VehicleID", "StopID", 
              "AtStop", "HumanTime", "Day", "Hour", "JourneyGroup", "Runtime"]

In [117]:
# Preview the first rows to check that the assigned column names line up with the values
df.head()

Unnamed: 0,Timestamp,LineID,JourneyPatternID,TimeFrame,VehicleJourneyID,Lon,Lat,VehicleID,StopID,AtStop,HumanTime,Day,Hour,JourneyGroup,Runtime
0,1352182204000000,15,1,2012-11-06,5899,-6.151132,53.402328,33498,6318,0,2012-11-06 06:10:04,1,6,2012-11-065899,0
1,1352192390000000,15,1,2012-11-06,5828,-6.150883,53.402351,33523,6318,1,2012-11-06 08:59:50,1,8,2012-11-065828,0
2,1352192858000000,15,1,2012-11-06,5835,-6.150987,53.402309,33254,6318,1,2012-11-06 09:07:38,1,9,2012-11-065835,0
3,1352193393000000,15,1,2012-11-06,5853,-6.150987,53.402309,33209,6318,1,2012-11-06 09:16:33,1,9,2012-11-065853,0
4,1352194244000000,15,1,2012-11-06,5878,-6.151132,53.402328,33020,6318,0,2012-11-06 09:30:44,1,9,2012-11-065878,0


In [118]:
# Read the hourly weather observations from CSV with pandas

weather = pd.read_csv("bus_data/hourly_weather_data.csv", low_memory=False)
# Show the last rows of the weather table
weather.tail()

Unnamed: 0,date,ind,rain,ind.1,temp,ind.2,wetb,dewpt,vappr,rhum,msl
86611,30-jun-2017 20:00,0,0.0,0,13.4,0,10.7,8.0,10.7,69,1016.5
86612,30-jun-2017 21:00,0,0.0,0,11.4,0,10.0,8.5,11.1,82,1017.2
86613,30-jun-2017 22:00,0,0.0,0,9.7,0,8.8,7.7,10.5,87,1017.6
86614,30-jun-2017 23:00,0,0.0,0,8.2,0,7.8,7.2,10.1,93,1017.9
86615,01-jul-2017 00:00,0,0.0,0,8.2,0,7.8,7.2,10.2,93,1018.2


### Weather Info

Station Name: Phoenix Park
Station Height: 48 m
Latitude: 53.358, Longitude: -6.342


date:  -  Date and Time (utc)
rain:  -  Precipitation Amount (mm)	  
temp:  -  Air Temperature (C)	
wetb:  -  Wet Bulb Temperature (C)
dewpt: -  Dew Point Temperature (C)
vappr: -  Vapour Pressure (hPa)		                 
rhum:  -  Relative Humidity (%) 
msl:   -  Mean Sea Level Pressure (hPa)
ind:   -  Indicator

In [119]:
# Parse the 'date' strings into a new 'datetime' column
weather['datetime'] = pd.to_datetime(weather['date'])

In [120]:
# Preview the first rows, now including the parsed 'datetime' column
weather.head()

Unnamed: 0,date,ind,rain,ind.1,temp,ind.2,wetb,dewpt,vappr,rhum,msl,datetime
0,31-may-2007 01:00,-1,,4,,4,,,,,,2007-05-31 01:00:00
1,31-may-2007 02:00,-1,,4,,4,,,,,,2007-05-31 02:00:00
2,31-may-2007 03:00,-1,,4,,4,,,,,,2007-05-31 03:00:00
3,31-may-2007 04:00,-1,,4,,4,,,,,,2007-05-31 04:00:00
4,31-may-2007 05:00,-1,,4,,4,,,,,,2007-05-31 05:00:00


In [121]:
# Split the parsed timestamp into the two keys used for merging with the bus
# data ('Hour' and 'Date'), then drop the helper column.
weather['Hour'] = weather['datetime'].dt.hour
weather['Date'] = weather['datetime'].dt.date
weather = weather.drop(['datetime'], axis=1)

In [122]:
# Keep only the merge keys and the rainfall column. The explicit copy makes
# `weather` an independent frame, so assigning to its columns afterwards does
# not raise pandas' SettingWithCopyWarning.

weather = weather[['Date', 'Hour', 'rain']].copy()

In [123]:
# Preview the reduced weather table (Date, Hour, rain)
weather.head()

Unnamed: 0,Date,Hour,rain
0,2007-05-31,1,
1,2007-05-31,2,
2,2007-05-31,3,
3,2007-05-31,4,
4,2007-05-31,5,


## Merging data & weather

In [124]:
# Calendar date of each record, taken from the microsecond epoch 'Timestamp'
df['Date'] = pd.to_datetime(df['Timestamp'], unit='us').dt.date

In [134]:
# Convert rain to float (empty or unparseable cells become NaN) and replace
# the missing values with 0. The filled Series is assigned back to the frame:
# an inplace fillna on a selected column is chained assignment and does not
# update the frame under pandas Copy-on-Write.
weather['rain'] = pd.to_numeric(weather['rain'], errors='coerce').fillna(0)

In [150]:
# Attach the rain value to each bus record; rows are matched on date and hour
new_df = df.merge(weather, how='left', on=['Date', 'Hour'])