In [1]:
import pandas as pd
import os
from pandas import DataFrame
from metadata import BICYCLEMETADATA

In [2]:
def DropColumns(dataFrame: DataFrame, renameColumns: dict, columnsRetain: list = ['day','Total']) -> DataFrame:
    """Keep only the requested columns and rename them.

    Args:
        dataFrame: Frame to select columns from; it is not modified.
        renameColumns: Mapping of existing column name -> new column name,
            passed to ``DataFrame.rename(columns=...)``.
        columnsRetain: Column labels to keep, in the order they should appear.

    Returns:
        A new frame holding only ``columnsRetain``, with ``renameColumns`` applied.
    """
    # Select first, then rename: the keys of renameColumns refer to the
    # original column names.
    return dataFrame[columnsRetain].rename(columns=renameColumns)


In [3]:
def ConvertDaytoDateTime(dataFrame: DataFrame) -> DataFrame:
    """Parse the ``day`` column as datetimes and sort the rows chronologically.

    Args:
        dataFrame: Frame with a ``day`` column that ``pd.to_datetime`` can parse.

    Returns:
        A new frame whose ``day`` column has a datetime dtype, sorted by ``day``
        in ascending order. The original row index labels are preserved.
    """
    # Use assign() so the converted column lands on a copy: the caller's frame
    # keeps its original 'day' values instead of being overwritten in place.
    converted = dataFrame.assign(day=pd.to_datetime(dataFrame['day']))
    return converted.sort_values(by='day')


In [4]:
def Get1HrIntervals(dataFrame: DataFrame, columnName: str) -> DataFrame:
    """Aggregate the frame into one-hour bins by summing each column.

    Args:
        dataFrame: Frame containing a datetime-like column named ``columnName``.
        columnName: Name of the datetime column to resample on.

    Returns:
        A frame with one row per hour between the first and last timestamp;
        ``columnName`` is restored as a regular column and the other columns
        hold the per-hour sums.
    """
    # Lowercase 'h' is the hourly alias; the uppercase 'H' spelling is
    # deprecated in recent pandas releases and emits a FutureWarning.
    hourly = dataFrame.resample('1h', on=columnName).sum()
    return hourly.reset_index()
    

In [5]:
def ConcatDataFrames(dataFrames: list) -> DataFrame:
    """Join frames side by side and drop repeated column names.

    Args:
        dataFrames: Frames to concatenate column-wise (``axis=1``).

    Returns:
        The concatenated frame in which, for every column label that occurs
        more than once, only the first occurrence is kept.
    """
    combined = pd.concat(dataFrames, axis=1)
    # duplicated() flags the second and later occurrences of a label;
    # inverting the mask keeps the first occurrence of each one.
    return combined.loc[:, ~combined.columns.duplicated()]
    

In [6]:
def LoadDataSet(folderPath: str, metaData: list) -> DataFrame:
    """Load the CSV files described by ``metaData`` and merge them into one frame.

    Each file is read, its ``day`` column is parsed and sorted, the rows are
    summed into one-hour bins, the retained columns are renamed, and a
    ``Zipcode`` column is added. The per-file frames are then outer-merged on
    ``['day', 'Zipcode']``.

    Args:
        folderPath: Directory containing the CSV files. A trailing path
            separator is accepted but not required.
        metaData: List of dicts, each providing the keys ``'filename'``,
            ``'renameColumns'`` and ``'Zipcode'``.

    Returns:
        The merged frame with ``day`` and ``Zipcode`` as the leading columns,
        or ``None`` when ``metaData`` is empty.

    Raises:
        NotADirectoryError: If ``folderPath`` is not an existing directory.
    """
    if not os.path.isdir(folderPath):
        raise NotADirectoryError(
            f"enter a valid folderPath: {folderPath!r} is not a directory"
        )

    finalDataFrame = None
    for data in metaData:
        print("Reading DataSet from", data['filename'])
        # os.path.join copes with folderPath given with or without a trailing slash.
        csvPath = os.path.join(folderPath, data['filename'])
        dataFrame = pd.read_csv(csvPath, index_col=None, header=0)
        dataFrame = ConvertDaytoDateTime(dataFrame)
        dataFrame = Get1HrIntervals(dataFrame, 'day')
        dataFrame = DropColumns(dataFrame, renameColumns=data['renameColumns'])
        dataFrame['Zipcode'] = data['Zipcode']

        # Zipcode was appended last; move it right after the first column so
        # the merge keys sit together at the front of the frame.
        DFColumns = list(dataFrame.columns)
        columnsRearrange = [DFColumns[0], DFColumns[-1]] + DFColumns[1:-1]
        dataFrame = dataFrame[columnsRearrange]

        if finalDataFrame is None:
            finalDataFrame = dataFrame
        else:
            # Outer merge keeps hours that are present in only some of the files;
            # the missing measurements show up as NaN.
            finalDataFrame = pd.merge(
                finalDataFrame, dataFrame, on=['day', 'Zipcode'], how='outer'
            )

    return finalDataFrame

In [7]:
# Folder holding the bicycle CSV files that are listed in BICYCLEMETADATA.
folderPath = "Dataset/Bicycle Dataset/"

In [8]:
# Read every file described in BICYCLEMETADATA and merge them into one frame.
dataFrame = LoadDataSet(folderPath, BICYCLEMETADATA)
# Display the column labels of the merged frame.
dataFrame.columns

Reading DataSet from Colorado_and_30th_Northbound.csv
Reading DataSet from Colorado_and_30th_Southbound.csv
Reading DataSet from Colorado_and_30th_Westbound.csv


Index(['day', 'Zipcode', 'NorthBound', 'SouthBound', 'WestBound'], dtype='object')

In [9]:
# Drop every row that has a missing value in any column (the outer merge in
# LoadDataSet leaves NaN where a file has no row for a given day/Zipcode key).
dataFrame = dataFrame.dropna()

In [10]:
# Preview the first rows after dropping incomplete rows.
dataFrame.head()

Unnamed: 0,day,Zipcode,NorthBound,SouthBound,WestBound
0,2017-08-01 00:00:00,80203,1,2.0,2.0
1,2017-08-01 01:00:00,80203,0,1.0,0.0
2,2017-08-01 02:00:00,80203,1,0.0,0.0
3,2017-08-01 03:00:00,80203,0,0.0,0.0
4,2017-08-01 04:00:00,80203,0,0.0,0.0


In [11]:
dataFrame.shape
# Recorded output is (55032, 5). 55032 = 2293 * 24, so this is presumably
# 2293 days of hourly rows by 5 columns -- TODO confirm.

(55032, 5)