In [6]:
import pandas as pd
import os

In [7]:
def pull_data(readfile, line_id):
    """Read one headerless CSV file and return only the rows for one bus line.

    Parameters
    ----------
    readfile : str
        Path of the CSV file to read. It is parsed with ``header=None`` and
        must contain exactly the 15 columns named below; otherwise assigning
        the column names raises.
    line_id : int or str
        Value to look for in the ``LineID`` column.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``LineID`` equals ``line_id`` or its string form
        ``str(line_id)``, with the 15 named columns.
    """
    df = pd.read_csv(readfile, low_memory=False, header=None)
    df.columns = ["Timestamp", "LineID", "Direction", "JourneyPatternID", "TimeFrame",
                  "VehicleJourneyID", "Operator", "Congestion", "LonWGS84", "LatWGS84",
                  "Delay", "BlockID", "VehicleID", "StopID", "AtStop"]

    # The dtype of LineID is inferred separately for every file: it has been
    # seen as float64 in some files and as object in others (presumably when a
    # file contains non-numeric line ids -- TODO confirm). Matching against
    # both the raw value and its string form covers either case.
    line_id_str = str(line_id)
    linedata = df[(df.LineID == line_id) | (df.LineID == line_id_str)]

    return linedata

In [8]:
def insert_into_file(df, writefile):
    """Append a dataframe to a CSV file, creating the file if it is missing.

    Rows are written without a header line and without the index, so
    repeated calls build up one continuous headerless CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to append.
    writefile : str
        Path of the CSV file to append to.

    Raises
    ------
    OSError
        If the file cannot be opened or written (``IOError`` is an alias of
        ``OSError`` in Python 3). Existing content of the file is left alone.
    """
    # Append mode already creates a missing file, so no fallback is needed.
    # Retrying in 'w+' mode after a failure would truncate rows appended by
    # earlier calls. Passing the path lets pandas open the file itself with
    # the newline handling it expects.
    df.to_csv(writefile, mode='a', header=False, index=False)

In [9]:
def main(directory, line_id, writefile):
    """Extract the rows for one line from every CSV file in a directory.

    Each file in ``directory`` whose name ends with ``.csv`` is read with
    ``pull_data`` and the matching rows are appended to ``writefile`` with
    ``insert_into_file``. Progress is printed for every file.

    Parameters
    ----------
    directory : str
        Directory to scan; sub-directories are not searched.
    line_id : int or str
        Line identifier passed through to ``pull_data``.
    writefile : str
        Path of the CSV file the matching rows are appended to.

    Notes
    -----
    Rows are appended, so calling this twice with the same arguments
    duplicates the rows in ``writefile``. Any ``.csv`` file in ``directory``
    is treated as input, so ``writefile`` should live elsewhere.
    """
    # os.listdir returns names in arbitrary order; sorting makes the order of
    # the rows in the output file reproducible from run to run.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".csv"):
            continue
        readfile = os.path.join(directory, filename)
        print("Reading", readfile)
        df = pull_data(readfile, line_id)
        print("Writing to", writefile, "...")
        insert_into_file(df, writefile)
        print()
    print("Finished!")

In [10]:
# Extract the rows for line 46 from every CSV file in bus_data/Dcc.
# NOTE: the output file is opened in append mode, so re-running this cell
# adds the same rows to line46.csv again.
main("bus_data/Dcc", 46, "bus_data/cleaned_data/line46.csv")

Reading bus_data/Dcc/siri.20121106.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Dcc/siri.20121107.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Dcc/siri.20121108.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Dcc/siri.20121109.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Dcc/siri.20121110.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Dcc/siri.20121111.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Dcc/siri.20121112.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Dcc/siri.20121113.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Dcc/siri.20121114.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Dcc/siri.20121115.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Dcc/siri.20121116.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Dcc/siri.201211

In [11]:
# Extract the rows for line 46 from every CSV file in bus_data/Sir and append
# them to the same line46.csv written by the previous cell.
# NOTE: the output file is opened in append mode, so re-running this cell
# adds the same rows again.
main("bus_data/Sir", 46, "bus_data/cleaned_data/line46.csv")

Reading bus_data/Sir/siri.20130101.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Sir/siri.20130102.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Sir/siri.20130103.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Sir/siri.20130104.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Sir/siri.20130105.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Sir/siri.20130106.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Sir/siri.20130107.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Sir/siri.20130108.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Sir/siri.20130109.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Sir/siri.20130110.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Sir/siri.20130111.csv
Writing to bus_data/cleaned_data/line46.csv ...

Reading bus_data/Sir/siri.201301