In [34]:
import pandas as pd
import os
import time

import matplotlib.pyplot as plt
%matplotlib inline

In [35]:
def read_schedule(schedule_file):
    schedule = pd.read_csv(schedule_file, low_memory=False, encoding="ISO-8859-1")
    
    return schedule


In [36]:
def clean_schedule(schedule):
    
    # drop null values
    schedule.dropna(0, inplace=True)
    
    # extract StopID
    schedule["StopID"] = schedule['stop_id'].apply(lambda x: x[-4:])
    
    # Convert StopIDs back to int
    schedule['StopID'] = schedule['StopID'].astype('int64')
    
    # Extracting LineID from trip_id
    schedule["LineID"] = schedule["trip_id"].str.extract('\-(.*?)\-')
    
    # Extracting direction from trip_id
    schedule["Direction"] = schedule['trip_id'].apply(lambda x: x[-1:])
    
    # Dropping non-major journeypatterns
    schedule = schedule[(schedule.Direction == "I") | (schedule.Direction == "O")]
    
    # Converting Direction to JourneyPatternID
    schedule["JourneyPatternID"] = schedule['Direction'].apply(lambda x: "1001" if x == "I" else "0001")
    schedule['JourneyPatternID'] = schedule['JourneyPatternID'].astype('int64')
    
#     # dropping irrelevant columns
#     for column in ['trip_id', 'stop_id', 'arrival_time', 'departure_time', 
#                    'shape_dist_traveled', 'Direction', 'stop_sequence', 'stop_headsign']:
#             schedule = schedule.drop(column, 1)
            
    # Renaming 'Long' to 'Lon'
    schedule = schedule.rename(columns={'Long': 'Lon'})
    
    return schedule

In [37]:
# MIGHT HAVE TO CHANGE COLUMN NAMES - NOT SURE IF I HAVE THE CORRECT AMOUNT

def read_data(filename):

    df = pd.read_csv(filename, low_memory=False, header=None)
    df.columns = ["Timestamp", "LineID", "Direction", "JourneyPatternID", "TimeFrame",
                  "VehicleJourneyID", "Operator", "Congestion", "Lon", "Lat",
                  "Delay", "BlockID", "VehicleID", "StopID", "AtStop"]
    
    
    
    return df

In [38]:
# ADDING HOUR, DAY, HUMANTIME COLUMNS
# DROPPING IRRELEVANT COLUMNS
# CAN REMAIN UNCHANGED - MIGHT WANT TO MAKE HOUR MORE GRANULAR? 

def add_features(df):

    # Add column for human readable time
    df['HumanTime'] = pd.to_datetime(df['Timestamp'], unit='us')


    # Add day of week column
    df['Day'] = df['HumanTime'].dt.dayofweek

    # Add hour of day column
    df['Hour'] = df['HumanTime'].dt.hour


    # Dropping irrelevant columns
    for column in ['BlockID', 'Direction', 'Operator', 'Delay', 'Congestion']:
        df = df.drop(column, 1)
    
    return df

In [39]:
def add_journeygroup(df):
    # For testing only - filter_direction is causing errors
    # So we need to test
    # converting data, adding compound feature
    for column in ['TimeFrame', 'VehicleJourneyID',]:
            df[column] = df[column].astype('str')

    df["JourneyGroup"] = df["TimeFrame"] + df["VehicleJourneyID"]
    
    return df
    

In [40]:
# REMOVING STOPS THAT AREN'T IN THE SCHEDULE. 

def schedule_validate(df, schedule):
    schedule_stops = schedule.StopID.unique().tolist()
    
    df = df[df.StopID.isin(schedule_stops)]
    
    return df


In [41]:
def filter_direction2(df, schedule):
    
    # casting StopID to int, removing null values:
    try:
        df['StopID'] = df['StopID'].astype('int64')
    except:
        df = df[df.StopID != 'null']
        df['StopID'] = df['StopID'].astype('int64')
    
    # getting line variable
    line = df.LineID.iloc[0]
    line = str(line)

    # converting data, adding compound feature
    for column in ['TimeFrame', 'VehicleJourneyID',]:
            df[column] = df[column].astype('str')

    # creating compound feature, getting patterns, creating temp df
    df["JourneyGroup"] = df["TimeFrame"] + df["VehicleJourneyID"]
    patterns = df.JourneyPatternID.unique()
    tempdf = pd.DataFrame()

    for pattern in patterns:
        
        # Getting first 5 stops for all variations in schedule 
        pattern_sched = schedule[(schedule.LineID == line) & (schedule.JourneyPatternID == pattern)]
        starting_stops = set()
        headsigns = pattern_sched.stop_headsign.unique()
        for sign in headsigns:
            headsign_sched = pattern_sched[pattern_sched.stop_headsign == sign]
            first_5 = set(headsign_sched.head(5).StopID.tolist())
            starting_stops = first_5.union(starting_stops)
            
        starting_stops = list(starting_stops)
                
        # Getting first stops of all journeys in our data
        patterndf = df[df.JourneyPatternID == pattern]
        firstlines = patterndf.groupby(["TimeFrame", "VehicleJourneyID"]).head(1)
        
        # removing stops from our data that don't appear in schedule 
        patterndf = schedule_validate(patterndf, pattern_sched)

        # Getting all journeys that start at in the first 5
        valid_journeys = []
        for index, row in firstlines.iterrows():
            if row.StopID in starting_stops:
                valid_journeys.append(row.JourneyGroup)

        # removing journeys that don't start at the right stop
        patterndf = patterndf[patterndf.JourneyGroup.isin(valid_journeys)]

        if tempdf.empty:
            tempdf = patterndf
        else: 
            tempdf = pd.concat([tempdf, patterndf], axis=0)

    return tempdf

In [42]:
# DROPPING ROWS WHERE BUS ISN'T AT STOP 
# THIS SECTION NEEDS TO BE CHANGED, INCLUDE ONLY ROWS WHERE STOPID CHANGES

def drop_rows(df):

    # drop duplicate rows
    df = df.drop_duplicates(["TimeFrame", "VehicleJourneyID", "StopID"])

    # mean = df.JourneyGroup.value_counts().mean()
    # drop trips with less than 5 stops
    df = df[df.groupby('JourneyGroup').JourneyGroup.transform(len) > 5]
    
    return df

In [43]:
def runtime_function(row, mydict):
    # Takes a row and a dictionary of start times
    # returns time elapsed (seconds) between that row's timestamp and the start of the line
#     start = mydict[row.TimeFrame, row.VehicleID, row.VehicleJourneyID]["time"]
    start = mydict[row.TimeFrame, row.VehicleJourneyID]["time"]
    
    current = row.Timestamp
    
    if current - start < 0:
        print(row.StopID, "negative value")
    return (current - start) // 1000000

In [44]:
# ADDING RUNTIME COLUMN - THIS CAN REMAIN UNCHANGED

def add_runtime(df):
    # Putting the first sightings of a vehiclejourneyid and timeframe combo timestamp into a dictionary
    start_times = {}
    
#     df = df.sort_values(['Timestamp'])

    df = df.sort_values(['Timestamp'])

    # This gives you the first line anything has been seen by
#     firstlines = df.groupby(["TimeFrame", "VehicleID", "VehicleJourneyID"]).head(1)
    firstlines = df.groupby(["TimeFrame", "VehicleJourneyID"]).head(1)


    # This iterates through them and assigns values to the dictionary
    for index, row in firstlines.iterrows():
        start_times[row.TimeFrame, row.VehicleJourneyID] = {"time":row.Timestamp, "loc":[row.Lat, row.Lon]}

#         start_times[row.TimeFrame, row.VehicleID, row.VehicleJourneyID] = {"time":row.Timestamp, "loc":[row.Lat, row.Lon]}


    df['Runtime'] = ""

    # Applies this function to the newdf
    df['Runtime'] = df.apply(lambda row: runtime_function(row, start_times),axis=1)
    
    return df

In [45]:
def insert_into_file(df, writefile):
    """  This function writes a dataframe (df) to a file (writefile),
        or does nothing if the file doesn't exist
        
        CHANGED FROM OTHER
    """
    try:
        with open(writefile, 'a') as f:
#             df.to_csv(f, header=False, index=False)
            print(writefile, "exists")
            pass
    except IOError:
        with open(writefile, 'w+') as f:
            df.to_csv(f, header=False, index=False)


In [46]:
def main(read_directory, schedule_file, write_directory):
    
    schedule = read_schedule(schedule_file)
    schedule = clean_schedule(schedule)
    
    for read_file in os.listdir(read_directory):
        if read_file.endswith(".csv"):
            if os.path.isfile(write_directory + "/" + read_file):
                print(read_file, "exists")
            else:
                print("Reading", read_file, "from", read_directory)
                try:
                    df = read_data(read_directory + "/" + read_file)
                    df = add_features(df)
                    df = filter_direction2(df, schedule)
                    df = add_journeygroup(df)
                    df = drop_rows(df)
                    df = add_runtime(df)
        


                    with open(write_directory + "/" + read_file, 'w+') as f:
                        df.to_csv(f, header=False, index=False)
                except (ValueError, IndexError) as error:
                    print(error, "!")
                    print("Couldn't finish", read_file)
                    

                print("Finished", read_file)
                print()
    print("Finished main!")

In [47]:
# MAIN SECTION

read_directory = "bus_data/line_data2"
write_directory = "bus_data/clean_data5"


schedule_file = 'bus_data/dublinbus_scheduledData2013csv.csv'

main(read_directory, schedule_file, write_directory)



Reading 1.csv from bus_data/line_data2
Finished 1.csv

Reading 102.csv from bus_data/line_data2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Finished 102.csv

Reading 104.csv from bus_data/line_data2
Finished 104.csv

Reading 11.csv from bus_data/line_data2
Finished 11.csv

Reading 111.csv from bus_data/line_data2
Finished 111.csv

Reading 114.csv from bus_data/line_data2
Finished 114.csv

Reading 116.csv from bus_data/line_data2
Finished 116.csv

Reading 118.csv from bus_data/line_data2
Finished 118.csv

Reading 120.csv from bus_data/line_data2
Finished 120.csv

Reading 122.csv from bus_data/line_data2
Finished 122.csv

Reading 123.csv from bus_data/line_data2
Finished 123.csv

Reading 13.csv from bus_data/line_data2
Finished 13.csv

Reading 130.csv from bus_data/line_data2
Finished 130.csv

Reading 14.csv from bus_data/line_data2
Finished 14.csv

Reading 140.csv from bus_data/line_data2
Finished 140.csv

Reading 142.csv from bus_data/line_data2
Finished 142.csv

Reading 145.csv from bus_data/line_data2
Finished 145.csv

Reading 14C.csv from bus_data/line_data2
Finished 14C.csv

Reading 15.csv from bus_data/line_data2
Fini

In [21]:
line = "102"

readfile = "bus_data/line_data2/" + line + ".csv"


schedule_file = 'bus_data/dublinbus_scheduledData2013csv.csv'
schedule = read_schedule(schedule_file)
schedule = clean_schedule(schedule)
    
    
df = read_data(readfile)
df = add_features(df)
df = filter_direction2(df, schedule)
df = add_journeygroup(df)
df = drop_rows(df)

# df.to_csv('bus_data/clean_data_rough/' + line + '.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [30]:
newdf = add_runtime(df)

In [31]:
# newdf.to_csv('bus_data/clean_data_rough/' + line + '.csv')

In [32]:
newdf[newdf.Runtime < 0].StopID.value_counts()

Series([], Name: StopID, dtype: int64)

In [None]:
df.head()

In [None]:
df = df[df.JourneyPatternID == 1]

In [None]:
df.StopID.unique().shape

In [None]:
out = df[df.JourneyPatternID == 1]

out = out.groupby(['StopID'])['Runtime'].mean().reset_index()

# out
# out.sort_values(['Runtime'])


In [None]:
journeys = df.JourneyGroup.unique()
journeys = journeys[:100]

In [None]:
negs = df[(df.Runtime < 0)]
negs.head(100)

In [None]:
# for index in journeys:
#     for i in range(1000):
#         first = df[df.JourneyGroup == index].head(1)
#     #     if first.Runtime < 0:
#     #         print(first[['HumanTime', 'Runtime']])
#         print(first[['JourneyGroup', 'Runtime']])

In [None]:
journey = df[df.JourneyGroup == "2012-11-065815"]
times = journey[['HumanTime', 'Runtime']]

# middle = journey[(journey.Runtime < 10000) & (times.Runtime > -1000)]

# middle

journey.tail()


In [None]:
# 6282

In [None]:
times.plot()

In [None]:
df.JourneyGroup.value_counts().plot()

In [None]:
locations = newdf[["Lon", "Lat"]]
locations.plot.scatter(x="Lon", y="Lat")

In [None]:
newdf.JourneyGroup.unique()

In [None]:
xxxx = df[df.JourneyGroup == "2013-01-085057"]
xxxx.shape

In [None]:
firstlines = patterndf.groupby(["TimeFrame", "VehicleJourneyID"]).head(1)



In [None]:
# ADDING RUNTIME COLUMN - THIS CAN REMAIN UNCHANGED

# # Putting the first sightings of a vehiclejourneyid and timeframe combo timestamp into a dictionary
# start_times = {}

# # This gives you the first line anything has been seen by
# firstlines = df.groupby(["TimeFrame", "VehicleJourneyID"]).head(1)

# # This iterates through them and assigns values to the dictionary
# for index, row in firstlines.iterrows():
#     start_times[row.TimeFrame, row.VehicleJourneyID] = {"time":row.Timestamp, "loc":[row.Lat, row.Lon]}


# df['Runtime'] = ""

# # Applies this function to the newdf
# df['Runtime'] = df.apply(lambda row: add_runtime(row, start_times),axis=1)