In [3]:
# An exercise for the merge operation
import os

import pandas as pd

# Schedule table, written one route per row:
# (bus number, number of stops, destination).
bus_schedule = pd.DataFrame(
    [
        ("Bx28", 20, "Bedford Pk Blvd"),
        ("M70", 17, "Time Square"),
        ("Q14", 22, "Corona Park"),
        ("B8", 21, "Prospect Park"),
        ("S79", 25, "Staten Island Ferry"),
    ],
    columns=["Bus Number", "Number of stops", "Destination"],
)

bus_schedule

Unnamed: 0,Bus Number,Number of stops,Destination
0,Bx28,20,Bedford Pk Blvd
1,M70,17,Time Square
2,Q14,22,Corona Park
3,B8,21,Prospect Park
4,S79,25,Staten Island Ferry


In [5]:
# Route metadata, written one route per row:
# (bus, borough it serves, number of drivers assigned).
bus_info = pd.DataFrame(
    [
        ("Bx28", "Bronx", 5),
        ("Bx29", "Bronx", 5),
        ("Bx30", "Bronx", 5),
        ("M70", "Manhattan", 5),
        ("M71", "Manhattan", 6),
        ("M72", "Manhattan", 6),
        ("Q14", "Queens", 7),
        ("B8", "Brooklyn", 5),
        ("S79", "Staten Island", 6),
    ],
    columns=["Bus", "Borough", "Number of Drivers"],
)
bus_info

Unnamed: 0,Bus,Borough,Number of Drivers
0,Bx28,Bronx,5
1,Bx29,Bronx,5
2,Bx30,Bronx,5
3,M70,Manhattan,5
4,M71,Manhattan,6
5,M72,Manhattan,6
6,Q14,Queens,7
7,B8,Brooklyn,5
8,S79,Staten Island,6


In [7]:
# Join the two frames on the route identifier. The key column is named
# differently on each side, so both names are spelled out. With the default
# join type only routes present in both frames are kept.
bus_schedule.merge(bus_info, left_on="Bus Number", right_on="Bus")

Unnamed: 0,Bus Number,Number of stops,Destination,Bus,Borough,Number of Drivers
0,Bx28,20,Bedford Pk Blvd,Bx28,Bronx,5
1,M70,17,Time Square,M70,Manhattan,5
2,Q14,22,Corona Park,Q14,Queens,7
3,B8,21,Prospect Park,B8,Brooklyn,5
4,S79,25,Staten Island Ferry,S79,Staten Island,6


In [8]:
# What if we also want the records that have no match in the other frame?
# how="outer" keeps every row from both data frames. Here each scheduled bus
# also appears in bus_info, so the extra rows all come from the second frame;
# their schedule columns are filled with NaN.
bus_schedule.merge(
    bus_info,
    how="outer",
    left_on="Bus Number",
    right_on="Bus",
)

Unnamed: 0,Bus Number,Number of stops,Destination,Bus,Borough,Number of Drivers
0,Bx28,20.0,Bedford Pk Blvd,Bx28,Bronx,5
1,M70,17.0,Time Square,M70,Manhattan,5
2,Q14,22.0,Corona Park,Q14,Queens,7
3,B8,21.0,Prospect Park,B8,Brooklyn,5
4,S79,25.0,Staten Island Ferry,S79,Staten Island,6
5,,,,Bx29,Bronx,5
6,,,,Bx30,Bronx,5
7,,,,M71,Manhattan,6
8,,,,M72,Manhattan,6


In [9]:
# Save the merged data (outer join of schedule and route info) into a csv file.
bus_all = pd.merge(bus_schedule, bus_info, left_on="Bus Number", right_on="Bus",
                   how="outer")
# to_csv does not create missing folders, so make sure "data/" exists first;
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs("data", exist_ok=True)
bus_all.to_csv("data/bus_info.csv")

In [12]:
# Read the CSV back in. The first column of the file holds the row index
# that was written out when saving, so use it as the index rather than
# as a data column.
csv_path = "data/bus_info.csv"
df = pd.read_csv(csv_path, index_col=0)
df

Unnamed: 0,Bus Number,Number of stops,Destination,Bus,Borough,Number of Drivers
0,Bx28,20.0,Bedford Pk Blvd,Bx28,Bronx,5
1,M70,17.0,Time Square,M70,Manhattan,5
2,Q14,22.0,Corona Park,Q14,Queens,7
3,B8,21.0,Prospect Park,B8,Brooklyn,5
4,S79,25.0,Staten Island Ferry,S79,Staten Island,6
5,,,,Bx29,Bronx,5
6,,,,Bx30,Bronx,5
7,,,,M71,Manhattan,6
8,,,,M72,Manhattan,6


In [13]:
# Round-trip the frame through pandas' pickle format: write it to disk,
# then read it straight back into a second frame.
# (Only unpickle files you created yourself; pickles can execute code on load.)
pickle_path = "data/bus.pickle"
df.to_pickle(pickle_path)
df2 = pd.read_pickle(pickle_path)
df2

Unnamed: 0,Bus Number,Number of stops,Destination,Bus,Borough,Number of Drivers
0,Bx28,20.0,Bedford Pk Blvd,Bx28,Bronx,5
1,M70,17.0,Time Square,M70,Manhattan,5
2,Q14,22.0,Corona Park,Q14,Queens,7
3,B8,21.0,Prospect Park,B8,Brooklyn,5
4,S79,25.0,Staten Island Ferry,S79,Staten Island,6
5,,,,Bx29,Bronx,5
6,,,,Bx30,Bronx,5
7,,,,M71,Manhattan,6
8,,,,M72,Manhattan,6


In [20]:
# Groupby follows split -> apply -> combine:
# split the rows by borough, sum the drivers within each group,
# and collect the per-borough totals into a one-column frame.
drivers_by_borough = df.groupby("Borough")["Number of Drivers"].sum()
result1 = drivers_by_borough.to_frame("Drivers")
result1

Unnamed: 0_level_0,Drivers
Borough,Unnamed: 1_level_1
Bronx,15
Brooklyn,5
Manhattan,17
Queens,7
Staten Island,6


In [21]:
# Average number of stops per borough. mean() skips missing values, so
# routes whose stop count is NaN (those without a schedule entry) do not
# contribute to their borough's average.
stops_by_borough = df.groupby("Borough")["Number of stops"].mean()
result2 = stops_by_borough.to_frame("Average Number of Stops")
result2

Unnamed: 0_level_0,Average Number of Stops
Borough,Unnamed: 1_level_1
Bronx,20.0
Brooklyn,21.0
Manhattan,17.0
Queens,22.0
Staten Island,25.0


In [22]:
# Both summaries are indexed by Borough, so join them on the index
# instead of on a column.
result1.merge(result2, left_index=True, right_index=True)

Unnamed: 0_level_0,Drivers,Average Number of Stops
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1
Bronx,15,20.0
Brooklyn,5,21.0
Manhattan,17,17.0
Queens,7,22.0
Staten Island,6,25.0


In [27]:
# I have created a txt file called "timetable.txt" in the data folder.
# Name               Check-in
# Alice              05/09/2023,10:20
# Bob                05/09/2023,11:00
# Clare              05/09/2023,11:15

# Let's load it as a data frame. The columns are separated by runs of
# whitespace, so the separator is a regular expression. It is written as a
# raw string because "\s" in a normal string literal is an invalid escape
# sequence and triggers a warning on newer Python versions.
data = pd.read_csv('data/timetable.txt', sep=r"\s+")
data

Unnamed: 0,Name,Check-in
0,Alice,"05/09/2023,10:20"
1,Bob,"05/09/2023,11:00"
2,Clare,"05/09/2023,11:15"


In [28]:
# Inspect the column types: Check-in was read as plain strings (object),
# not as datetimes.
data.dtypes

Name        object
Check-in    object
dtype: object

In [37]:
# Turn the strings in the Check-in column into datetime values, stored in a
# new lower-case 'check-in' column (the original string column is kept).
from dateutil.parser import parse  # not used in this cell; kept in case other cells rely on it

# pd.to_datetime parses the whole column at once and needs no `datetime`
# import. The explicit format reads each value as month/day/year, a comma,
# then 24-hour hour:minute, e.g. '05/09/2023,10:20' -> 2023-05-09 10:20:00.
data['check-in'] = pd.to_datetime(data['Check-in'], format='%m/%d/%Y,%H:%M')
data

Unnamed: 0,Name,Check-in,check-in
0,Alice,"05/09/2023,10:20",2023-05-09 10:20:00
1,Bob,"05/09/2023,11:00",2023-05-09 11:00:00
2,Clare,"05/09/2023,11:15",2023-05-09 11:15:00


In [40]:
# Order the rows chronologically by the parsed check-in time.
# sort_values returns a new sorted frame; `data` itself is not modified.
data.sort_values(by='check-in')

Unnamed: 0,Name,Check-in,check-in
0,Alice,"05/09/2023,10:20",2023-05-09 10:20:00
1,Bob,"05/09/2023,11:00",2023-05-09 11:00:00
2,Clare,"05/09/2023,11:15",2023-05-09 11:15:00
