# Statistica Analysis

1) Determine the average time between the two departures of each of the lines. What are the lines with the most common departures? What are the lines with the least common departures?

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
from pandas import Timedelta

cleaned_data_path = "cleaned_data"

### Reading the data

In [2]:
stopTimes = pd.read_csv(f"{cleaned_data_path}/stop_times.csv", parse_dates=['arrival_time'])
routes = pd.read_csv(f"{cleaned_data_path}/routes.csv")
stops = pd.read_csv(f"{cleaned_data_path}/stops.csv")
trips = pd.read_csv(f"{cleaned_data_path}/trips.csv")
stopsRoutes = pd.read_csv(f"{cleaned_data_path}/stops_routes.csv")

In [3]:
stopTimes.head(2)

Unnamed: 0,trip_id,arrival_time,stop_id,stop_sequence
0,665800,2023-08-28 20:00:00,638,1
1,665800,2023-08-28 20:01:00,148,2


In [4]:
trips.head(2)

Unnamed: 0,route_id,service_id,trip_id,direction_id
0,2,Zimski-Subota,1421378,0
1,2,Zimski-Subota,1421379,0


In [5]:
trips.service_id.value_counts()

Zimski-Radni Dan    26366
Zimski-Subota       20288
Zimski-Nedelja      18373
Name: service_id, dtype: int64

Comment: Only winter schedule is included. Freqiency will be calculated first by route AND direction, and then only by route (day of the week (service_id) won't be considered)

In [6]:
# example: rows that are duplicates after we remove trip_id. should we keep them?
stopTimes[stopTimes.stop_sequence==1].merge(trips[['route_id', 'trip_id', 'direction_id']])[stopTimes[stopTimes.stop_sequence==1].merge(trips[['route_id', 'trip_id', 'direction_id']]).route_id==553].sort_values(['direction_id', 'arrival_time']).head(5)

Unnamed: 0,trip_id,arrival_time,stop_id,stop_sequence,route_id,direction_id
55351,1296049,2023-08-28 04:25:00,3117,1,553,0
50311,1197965,2023-08-28 05:20:00,3117,1,553,0
50332,1197987,2023-08-28 05:25:00,3117,1,553,0
50352,1198009,2023-08-28 05:25:00,3117,1,553,0
55353,1296051,2023-08-28 06:10:00,3117,1,553,0


In [7]:
def format_timedelta(td):
    hours, remainder = divmod(td.total_seconds(), 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02}"

In [8]:
# average time between departures for each line and its direction
avg_freq_by_route_and_direction = (
    stopTimes[stopTimes.stop_sequence==1][['trip_id', 'arrival_time']]
    .merge(trips[['route_id', 'trip_id', 'direction_id']])
    .merge(routes[['route_id', 'route_short_name']])
    .drop(['trip_id'], axis=1)
     #.drop_duplicates() # should this be done? This changes result
    .sort_values(['route_id', 'direction_id', 'arrival_time'])
    .groupby(['route_id', 'route_short_name', 'direction_id'])
    .apply(lambda group: group.assign(difference=group['arrival_time'].diff())) # calculate difference between sequential departures within the same route and direction
    .groupby(['route_id', 'route_short_name', 'direction_id'])
    .agg(
        average=('difference', 'mean') # calculate the average time by route AND direction
    )
    .reset_index()
    .query('not @pd.isna(average)')
    .query('average > @pd.Timedelta(seconds=0)')
    .sort_values(['average'])
    .reset_index(drop=True)
    )
print("The most common routes and directions: ")
avg_freq_by_route_and_direction.assign(average=lambda df: df['average'].apply(format_timedelta)).reset_index(drop=True).head(5)

The most common routes and directions: 


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda group: group.assign(difference=group['arrival_time'].diff())) # calculate difference between sequential departures within the same route and direction


Unnamed: 0,route_id,route_short_name,direction_id,average
0,29,29,1,00:01:46
1,29,29,0,00:01:47
2,41,41,0,00:02:03
3,41,41,1,00:02:04
4,26,26,1,00:02:15


In [9]:
print("The least common routes and directions: ")
avg_freq_by_route_and_direction.assign(average=lambda df: df['average'].apply(format_timedelta)).reset_index(drop=True).tail(10)[::-1]

The least common routes and directions: 


Unnamed: 0,route_id,route_short_name,direction_id,average
348,10035,35L,1,01:13:20
347,10035,35L,0,01:13:20
346,60,60,0,01:11:06
345,60,60,1,01:02:43
344,107,107,0,01:01:06
343,107,107,1,01:00:15
342,10056,56L,1,00:52:30
341,10056,56L,0,00:52:30
340,10032,32E,0,00:32:30
339,10032,32E,1,00:32:18


In [10]:
# average time between departures for each line 
avg_freq_by_route = (
    avg_freq_by_route_and_direction
    .groupby(['route_id', 'route_short_name'])
    .agg(
        average=('average', 'mean') # calculate the average time by route 
    )
    .reset_index()
    .sort_values(['average'])
    .assign(average=lambda df: df['average'].apply(format_timedelta))
    .reset_index(drop=True)
)
print("The most common routes: ")
avg_freq_by_route.head(10)

The most common routes: 


Unnamed: 0,route_id,route_short_name,average
0,29,29,00:01:46
1,41,41,00:02:04
2,26,26,00:02:17
3,31,31,00:02:26
4,309,309,00:02:35
5,95,95,00:02:37
6,50,50,00:02:46
7,16,16,00:02:54
8,15,15,00:02:55
9,511,511,00:02:55


In [11]:
print("The least common routes: ")
avg_freq_by_route.tail(10)[::-1]

The least common routes: 


Unnamed: 0,route_id,route_short_name,average
177,10035,35L,01:13:20
176,60,60,01:06:55
175,107,107,01:00:41
174,10056,56L,00:52:30
173,10032,32E,00:32:24
172,607,607,00:31:57
171,109,109,00:26:03
170,10706,706E,00:25:54
169,110,110,00:25:13
168,553,553,00:24:23


2. Which are the lines with the highest number of stations? Which are the stations with the highest number of lines?

In [12]:
stopsRoutes.head(2)

Unnamed: 0,stop_id,route_id,stop_name
0,638,401,Birčaninova
1,638,59,Birčaninova


In [13]:
lines_with_top_stations = (
    stopsRoutes
    .groupby(['route_id'])['stop_id']
    .nunique()
    .sort_values(ascending=False)
    .reset_index()
    .merge(routes[['route_id', 'route_short_name']])
    #.drop(['route_id'], axis=1)
    .reset_index(drop=True)
    .rename(columns={'stop_id': 'num_of_stations'})
)
print("Lines with the highest number of stations")
lines_with_top_stations.head(10)

Lines with the highest number of stations


Unnamed: 0,route_id,num_of_stations,route_short_name
0,10025,104,25P
1,605,103,605
2,74,97,74
3,94,97,94
4,610,94,610
5,90003,85,E6
6,73,84,73
7,302,83,302
8,95,82,95
9,405,81,405


In [17]:
stations_with_top_lines = (
    stopsRoutes
    .groupby(['stop_id', 'stop_name'])['route_id']
    .nunique()
    .sort_values(ascending=False)
    #.drop(['route_id'], axis=1)
    .reset_index(drop=True)
    .rename(columns={'route_id': 'num_of_lines'})
)
print("Stations with the highest number of lines")
stations_with_top_lines.head(10)

TypeError: Series.rename() got an unexpected keyword argument 'columns'

3. Calculate the expected daily traffic for a certain station (stop) as the total expected number of line stops on that station, based on the planned schedule. Which are the stations with the highest traffic? 

Comment: Here, we're going to consider the day of the ride (service_id).

In [18]:
stop_traffic = (
    stopTimes[['trip_id', 'stop_id']]
    .merge(trips[['trip_id', 'service_id']])
    .merge(stops[['stop_id', 'stop_name']])
    .drop(['trip_id'], axis=1)
    .groupby(['stop_id', 'stop_name', 'service_id'], as_index=False)['stop_id']
    .count()
    .sort_values(by=['stop_id'], ascending=False)
    .rename(columns={'stop_id': 'count'})
)
print("Stations (stops) with the highest traffic: \n")
print("Zimski-Radni Dan: \n")
stop_traffic[stop_traffic.service_id=="Zimski-Radni Dan"].drop(['service_id'], axis=1).head()

Stations (stops) with the highest traffic: 

Zimski-Radni Dan: 



Unnamed: 0,stop_name,count
1396,Sajam,1664
1393,Sajam,1656
1417,Ada Ciganlija,1643
355,Karađorđev Park,1622
352,Karađorđev Park,1619


In [19]:
print("Zimski-Subota: \n")
stop_traffic[stop_traffic.service_id=="Zimski-Subota"].drop(['service_id'], axis=1).head()

Zimski-Subota: 



Unnamed: 0,stop_name,count
803,Brankov Most,1278
1394,Sajam,1208
1397,Sajam,1202
1418,Ada Ciganlija,1202
356,Karađorđev Park,1189


In [20]:
print("Zimski-Nedelja: \n")
stop_traffic[stop_traffic.service_id=="Zimski-Subota"].drop(['service_id'], axis=1).head()

Zimski-Nedelja: 



Unnamed: 0,stop_name,count
803,Brankov Most,1278
1394,Sajam,1208
1397,Sajam,1202
1418,Ada Ciganlija,1202
356,Karađorđev Park,1189


4. Assign each station a category based on the type of transition that passes through that station (bus station, tram station, trolleybus station or mixed station). How many stations of each category there are?

Comment on route_type : 0 - tram, 3 - bus, 702 - E line, 800 - trolleybus 
E lines are considered as buses.

In [21]:
routes_redefined = routes[['route_id', 'route_type']].copy()
routes_redefined.loc[routes_redefined['route_type'] == 702, 'route_type'] = 3 # change route_type of E line to be the same as bus

In [22]:
def format_stop_type():
    pass
stop_types_df = (
    stopsRoutes
    .merge(routes_redefined)
    .drop('route_id', axis=1)
    .groupby(['stop_id', 'stop_name'], as_index=False)['route_type']
    .agg({'route_type': 'max', 'num_types': 'nunique'})
    #.drop(['route_id'], axis=1)
    .reset_index(drop=True)
    #.assign(num_of_types=lambda df: df['route_type'].apply(format_stop_type))
    .assign(stop_type=lambda df: np.where(df['num_types'] > 1, 'Mixed', df['route_type']))
    .drop(['num_types', 'route_type'], axis=1)
    .assign(stop_type=lambda df: df['stop_type'].replace({'0': 'Tram', '3': 'Bus', '800': 'Trolleybus'}))
)
print("Number of stations by categories: ")
stop_types_df.stop_type.value_counts()

Number of stations by categories: 


Bus           2858
Mixed          170
Tram            88
Trolleybus      11
Name: stop_type, dtype: int64

In [25]:
stop_types_df.to_csv('cleaned_data/stop_types_df.csv', index=False)