# Wroclaw Public Transport

***

#### About the notebook
In this notebook, I will make some geographical analysis of the public transport in Wroclaw, Poland.

<br>

***

#### About the dataset
This GPS dataset is from [www.kaggle.com](https://www.kaggle.com). It was collected by the user [Piotr](https://www.kaggle.com/pieca111). It represents data from public transport vehicles in Wrocław, Poland in the period 2022-04-13 to 2022-04-30.

***

### (1) Import data from CSV

In [118]:
# IMPORTS
import pandas as pd

# LOAD CSV IN DATAFRAME
df = pd.read_csv("../data/positions.csv",
                 dtype='unicode',
                 names=["ID",
                        "Vehicle-ID",
                        "Line",
                        "Type",
                        "Latitude",
                        "Longitude",
                        "Timestamp",
                        ])

# DELETE FIRST ROW
df = df.iloc[1: , :]

### (2) Print dataframe to explore data

In [119]:
df.head(100)

Unnamed: 0,ID,Vehicle-ID,Line,Type,Latitude,Longitude,Timestamp
1,1,19707402,33,tram,51.113544,17.067019,2022-04-13T11:34:46.810755
2,2,19707424,33,tram,51.11554,17.074562,2022-04-13T11:34:46.810755
3,3,19679615,31,tram,51.112755,17.012657,2022-04-13T11:34:46.810755
4,4,19679761,31,tram,51.112915,17.01011,2022-04-13T11:34:46.810755
5,5,19679724,31,tram,51.12958,16.982096,2022-04-13T11:34:46.810755
...,...,...,...,...,...,...,...
96,96,19767216,d,bus,51.15687,17.121798,2022-04-13T11:34:46.810755
97,97,19767441,k,bus,7158.279,7158.279,2022-04-13T11:34:46.810755
98,98,19708471,70,tram,51.121685,17.043598,2022-04-13T11:34:46.810755
99,99,19764541,131,bus,51.14542,17.108097,2022-04-13T11:34:46.810755


### (3) Prepare data

#### Split and convert 'Timestamp' into 'Date' and 'Time'

In [120]:
# SPLIT 'TIMESTAMP' COLUMN
df[['Date', 'Time']] = df['Timestamp'].str.split('T', 1, expand=True)

In [121]:
# CONVERT 'DATE' COLUMN
df['Date'] = pd.to_datetime(df.Date, format='%Y-%m-%d')

In [122]:
# CONVERT 'TIME' COLUMN
df['Time'] = pd.to_datetime(df.Time, format ='%H:%M:%S.%f', errors = 'coerce').dt.time # TIME IS STILL AN OBJECT

In [123]:
# REMOVE CHARACTER IN 'TIMESTAMP' COLUMN
df['Timestamp'] = df["Timestamp"].str.replace("T"," ")

In [124]:
# CONVERT 'TIMESTAMP' COLUMN
df['Timestamp'] = pd.to_datetime(df.Timestamp, format='%Y-%m-%d %H:%M:%S.%f')

#### Convert Geo-Positions (Latitude, Longitude)


In [125]:
# CONVERT LATITUDE
df = df.astype({'Latitude':'float'})
# CONVERT LONGITUDE
df = df.astype({'Longitude':'float'})

In [126]:
# PRINT DTYPES OF DATAFRAME
df.dtypes

ID                    object
Vehicle-ID            object
Line                  object
Type                  object
Latitude             float64
Longitude            float64
Timestamp     datetime64[ns]
Date          datetime64[ns]
Time                  object
dtype: object

#### Remove Outliers (Latitude, Longitude)

In [127]:
# SORT DATAFRAME DESCENDING
df.sort_values('Latitude', ascending=False)

Unnamed: 0,ID,Vehicle-ID,Line,Type,Latitude,Longitude,Timestamp,Date,Time
16528211,16528211,19775089,d,bus,7158.279,7158.279,2022-04-24 13:37:38.445930,2022-04-24,13:37:38.445930
25017226,25017226,19741475,2,tram,7158.279,7158.279,2022-04-29 17:59:42.887675,2022-04-29,17:59:42.887675
6428254,6428254,19760948,104,bus,7158.279,7158.279,2022-04-18 06:04:31.770293,2022-04-18,06:04:31.770293
15171417,15171417,19802765,1,tram,7158.279,7158.279,2022-04-23 12:18:04.384013,2022-04-23,12:18:04.384013
20355628,20355628,19819176,c,bus,7158.279,7158.279,2022-04-27 04:24:33.489169,2022-04-27,04:24:33.489169
...,...,...,...,...,...,...,...,...,...
7617396,7617396,19766067,602,bus,0.000,0.000,2022-04-19 06:49:36.167243,2022-04-19,06:49:36.167243
18566877,18566877,19741105,33,tram,0.000,0.000,2022-04-25 17:24:42.447676,2022-04-25,17:24:42.447676
17373534,17373534,19741092,33,tram,0.000,0.000,2022-04-25 05:16:12.900279,2022-04-25,05:16:12.900279
7415555,7415555,19766066,602,bus,0.000,0.000,2022-04-19 04:48:07.079381,2022-04-19,04:48:07.079381


In [131]:
# DROP OUTLIERS IN LATITUDE
df.drop(index=df[df['Latitude'] >= 60].index, inplace=True)
# DROP OUTLIERS IN LONGITUDE
df.drop(index=df[df['Longitude'] >= 60].index, inplace=True)

#### Print prepared dataframe

In [132]:
# PRINT DATAFRAME
df.sort_values('Latitude', ascending=False)

Unnamed: 0,ID,Vehicle-ID,Line,Type,Latitude,Longitude,Timestamp,Date,Time
12231954,12231954,19740542,3,tram,55.538483,4.232975,2022-04-21 16:48:14.570366,2022-04-21,16:48:14.570366
12231531,12231531,19740542,3,tram,55.538483,4.232975,2022-04-21 16:47:59.635115,2022-04-21,16:47:59.635115
12232800,12232800,19740542,3,tram,55.538483,4.232975,2022-04-21 16:48:44.484524,2022-04-21,16:48:44.484524
12232376,12232376,19740542,3,tram,55.538483,4.232975,2022-04-21 16:48:29.482241,2022-04-21,16:48:29.482241
3349756,3349756,19709911,20,tram,54.522713,21.743164,2022-04-15 13:33:51.133777,2022-04-15,13:33:51.133777
...,...,...,...,...,...,...,...,...,...
7440776,7440776,19766066,602,bus,0.000000,0.000000,2022-04-19 05:03:08.167582,2022-04-19,05:03:08.167582
18521376,18521376,19741104,33,tram,0.000000,0.000000,2022-04-25 16:34:27.314082,2022-04-25,16:34:27.314082
17806967,17806967,19741096,33,tram,0.000000,0.000000,2022-04-25 09:16:56.213541,2022-04-25,09:16:56.213541
17544096,17544096,19741094,33,tram,0.000000,0.000000,2022-04-25 06:43:40.872567,2022-04-25,06:43:40.872567


### (4) Visualise data

#### Create new dataframe at specific date

In [133]:
# CREATE NEW DATAFRAME
time_df = pd.DataFrame()

# WRITE DATA AT SPECIFIC DATE IN NEW DATAFRAME
time_df = df[df['Date'] == '2022-04-25'] # HERE YOU CAN SET DATE

In [134]:
# COUNT ALL ROWS IN DATAFRAME
len(time_df.index)

1641518

In [135]:
# COUNT ENTRIES IN 'TYPE' COLUMN
time_df['Type'].value_counts()

bus     1043371
tram     598147
Name: Type, dtype: int64

#### Create new dataframe for 'Type' and counts entries

In [136]:
# CREATE NEW DATAFRAME
type_df = pd.DataFrame()

# COUNTS ENTRIES OF 'Type' AT SPECIFIC TIME
type_df = time_df.groupby(['Type', 'Timestamp']).size().to_frame('Count')

# REMOVES SUB COLUMNS
type_df = type_df.reset_index()

# SHOWS AXES OF AN DATAFRAME
type_df.axes

[RangeIndex(start=0, stop=10370, step=1),
 Index(['Type', 'Timestamp', 'Count'], dtype='object')]

In [137]:
type_df.reset_index()

Unnamed: 0,index,Type,Timestamp,Count
0,0,bus,2022-04-25 00:00:09.766160,36
1,1,bus,2022-04-25 00:00:25.082715,37
2,2,bus,2022-04-25 00:00:40.596822,36
3,3,bus,2022-04-25 00:00:55.344544,36
4,4,bus,2022-04-25 00:01:10.111151,36
...,...,...,...,...
10365,10365,tram,2022-04-25 21:13:13.105450,28
10366,10366,tram,2022-04-25 21:13:28.112590,26
10367,10367,tram,2022-04-25 21:13:43.093792,15
10368,10368,tram,2022-04-25 21:13:58.094528,9


In [138]:
# IMPORTS
import seaborn as sns
import matplotlib.pyplot as plt

# SET THEME
#sns.set_theme(style="whitegrid")

# SET SIZE OF PLOT
#fig_dims = (14, 8)
#fig, ax = plt.subplots(figsize=fig_dims)

# LINEPLOT
#sns.lineplot(x="Timestamp", y="Count", hue="Type", data=type_df);

import plotly.express as px

fig = px.line(type_df, x="Timestamp", y="Count", color='Type')
fig.show()

#### Create new dataframe for a specific 'Line'

In [140]:
# CREATE NEW DATAFRAME
line_df = pd.DataFrame()

# WRITE DATA OF A SPECIFIC LINE IN NEW DATAFRAME
line_df = time_df[time_df['Line'] == '3'] # HERE YOU CAN SET LINE
line_df

Unnamed: 0,ID,Vehicle-ID,Line,Type,Latitude,Longitude,Timestamp,Date,Time
17092142,17092142,19740616,3,tram,51.125164,17.040920,2022-04-25 01:59:56.196346,2022-04-25,01:59:56.196346
17092228,17092228,19740616,3,tram,51.125170,17.040897,2022-04-25 02:00:10.106957,2022-04-25,02:00:10.106957
17092312,17092312,19740616,3,tram,51.125170,17.040897,2022-04-25 02:00:25.103417,2022-04-25,02:00:25.103417
17092398,17092398,19740616,3,tram,51.125190,17.040900,2022-04-25 02:00:40.666367,2022-04-25,02:00:40.666367
17092483,17092483,19740616,3,tram,51.125190,17.040900,2022-04-25 02:00:55.322744,2022-04-25,02:00:55.322744
...,...,...,...,...,...,...,...,...,...
18708948,18708948,19740430,3,tram,51.106390,17.046100,2022-04-25 20:54:43.067320,2022-04-25,20:54:43.067320
18708953,18708953,19740370,3,tram,51.112473,17.018630,2022-04-25 20:54:43.067320,2022-04-25,20:54:43.067320
18709079,18709079,19740430,3,tram,51.106390,17.046100,2022-04-25 20:54:58.060967,2022-04-25,20:54:58.060967
18709210,18709210,19740430,3,tram,51.106390,17.046100,2022-04-25 20:55:13.065929,2022-04-25,20:55:13.065929


In [141]:
# SHOW VALUES
line_df['Vehicle-ID'].value_counts()

19740308    289
19740655    275
19740417    274
19744941    271
19740323    268
           ... 
19744196    144
19740600    138
19740638    121
19740452    119
19740484     68
Name: Vehicle-ID, Length: 145, dtype: int64

#### Create new dataframe for a specific ‘Vehicle’

In [142]:
# CREATE NEW DATAFRAME
vehicle_df = pd.DataFrame()

# WRITE DATA OF A SPECIFIC LINE IN NEW DATAFRAME
vehicle_df = line_df[line_df['Vehicle-ID'] == '19740308'] # HERE YOU CAN SET VEHICLE-ID
vehicle_df

Unnamed: 0,ID,Vehicle-ID,Line,Type,Latitude,Longitude,Timestamp,Date,Time
17860695,17860695,19740308,3,tram,51.077580,17.083690,2022-04-25 09:52:11.303135,2022-04-25,09:52:11.303135
17861085,17861085,19740308,3,tram,51.077580,17.083690,2022-04-25 09:52:26.290757,2022-04-25,09:52:26.290757
17861472,17861472,19740308,3,tram,51.076572,17.083668,2022-04-25 09:52:41.308460,2022-04-25,09:52:41.308460
17861858,17861858,19740308,3,tram,51.076572,17.083668,2022-04-25 09:52:56.324579,2022-04-25,09:52:56.324579
17862246,17862246,19740308,3,tram,51.076680,17.084118,2022-04-25 09:53:11.290868,2022-04-25,09:53:11.290868
...,...,...,...,...,...,...,...,...,...
17971363,17971363,19740308,3,tram,51.138670,16.965544,2022-04-25 11:03:11.502227,2022-04-25,11:03:11.502227
17971767,17971767,19740308,3,tram,51.139120,16.965260,2022-04-25 11:03:26.495514,2022-04-25,11:03:26.495514
17972172,17972172,19740308,3,tram,51.139780,16.964930,2022-04-25 11:03:41.502597,2022-04-25,11:03:41.502597
17972573,17972573,19740308,3,tram,51.140633,16.964367,2022-04-25 11:03:56.492484,2022-04-25,11:03:56.492484


#### Create map for a specific 'Vehicle' at a specific 'Date'

In [143]:
# IMPORTS
import folium

# SET UP MAP
map_vehicle = folium.Map(location=[51.107883, 17.038538], zoom_start=12)

# CREATE NEW DATAFRAME
locations_df = pd.DataFrame()

# WRITE DATA IN A NEW DATAFRAME GROUPED BY 'Latitude' AND 'Longitude'
locations_df = vehicle_df[['Latitude', 'Longitude']].copy()

# WRITE LOCATION DATA IN A LIST
location_list = locations_df.values.tolist()

folium.PolyLine(location_list, color="red", weight=3.5, opacity=1).add_to(map_vehicle)

map_vehicle

In [145]:
import plotly.figure_factory as ff
import plotly.express as px

px.set_mapbox_access_token(open(".mapbox_token").read())

fig = ff.create_hexbin_mapbox(
    data_frame=time_df, lat="Latitude", lon="Longitude",
    nx_hexagon=1000, opacity=0.5, labels={"color": "Point Count"},
    min_count=100,
)

fig.update_layout(mapbox_zoom=10, mapbox_center_lat = 51.107883, mapbox_center_lon = 17.038538, margin={"r":0,"t":0,"l":0,"b":0})

fig.show()