In [1]:
import os
import polars as pl
import json 

In [2]:
# Root directory of the Uber dataset. Set the UBER_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original local path is used, so existing behaviour is unchanged.
path = os.environ.get(
    "UBER_DATA_DIR",
    "/Users/afnan/Desktop/python/Data_science_Notes/Uber_data",
)

In [3]:
# os.listdir returns directory entries in arbitrary order. Sort the listing so
# that positional indexing into parquet_files picks the same files, in the
# same order, on every run and on every machine.
all_files = sorted(os.listdir(f"{path}/4_ETA/preprocessed"))
parquet_files = [name for name in all_files if name.endswith(".parquet")]

In [4]:
# Categorical columns to load from each preprocessed parquet file.
feature_cols = [
    'PULocationID',
    'PUBorough',
    'PUservice_zone',
    'day_of_week',
    'dispatching_base_num',
    'month',
    'taxi_company',
    'wind_direction',
]

In [5]:
# Load the feature columns from the first eight preprocessed parquet files.
# One comprehension replaces eight copy-pasted read calls; df1..df8 are still
# bound individually because the next cell collects them by name.
# Indexing parquet_files[i] raises IndexError if fewer than eight files exist.
preprocessed_dir = f"{path}/4_ETA/preprocessed"
df1, df2, df3, df4, df5, df6, df7, df8 = [
    pl.read_parquet(f"{preprocessed_dir}/{parquet_files[i]}", columns=feature_cols)
    for i in range(8)
]

In [6]:
# Collect the per-file frames, in load order, for concatenation.
dfs = [
    df1, df2, df3, df4,
    df5, df6, df7, df8,
]

In [7]:
# Stack the per-file frames row-wise into a single frame
# ("vertical" is the default concatenation strategy, spelled out here).
combined_df = pl.concat(dfs, how="vertical")

In [8]:
# Preview the first five rows of the combined frame.
combined_df.head(n=5)

PULocationID,PUBorough,PUservice_zone,day_of_week,dispatching_base_num,month,taxi_company,wind_direction
i64,str,str,str,str,str,str,str
39,"""Brooklyn""","""Boro Zone""","""Sunday""","""B02510""","""July""","""Lyft""","""SE"""
141,"""Manhattan""","""Yellow Zone""","""Sunday""","""B02866""","""July""","""Uber""","""SE"""
222,"""Brooklyn""","""Boro Zone""","""Sunday""","""B02764""","""July""","""Uber""","""SE"""
250,"""Bronx""","""Boro Zone""","""Sunday""","""B02836""","""July""","""Uber""","""SE"""
256,"""Brooklyn""","""Boro Zone""","""Sunday""","""B02872""","""July""","""Uber""","""SE"""


##### Locations

In [9]:
# Distinct (location id, borough, service zone) combinations seen in the trips.
location_cols = ['PULocationID', 'PUBorough', 'PUservice_zone']
location_df = combined_df.select(location_cols).unique()

In [10]:
# Read the zone lookup table shipped with the dataset and preview it.
lookup_csv = f'{path}/dataset/taxi_zone_lookup.csv'
taxi_zone_lookup = pl.read_csv(lookup_csv)
taxi_zone_lookup.head(n=5)

LocationID,Borough,Zone,service_zone
i64,str,str,str
1,"""EWR""","""Newark Airport…","""EWR"""
2,"""Queens""","""Jamaica Bay""","""Boro Zone"""
3,"""Bronx""","""Allerton/Pelha…","""Boro Zone"""
4,"""Manhattan""","""Alphabet City""","""Yellow Zone"""
5,"""Staten Island""","""Arden Heights""","""Boro Zone"""


In [11]:
# Attach the human-readable zone name to each pickup location.
# The join is an inner join (the default), matching PULocationID to LocationID.
zone_names = taxi_zone_lookup.select(['LocationID', 'Zone'])
location_df = location_df.join(
    zone_names,
    left_on="PULocationID",
    right_on="LocationID",
    how="inner",
)

In [12]:
# Drop the pickup-specific "PU" prefixes and call the zone name "Location".
column_renames = {
    "PULocationID": "LocationID",
    "PUBorough": "Borough",
    "PUservice_zone": "service_zone",
    "Zone": "Location",
}
location_df = location_df.rename(column_renames)

In [13]:
# Integer codes for the borough labels.
borough_mapping = {
    "Queens": 1,
    "Manhattan": 2,
    "Bronx": 3,
    "Staten Island": 4,
    "Brooklyn": 5,
    "EWR": 6,
}

# Integer codes for the service-zone labels.
zone_mapping = {
    "Yellow Zone": 1,
    "Boro Zone": 2,
    "Airports": 3,
}

In [14]:
# Replace the borough and service-zone labels with their integer codes.
# strict=False makes the cast yield null instead of raising for any value
# that cannot be converted to an integer.
borough_code = pl.col("Borough").replace(borough_mapping).cast(pl.Int64, strict=False)
zone_code = pl.col("service_zone").replace(zone_mapping).cast(pl.Int64, strict=False)
location_df = location_df.with_columns([borough_code, zone_code])

In [15]:
# Preview the first five rows of the encoded location table.
location_df.head(n=5)

LocationID,Borough,service_zone,Location
i64,i64,i64,str
1,6,3,"""Newark Airport…"
3,3,2,"""Allerton/Pelha…"
4,2,1,"""Alphabet City"""
5,4,2,"""Arden Heights"""
6,4,2,"""Arrochar/Fort …"


In [16]:
# location_df.write_csv("locations.csv")

##### Other categories

In [17]:
# Month name -> 1-based calendar month number.
_month_names = (
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
)
month_mapping = {name: number for number, name in enumerate(_month_names, start=1)}

# Weekday name -> number, with Monday as 1 and Sunday as 7.
_weekday_names = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
day_of_week_mapping = {name: number for number, name in enumerate(_weekday_names, start=1)}

# Ride-hailing company label -> integer code.
taxi_mapping = dict(Lyft=0, Uber=1, Via=2)
# Dispatching base number -> integer code.
# Each code equals the key's rank when the keys are sorted ascending; the
# entries themselves are kept in their original (unsorted) insertion order.
dbn_mapping = {
    'B02510': 1,
    'B02866': 13,
    'B02764': 5,
    'B02836': 9,
    'B02872': 18,
    'B02887': 28,
    'B02765': 6,
    'B02867': 14,
    'B02877': 21,
    'B02835': 8,
    'B02883': 26,
    'B02682': 4,
    'B02876': 20,
    'B02617': 3,
    'B02882': 25,
    'B02512': 2,
    'B02869': 15,
    'B02871': 17,
    'B02878': 22,
    'B02870': 16,
    'B02864': 11,
    'B02884': 27,
    'B02395': 0,
    'B02875': 19,
    'B02865': 12,
    'B02880': 24,
    'B02889': 30,
    'B02888': 29,
    'B02879': 23,
    'B02800': 7,
    'B02844': 10,
    'B03136': 31,
    'B03406': 33,
    'B03404': 32,
}
# Compass wind direction -> integer code (codes follow alphabetical key order).
wind_mapping = dict(SE=5, NW=3, NE=2, S=4, N=1, E=0, SW=6, W=7)

In [18]:
# Bundle the non-location encodings, keyed by the feature column each applies to.
categories = dict(
    day_of_week=day_of_week_mapping,
    dispatching_base_num=dbn_mapping,
    month=month_mapping,
    taxi_company=taxi_mapping,
    wind_direction=wind_mapping,
)

In [19]:
# with open("other_categories.json", "w") as outfile: 
#     json.dump(categories, outfile) 