In [1]:
import os 
import polars as pl
import pickle

In [2]:
# Root directory of the Uber data project; every input path in this notebook is built from it.
# Set the UBER_DATA_DIR environment variable to run on another machine; when it is
# unset, the original local default is used unchanged.
path = os.environ.get("UBER_DATA_DIR", "/Users/afnan/Desktop/python/Data_science_Notes/Uber_data")

In [3]:
# List the preprocessed directory and keep only the parquet files.
# os.listdir returns entries in arbitrary order, and later cells pick files by
# position and then sample rows by stride, so the names are sorted to make the
# selection (and therefore the sample) reproducible across runs and machines.
all_files = sorted(os.listdir(f"{path}/4_ETA/preprocessed"))
parquet_files = [file for file in all_files if file.endswith(".parquet")]

In [4]:
# Columns read from every preprocessed parquet file.
# Order matters: a later cell slices off the last two entries to obtain the
# model input columns, so trip_time and minutes_per_mile must stay at the end.
feature_cols = [
    # trip / dispatch attributes
    "taxi_company",
    "trip_miles",
    "wav_request_flag",
    "wav_match_flag",
    "dispatching_base_num",
    # pickup location
    "PULocationID",
    "PUBorough",
    "PUservice_zone",
    # drop-off location
    "DOLocationID",
    "DOBorough",
    "DOservice_zone",
    # tolls, time and traffic
    "any_tolls",
    "hour_of_day",
    "day_of_week",
    "month",
    "traffic",
    # remaining inputs (names suggest weather readings -- TODO confirm)
    "feel",
    "humidity",
    "BR",
    "CLR",
    "SN",
    "wind_speed",
    "wind_direction",
    # trailing two columns, excluded from the model inputs
    "trip_time",
    "minutes_per_mile",
]

In [5]:
def _read_features(index):
    """Read the `feature_cols` columns from the `index`-th entry of `parquet_files`.

    Raises IndexError when `parquet_files` has no entry at `index`.
    """
    return pl.read_parquet(f"{path}/4_ETA/preprocessed/{parquet_files[index]}", columns=feature_cols)


# One frame per file, for the first eight parquet files, read in list order.
df1, df2, df3, df4, df5, df6, df7, df8 = (_read_features(i) for i in range(8))

In [6]:
# Collect the eight per-file frames, in file order, for concatenation.
dfs = [
    df1, df2, df3, df4,
    df5, df6, df7, df8,
]

In [7]:
# Stack the per-file frames row-wise into one frame ("vertical" is polars' default strategy).
combined_df = pl.concat(dfs, how="vertical")

In [8]:
# Keep every 100th row (starting with the first) as a smaller evaluation sample.
sample_stride = 100
test_df = combined_df[::sample_stride]

In [9]:
# (rows, columns) of the sampled frame.
(test_df.height, test_df.width)

(162772, 25)

In [10]:
# Start the encoded frame from the sampled frame. The label-encoding cell below
# rebinds `encoded_df` to new frames, while `test_df` keeps the original string
# categories for the final report.
encoded_df = test_df

##### Label encoding

In [11]:
# Label-encoding tables: each maps a raw string category to the integer code that
# replaces it when the frame is encoded for the model.

# Month names numbered 1..12 in calendar order.
month_mapping = {
    name: number
    for number, name in enumerate(
        ["January", "February", "March", "April", "May", "June", "July",
         "August", "September", "October", "November", "December"],
        start=1,
    )
}

# Weekday names numbered 1..7, starting on Monday.
day_of_week_mapping = {
    name: number
    for number, name in enumerate(
        ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
        start=1,
    )
}

# Company names, coded 0..2 in alphabetical order.
taxi_mapping = {name: code for code, name in enumerate(["Lyft", "Uber", "Via"])}

# Dispatching base numbers, coded 0..33 in ascending order of the base number string.
dbn_mapping = {
    base: code
    for code, base in enumerate([
        "B02395", "B02510", "B02512", "B02617", "B02682", "B02764", "B02765",
        "B02800", "B02835", "B02836", "B02844", "B02864", "B02865", "B02866",
        "B02867", "B02869", "B02870", "B02871", "B02872", "B02875", "B02876",
        "B02877", "B02878", "B02879", "B02880", "B02882", "B02883", "B02884",
        "B02887", "B02888", "B02889", "B03136", "B03404", "B03406",
    ])
}

# Wind directions, coded 0..7 in alphabetical order.
wind_mapping = {
    direction: code
    for code, direction in enumerate(["E", "N", "NE", "NW", "S", "SE", "SW", "W"])
}

# Borough names, coded 1..6 in this fixed (non-alphabetical) order.
borough_mapping = {
    name: code
    for code, name in enumerate(
        ["Queens", "Manhattan", "Bronx", "Staten Island", "Brooklyn", "EWR"],
        start=1,
    )
}

# Service zones, coded 1..3.
# NOTE(review): the zone lookup table displayed later in this notebook contains an
# "EWR" service zone that has no code here -- confirm this matches how the model was trained.
zone_mapping = {
    name: code
    for code, name in enumerate(["Yellow Zone", "Boro Zone", "Airports"], start=1)
}

In [12]:
# Column name -> encoding table for every categorical column that gets label-encoded.
# Pickup and drop-off columns share the same borough and service-zone tables.
categories = dict(
    day_of_week=day_of_week_mapping,
    dispatching_base_num=dbn_mapping,
    month=month_mapping,
    taxi_company=taxi_mapping,
    wind_direction=wind_mapping,
    PUBorough=borough_mapping,
    DOBorough=borough_mapping,
    PUservice_zone=zone_mapping,
    DOservice_zone=zone_mapping,
)

In [13]:
# Label-encode every categorical column listed in `categories` in a single pass.
# The result is derived from `test_df` (which `encoded_df` aliases on a fresh run)
# rather than from `encoded_df` itself, so re-running this cell does not try to
# re-encode columns that are already integers.
# strict=False: values that cannot be cast to Int64 become null instead of raising.
encoded_df = test_df.with_columns([
    pl.col(cat).replace(mapping).cast(pl.Int64, strict=False)
    for cat, mapping in categories.items()
])

##### Load model

In [14]:
# Load the trained model (presumably XGBoost, going by the file name) from disk.
# The `with` block closes the file handle as soon as the model is loaded.
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only load
# a model file that comes from a trusted source.
with open(f"{path}/4_ETA/xgb_model.pkl", 'rb') as model_file:
    model = pickle.load(model_file)

In [15]:
# Model input columns: everything in `feature_cols` except the two trailing
# non-input columns. They are excluded by name rather than by position so that
# reordering or extending `feature_cols` cannot silently feed them to the model.
# For the current `feature_cols` this is identical to `feature_cols[:-2]`.
target_cols = ("trip_time", "minutes_per_mile")
train_cols = [col for col in feature_cols if col not in target_cols]

In [16]:
# Run the model on the encoded inputs, then scale its output by trip distance.
# Presumably the model predicts minutes per mile, so the product is a trip
# duration comparable with `trip_time` -- TODO confirm against the training notebook.
per_mile_pred = model.predict(encoded_df[train_cols])
trip_pred = per_mile_pred * encoded_df['trip_miles']
test_df = test_df.with_columns(trip_pred.alias("prediction"))

In [17]:
# Remove the minutes_per_mile column from the report frame.
# Selecting "everything except" instead of calling drop() keeps this cell safe to
# re-run: once the column is gone, the selection simply returns the frame unchanged.
test_df = test_df.select(pl.exclude("minutes_per_mile"))

In [18]:
# Absolute difference between the actual trip time and the predicted one, per row.
abs_error = (pl.col("trip_time") - pl.col("prediction")).abs()
test_df = test_df.with_columns(abs_error.alias("error"))

In [19]:
# Load the zone lookup table (LocationID -> Borough / Zone / service_zone) and preview it.
zone_lookup_csv = f'{path}/dataset/taxi_zone_lookup.csv'
taxi_zone_lookup = pl.read_csv(zone_lookup_csv)
taxi_zone_lookup.head(5)

LocationID,Borough,Zone,service_zone
i64,str,str,str
1,"""EWR""","""Newark Airport…","""EWR"""
2,"""Queens""","""Jamaica Bay""","""Boro Zone"""
3,"""Bronx""","""Allerton/Pelha…","""Boro Zone"""
4,"""Manhattan""","""Alphabet City""","""Yellow Zone"""
5,"""Staten Island""","""Arden Heights""","""Boro Zone"""


In [20]:
# Attach human-readable zone names for the pickup and drop-off location IDs.
# The join uses polars' default inner strategy, so a row whose ID is missing from
# the lookup table is dropped.
zone_names = taxi_zone_lookup[["LocationID", "Zone"]]
for id_col, zone_col in (("PULocationID", "PUZone"), ("DOLocationID", "DOZone")):
    joined = test_df.join(zone_names, left_on=id_col, right_on="LocationID")
    test_df = joined.rename({"Zone": zone_col})

In [22]:
# Preview the first five rows of the final report frame.
test_df.head(5)

taxi_company,trip_miles,wav_request_flag,wav_match_flag,dispatching_base_num,PULocationID,PUBorough,PUservice_zone,DOLocationID,DOBorough,DOservice_zone,any_tolls,hour_of_day,day_of_week,month,traffic,feel,humidity,BR,CLR,SN,wind_speed,wind_direction,trip_time,prediction,error,PUZone,DOZone
str,f64,i32,i32,str,i64,str,str,i64,str,str,i32,i32,str,str,i32,f64,f64,i64,i64,i64,f64,str,f64,f64,f64,str,str
"""Lyft""",2.1,0,0,"""B02510""",39,"""Brooklyn""","""Boro Zone""",72,"""Brooklyn""","""Boro Zone""",0,14,"""Sunday""","""July""",24000,73.9,81.82,0,1,0,9.0,"""SE""",11.55,11.324217,0.225783,"""Canarsie""","""East Flatbush/…"
"""Lyft""",0.964,0,0,"""B02510""",205,"""Queens""","""Boro Zone""",122,"""Queens""","""Boro Zone""",0,14,"""Sunday""","""July""",24000,73.9,81.82,0,1,0,9.0,"""SE""",6.1,5.226246,0.873754,"""Saint Albans""","""Hollis"""
"""Lyft""",5.701,0,0,"""B02510""",129,"""Queens""","""Boro Zone""",192,"""Queens""","""Boro Zone""",0,14,"""Sunday""","""July""",24000,73.9,81.82,0,1,0,9.0,"""SE""",14.366667,20.508784,6.142117,"""Jackson Height…","""Queensboro Hil…"
"""Uber""",1.79,0,0,"""B02869""",226,"""Queens""","""Boro Zone""",129,"""Queens""","""Boro Zone""",0,14,"""Sunday""","""July""",24000,73.9,81.82,0,1,0,9.0,"""SE""",17.933333,11.384996,6.548337,"""Sunnyside""","""Jackson Height…"
"""Uber""",0.63,0,0,"""B02882""",63,"""Brooklyn""","""Boro Zone""",76,"""Brooklyn""","""Boro Zone""",0,14,"""Sunday""","""July""",24000,73.9,81.82,0,1,0,9.0,"""SE""",5.45,4.766952,0.683048,"""Cypress Hills""","""East New York"""


In [23]:
# test_df.write_csv("prediction.csv")