In [1]:
import pandas as pd
import joblib
from sklearn.metrics import f1_score

# Cargamos el modelo y las funciones que necesitamos del notebook 00

In [3]:
rfc = joblib.load("./model/random_forest.joblib")

In [9]:
numeric_feat = [
"pickup_weekday",
"pickup_hour",
'work_hours',
"pickup_minute",
"passenger_count",
'trip_distance',
'trip_time',
'trip_speed'
]
categorical_feat = [
    "PULocationID",
    "DOLocationID",
    "RatecodeID",
]
EPS = 1e-7
features = numeric_feat + categorical_feat
target_col = "high_tip"

In [4]:
def preprocess(df, target_col):

   # Basic cleaning
    df = df[df['fare_amount'] > 0].reset_index(drop=True)  # avoid divide-by-zero
    # add target
    df['tip_fraction'] = df['tip_amount'] / df['fare_amount']
    df[target_col] = df['tip_fraction'] > 0.2

    # add features
    df['pickup_weekday'] = df['tpep_pickup_datetime'].dt.weekday
    df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
    df['pickup_minute'] = df['tpep_pickup_datetime'].dt.minute
    df['work_hours'] = (df['pickup_weekday'] >= 0) & (df['pickup_weekday'] <= 4) & (df['pickup_hour'] >= 8) & (df['pickup_hour'] <= 18)
    df['trip_time'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.seconds
    df['trip_speed'] = df['trip_distance'] / (df['trip_time'] + EPS)

    # drop unused columns
    df = df[['tpep_dropoff_datetime'] + features + [target_col]]
    df[features + [target_col]] = df[features + [target_col]].astype("float32").fillna(-1.0)

    # convert target to int32 for efficiency (it's just 0s and 1s)
    df[target_col] = df[target_col].astype("int32")

    return df.reset_index(drop=True)

# Evaluación Marzo 2020

In [16]:
taxi_march= pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-03.parquet')
taxi_test = preprocess(taxi_march, target_col=target_col).head(1000)
taxi_test.head()

preds_test = rfc.predict_proba(taxi_test[features])
preds_test_labels = [p[1] for p in preds_test.round()]
print(f'march-2020 F1: {f1_score(taxi_test[target_col], preds_test_labels)}')

march-2020 F1: 0.7428571428571428


In [17]:
taxi_test.to_csv('./app/data/yellow_tripdata_2020-03_test.csv', index=False)

# Evaluación Mayo 2020

In [18]:
taxi_may= pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-04.parquet')
taxi_test = preprocess(taxi_may, target_col=target_col).head(1000)
taxi_test.head()

preds_test = rfc.predict_proba(taxi_test[features])
preds_test_labels = [p[1] for p in preds_test.round()]
print(f'may-2020 F1: {f1_score(taxi_test[target_col], preds_test_labels)}')

may-2020 F1: 0.5535545023696683


In [19]:
taxi_test.to_csv('./app/data/yellow_tripdata_2020-05_test.csv', index=False)