In [6]:
# Data loading
import os
import urllib.request

# urlretrieve opens the destination for writing and does not create missing
# directories, so make sure the target folder exists before downloading.
os.makedirs("data", exist_ok=True)

# (source URL, local destination) pairs: two months of trip records plus the
# taxi zone lookup table.
for url, destination in (
    (
        "https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2022-01.parquet",
        "data/taxi-trips-jan22.parquet",
    ),
    (
        "https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2022-02.parquet",
        "data/taxi-trips-feb22.parquet",
    ),
    (
        "https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv",
        "data/taxi-zones.csv",
    ),
):
    urllib.request.urlretrieve(url, destination)

('data/taxi-zones.csv', <http.client.HTTPMessage at 0x7f97710bf550>)

In [7]:
# Helpers that deliberately corrupt ("uglify") the trip data for the data management & quality exercise
import pandas as pd
from tqdm import tqdm

# Enable tqdm's pandas integration.
# NOTE(review): nothing in the visible cells relies on it — confirm it is still needed.
tqdm.pandas()


def add_negative_values(
    data: pd.DataFrame,
    col: str,
    frac: float = 0.001,
    random_state=None,
) -> pd.DataFrame:
    """
    Selects a few rows & disrupts the value in the column by flipping its sign.

    Note that the sign is flipped, not forced: a value that is already negative
    becomes positive.

    Parameters
    ----------
    data : frame to corrupt; its column ``col`` is replaced in place.
    col : name of the numeric column to disrupt.
    frac : fraction of rows to disrupt (defaults to the original 0.1%).
    random_state : forwarded to ``DataFrame.sample``; pass a seed for a
        reproducible selection. ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    The same ``data`` object, for call chaining.

    Rows are matched by index label, so with a non-unique index every row
    sharing a label with a sampled row is disrupted as well.
    """
    sampled_index = data.sample(frac=frac, random_state=random_state).index
    selected = data.index.isin(sampled_index)
    # Replace only the selected rows. Blending with 0/1 weights
    # (x * 0 + 1 * x) would turn infinite values into NaN in untouched rows.
    data[col] = data[col].mask(selected, -data[col])
    return data


def distort_values(
    data: pd.DataFrame,
    col: str,
    frac: float = 0.0001,
    factor: float = 1000,
    random_state=None,
) -> pd.DataFrame:
    """
    Selects a few rows & disrupts the value in the column multiplying by a significant number.

    Parameters
    ----------
    data : frame to corrupt; its column ``col`` is replaced in place.
    col : name of the numeric column to disrupt.
    frac : fraction of rows to disrupt (defaults to the original 0.01%).
    factor : multiplier applied to the selected rows (defaults to 1000).
    random_state : forwarded to ``DataFrame.sample``; pass a seed for a
        reproducible selection. ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    The same ``data`` object, for call chaining.

    Rows are matched by index label, so with a non-unique index every row
    sharing a label with a sampled row is disrupted as well.
    """
    sampled_index = data.sample(frac=frac, random_state=random_state).index
    selected = data.index.isin(sampled_index)
    # Replace only the selected rows. Blending with 0/1 weights
    # (x * 0 + 1 * x) would turn infinite values into NaN in untouched rows.
    data[col] = data[col].mask(selected, data[col] * factor)
    return data


def add_new_value(
    data: pd.DataFrame,
    col: str,
    value: object,
    frac: float = 0.001,
    random_state=None,
) -> pd.DataFrame:
    """
    Selects a few rows & disrupts the value in the column changing to a new value.

    Parameters
    ----------
    data : frame to corrupt; modified in place.
    col : name of the column to disrupt.
    value : replacement written into the selected rows. Not restricted to
        strings: the notebook also passes an integer zone id.
    frac : fraction of rows to disrupt (defaults to the original 0.1%).
    random_state : forwarded to ``DataFrame.sample``; pass a seed for a
        reproducible selection. ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    The same ``data`` object, for call chaining.

    Rows are matched by index label, so with a non-unique index every row
    sharing a label with a sampled row is overwritten as well.
    """
    sampled_index = data.sample(frac=frac, random_state=random_state).index
    selected = data.index.isin(sampled_index)
    data.loc[selected, col] = value
    return data


def uglify_taxi_trips(data: pd.DataFrame) -> pd.DataFrame:
    """
    Run every corruption step over the trip data (for the data management & quality exercise).

    In order: flip the sign of a few fares and tips, inflate a few trip
    distances, then overwrite a few license numbers and pickup zone ids.
    Returns the frame handed back by the last step.
    """
    for money_col in ("base_passenger_fare", "tips"):
        data = add_negative_values(data, money_col)

    data = distort_values(data, "trip_miles")

    replacements = (("hvfhs_license_num", "HV000"), ("PULocationID", 264))
    for target_col, new_value in replacements:
        data = add_new_value(data, target_col, new_value)

    return data

In [8]:
# Keep a random 25% of the rows of each monthly trip file to lighten the memory footprint.
# NOTE: every file is overwritten in place, so re-running this cell shrinks it again.
for file_path in tqdm(
    ("data/taxi-trips-feb22.parquet", "data/taxi-trips-jan22.parquet")
):
    data = pd.read_parquet(file_path).sample(frac=0.25, random_state=42)
    data.to_parquet(file_path)

100%|██████████| 2/2 [00:21<00:00, 10.61s/it]


In [9]:
# Corrupt ("uglify") each monthly trip file and write the result back over the same file.
for file_path in tqdm(
    ("data/taxi-trips-feb22.parquet", "data/taxi-trips-jan22.parquet")
):
    data = uglify_taxi_trips(pd.read_parquet(file_path))
    data.to_parquet(file_path)

100%|██████████| 2/2 [00:10<00:00,  5.35s/it]
