In [None]:
import pandas as pd
import terality as te
# from codetiming import Timer
import random
from datetime import datetime


In [None]:
# Directory prefix for the Chicago taxi trips dataset; the CSV file name is appended to it when loading.
# NOTE(review): absolute, machine-specific Windows path - adjust before running on another machine.
file_path = "C:\\\\Users\\\\bhask\\\\Google Drive\\\\datasets\\\\Chicago_taxi_trips\\\\"
# timer = Timer(name="Terality Tests")

In [None]:
# Disabling Terality cache for testing
# cache_disabler = te.disable_cache()
# cache_disabler.__enter__()

In [None]:
def get_slice_range(limit, max_span=1000):
    """Pick a random ``(start, end)`` pair for slicing a frame with ``limit`` rows.

    ``start`` is drawn uniformly from ``[1, limit]``. ``end`` is drawn uniformly
    from ``[start, start + max_span]`` but is clamped to ``limit`` so the
    resulting slice never extends past the last row.

    Args:
        limit: Number of rows available; must be at least 1.
        max_span: Largest allowed distance between ``start`` and ``end``.

    Returns:
        A ``(start, end)`` tuple with ``1 <= start <= end <= limit``.

    Raises:
        ValueError: If ``limit`` is smaller than 1.
    """
    if limit < 1:
        raise ValueError("limit must be at least 1, got %r" % (limit,))
    start = random.randint(1, limit)
    # Clamp the upper bound: without it, end could land up to max_span rows
    # beyond the end of the data and yield a shorter slice than reported.
    end = random.randint(start, min(start + max_span, limit))
    return start, end

In [1]:
def get_time():
    """Return the current date and time as reported by ``datetime.now()``."""
    current_moment = datetime.now()
    return current_moment

In [None]:
def get_time_diff(start_time, end_time):
    """Return the seconds elapsed from ``start_time`` to ``end_time``.

    Both arguments must support subtraction yielding an object with a
    ``total_seconds()`` method (e.g. two ``datetime`` values). The result is
    negative when ``end_time`` precedes ``start_time``.
    """
    elapsed = end_time - start_time
    return elapsed.total_seconds()

In [None]:
# Replacement column names (lowercase, spaces turned into '_').
# They are assigned positionally to the dataframes' `.columns`, so the order
# here must match the column order of the loaded CSV.
# NOTE(review): the last entry has a double underscore - presumably it mirrors
# a double space in the original CSV header; confirm before "fixing" it.
columns = [
    "trip_id",
    "taxi_id",
    "trip_start_timestamp",
    "trip_end_timestamp",
    "trip_seconds",
    "trip_miles",
    "pickup_census_tract",
    "dropoff_census_tract",
    "pickup_community_area",
    "dropoff_community_area",
    "fare",
    "tips",
    "tolls",
    "extras",
    "trip_total",
    "payment_type",
    "company",
    "pickup_centroid_latitude",
    "pickup_centroid_longitude",
    "pickup_centroid_location",
    "dropoff_centroid_latitude",
    "dropoff_centroid_longitude",
    "dropoff_centroid__location",
]

In [None]:
# Loading data into dataframes using Pandas
# Only the read_csv call sits between the two timestamps; elapsed seconds are printed.
start_time = get_time()
taxi_trips_pdf = pd.read_csv(file_path + "Taxi_Trips_2019_2020.csv")
print(get_time_diff(start_time, get_time()))

In [None]:
# Loading data into dataframes using Terality
# Same file and same timing pattern as the Pandas load, for a like-for-like comparison.
start_time = get_time()
taxi_trips_tdf = te.read_csv(file_path + "Taxi_Trips_2019_2020.csv")
print(get_time_diff(start_time, get_time()))

In [None]:
# Time head() on the Pandas dataframe.
# head() is not the last expression of the cell, so its result is not displayed - it is only timed.
start_time = get_time()
taxi_trips_pdf.head()
print(get_time_diff(start_time, get_time()))

In [None]:
# Time head() on the Terality dataframe.
# head() is not the last expression of the cell, so its result is not displayed - it is only timed.
start_time = get_time()
taxi_trips_tdf.head()
print(get_time_diff(start_time, get_time()))

In [None]:
# Rename the columns of the Pandas dataframe
# Positional assignment: the i-th existing column receives the i-th name from `columns`.
start_time = get_time()
taxi_trips_pdf.columns = columns
print(get_time_diff(start_time, get_time()))

In [None]:
# Rename the columns of the Terality dataframe
# Positional assignment: the i-th existing column receives the i-th name from `columns`.
start_time = get_time()
taxi_trips_tdf.columns = columns
print(get_time_diff(start_time, get_time()))

In [None]:
# Row count before cleanup - using Pandas
# shape[0] is the number of rows; the lookup itself is what gets timed.
start_time = get_time()
print("Pandas DF Row Count before: % d" % taxi_trips_pdf.shape[0])
print(get_time_diff(start_time, get_time()))

In [None]:
# Remove all the rows with NaN values using Pandas. There is plenty of data even otherwise
# how='any' drops a row if any of its columns is NaN.
# inplace=True mutates taxi_trips_pdf directly: the dropped rows can only be recovered by reloading the CSV.
start_time = get_time()
taxi_trips_pdf.dropna(how='any',axis=0,inplace=True)
print(get_time_diff(start_time, get_time()))

In [None]:
# Row count after cleanup - using Pandas
# The count is kept in row_count; the slice-range cell later uses it as the upper limit.
start_time = get_time()
row_count = taxi_trips_pdf.shape[0]
print("Pandas DF Row Count after: % d" % row_count)
print(get_time_diff(start_time, get_time()))

In [None]:
# Row count before cleanup - using Terality
# This cell runs before the Terality dropna, so the label must say "before"
# (it previously printed "after", duplicating the post-cleanup cell's label).
start_time = get_time()
print("Terality DF Row Count before: % d" % taxi_trips_tdf.shape[0])
print(get_time_diff(start_time, get_time()))

In [None]:
# Remove all the rows with NaN values using Terality. There is plenty of data even otherwise
# Same arguments as the Pandas dropna cell; inplace=True mutates taxi_trips_tdf directly.
start_time = get_time()
taxi_trips_tdf.dropna(how='any',axis=0,inplace=True)
print(get_time_diff(start_time, get_time()))

In [None]:
# Row count after cleanup - using Terality
# shape[0] is the number of rows remaining after the dropna cell above.
start_time = get_time()
print("Terality DF Row Count after: % d" % taxi_trips_tdf.shape[0])
print(get_time_diff(start_time, get_time()))

In [None]:
# Adding a new column to store datetime version of string datetimes - Pandas
# Four columns are added in place: parsed start/end datetimes (trip_start_dt, trip_end_dt)
# and their 'M' periods (trip_start_ym, trip_end_ym), which Query 2 groups by.
# All four assignments are timed together.
start_time = get_time()
taxi_trips_pdf['trip_start_dt'] = pd.to_datetime(taxi_trips_pdf['trip_start_timestamp'], infer_datetime_format=True)
taxi_trips_pdf['trip_end_dt'] = pd.to_datetime(taxi_trips_pdf['trip_end_timestamp'], infer_datetime_format=True)
taxi_trips_pdf['trip_start_ym'] = taxi_trips_pdf['trip_start_dt'].dt.to_period('M')
taxi_trips_pdf['trip_end_ym'] = taxi_trips_pdf['trip_end_dt'].dt.to_period('M')
print(get_time_diff(start_time, get_time()))

In [None]:
# Preview the first rows of the Pandas dataframe, now including the derived datetime/period columns.
taxi_trips_pdf.head()

In [None]:
# Adding a new column to store datetime version of string datetimes - Terality
# Mirrors the Pandas cell: parsed start/end datetimes plus their 'M' periods,
# all four assignments timed together.
start_time = get_time()
taxi_trips_tdf['trip_start_dt'] = te.to_datetime(taxi_trips_tdf['trip_start_timestamp'], infer_datetime_format=True)
taxi_trips_tdf['trip_end_dt'] = te.to_datetime(taxi_trips_tdf['trip_end_timestamp'], infer_datetime_format=True)
taxi_trips_tdf['trip_start_ym'] = taxi_trips_tdf['trip_start_dt'].dt.to_period('M')
taxi_trips_tdf['trip_end_ym'] = taxi_trips_tdf['trip_end_dt'].dt.to_period('M')
print(get_time_diff(start_time, get_time()))

In [None]:
# Preview the first rows of the Terality dataframe, now including the derived datetime/period columns.
taxi_trips_tdf.head()
# print(get_time_diff(start_time, get_time()))


In [None]:
# Describe - Pandas
# describe() is the last expression so its result is displayed; the elapsed time is printed by the next cell.
start_time = get_time()
taxi_trips_pdf.describe()

In [None]:
# Seconds since start_time was set in the previous cell. The timer spans two cells,
# so the figure also includes any delay between running them.
print(get_time_diff(start_time, get_time()))

In [None]:
# Describe - Terality
# describe() is the last expression so its result is displayed; the elapsed time is printed by the next cell.
start_time = get_time()
taxi_trips_tdf.describe()

In [None]:
# Seconds since start_time was set in the previous cell. The timer spans two cells,
# so the figure also includes any delay between running them.
print(get_time_diff(start_time, get_time()))

In [None]:
# Query 1: Number of trips grouped by taxi - Using Pandas
# count() is computed for every column per taxi_id; only the trip_id counts are kept and displayed.
start_time = get_time()
taxi_trips_pdf.groupby('taxi_id').count()['trip_id']


In [None]:
# Seconds since start_time was set in the previous cell. The timer spans two cells,
# so the figure also includes any delay between running them.
print(get_time_diff(start_time, get_time()))

In [None]:
# Query 1: Number of trips grouped by taxi - Using Terality
# Runs on the Terality dataframe (taxi_trips_tdf). The cell previously queried
# taxi_trips_pdf, which timed Pandas a second time instead of Terality.
# count() is computed for every column per taxi_id; only the trip_id counts are kept and displayed.
start_time = get_time()
taxi_trips_tdf.groupby('taxi_id').count()['trip_id']



In [None]:
# Seconds since start_time was set in the previous cell. The timer spans two cells,
# so the figure also includes any delay between running them.
print(get_time_diff(start_time, get_time()))

In [None]:
# Query 2: Number of trips grouped by taxi and year/month - Using Pandas
# Groups on (taxi_id, trip_start_ym); trip_start_ym is the period column derived from trip_start_dt.
start_time = get_time()
taxi_trips_pdf.groupby(['taxi_id', 'trip_start_ym']).count()['trip_id']

In [None]:
# Seconds since start_time was set in the previous cell. The timer spans two cells,
# so the figure also includes any delay between running them.
print(get_time_diff(start_time, get_time()))

In [None]:
# Query 2: Number of trips grouped by taxi and year/month - Using Terality
# Groups on (taxi_id, trip_start_ym); trip_start_ym is the period column derived from trip_start_dt.
start_time = get_time()
taxi_trips_tdf.groupby(['taxi_id', 'trip_start_ym']).count()['trip_id']

In [None]:
# Seconds since start_time was set in the previous cell. The timer spans two cells,
# so the figure also includes any delay between running them.
print(get_time_diff(start_time, get_time()))

In [None]:
# Query 3: Number of trips by Taxi Company - using Pandas
# Counts trip_id per company, sorts descending and shows the 10 companies with the most trips.
start_time = get_time()
taxi_trips_pdf.groupby('company').count()['trip_id'].sort_values(ascending=False).head(10)


In [None]:
# Seconds since start_time was set in the previous cell. The timer spans two cells,
# so the figure also includes any delay between running them.
print(get_time_diff(start_time, get_time()))

In [None]:
# Query 3: Number of trips by Taxi Company - using Terality
# Counts trip_id per company, sorts descending and shows the 10 companies with the most trips.
start_time = get_time()
taxi_trips_tdf.groupby('company').count()['trip_id'].sort_values(ascending=False).head(10)


In [None]:
# Seconds since start_time was set in the previous cell. The timer spans two cells,
# so the figure also includes any delay between running them.
print(get_time_diff(start_time, get_time()))

In [None]:
# Query 4: Sort by Taxi Company - using Pandas
# The sorted result is neither assigned nor displayed; the call is only timed.
start_time = get_time()
taxi_trips_pdf.sort_values(by="company")
print(get_time_diff(start_time, get_time()))

In [None]:
# Query 4: Sort by Taxi Company - using Terality
# The sorted result is neither assigned nor displayed; the call is only timed.
start_time = get_time()
taxi_trips_tdf.sort_values(by="company")
print(get_time_diff(start_time, get_time()))

In [None]:
# Pick one random (start, end) pair; both Query 5 cells reuse it so Pandas and Terality slice the same positions.
# row_count is the Pandas row count taken after dropna.
# NOTE(review): the name `range` shadows the Python builtin for the rest of the kernel session;
# consider renaming it together with the Query 5 cells that index into it.
range = get_slice_range(row_count)
print("Start: %d, End: %d" % range)


In [None]:
# Query 5: Slicing using random numbers for start and end - using Pandas
# Uses the (start, end) tuple stored in `range` by the preceding cell.
start_time = get_time()
taxi_trips_pdf[range[0]:range[1]]

In [None]:
# Seconds since start_time was set in the previous cell. The timer spans two cells,
# so the figure also includes any delay between running them.
print(get_time_diff(start_time, get_time()))

In [None]:
# Query 5: Slicing using random numbers for start and end - using Terality
# Uses the same (start, end) tuple stored in `range` as the Pandas slice.
start_time = get_time()
taxi_trips_tdf[range[0]:range[1]]

In [None]:
# Seconds since start_time was set in the previous cell. The timer spans two cells,
# so the figure also includes any delay between running them.
print(get_time_diff(start_time, get_time()))

In [None]:
# Query 6: Selecting a single row from an unindexed dataframe - using Pandas
# Boolean-mask filter: keeps the rows whose trip_id equals the hard-coded id string.
start_time = get_time()
taxi_trips_pdf[taxi_trips_pdf['trip_id'] == 'd3e437e3af9c691e6a9fe1f9802605d989605fdb']

In [None]:
# Seconds since start_time was set in the previous cell. The timer spans two cells,
# so the figure also includes any delay between running them.
print(get_time_diff(start_time, get_time()))

In [None]:
# Query 6: Selecting a single row from an unindexed dataframe - using Terality
# Boolean-mask filter: keeps the rows whose trip_id equals the hard-coded id string.
start_time = get_time()
taxi_trips_tdf[taxi_trips_tdf['trip_id'] == 'd3e437e3af9c691e6a9fe1f9802605d989605fdb']

In [None]:
# Seconds since start_time was set in the previous cell. The timer spans two cells,
# so the figure also includes any delay between running them.
print(get_time_diff(start_time, get_time()))

In [None]:
# Query 7: Range queries using isin() - Using Pandas
# Keeps the rows whose company is one of the five listed names.
start_time = get_time()
taxi_trips_pdf[taxi_trips_pdf['company'].isin(['Taxi Affiliation Services', 'Blue Diamond', 'Chicago Taxicab', 'Nova Taxi Affiliation Llc', 'U Taxicab'])]

In [None]:
# Seconds since start_time was set in the previous cell. The timer spans two cells,
# so the figure also includes any delay between running them.
print(get_time_diff(start_time, get_time()))

In [None]:
# Query 7: Range queries using isin() - Using Terality
# Keeps the rows whose company is one of the five listed names.
start_time = get_time()
taxi_trips_tdf[taxi_trips_tdf['company'].isin(['Taxi Affiliation Services', 'Blue Diamond', 'Chicago Taxicab', 'Nova Taxi Affiliation Llc', 'U Taxicab'])]

In [None]:
# Seconds since start_time was set in the previous cell. The timer spans two cells,
# so the figure also includes any delay between running them.
print(get_time_diff(start_time, get_time()))