In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Load the preprocessed training set from the working directory. Later cells read the
# TRAVEL_TIME, HOLIDAY, CALL_TYPE, ORIGIN_CALL, ORIGIN_STAND, TAXI_ID, QTRHR, WK and WKYR
# columns from this frame.
train_df = pd.read_csv('processed_train.csv')

In [None]:
# Summarise TRAVEL_TIME separately for each HOLIDAY code so the three groups can be compared.
# Floats are shown with one decimal place from here on (this is a global pandas display option).
pd.set_option('display.float_format', '{:.1f}'.format)

# HOLIDAY code -> label used in the printed heading
holiday_labels = {0: "non-holiday", 1: "day before holiday", 2: "holiday"}

for position, (holiday_code, label) in enumerate(holiday_labels.items()):
    # Blank separator line between consecutive summaries (not before the first one)
    if position > 0:
        print()
    print(f"TRAVEL_TIME distribution for {label}")
    in_group = train_df['HOLIDAY'] == holiday_code
    print(train_df.loc[in_group, 'TRAVEL_TIME'].describe())

In [None]:
# Mean and standard deviation of the target; together they define the outlier cut-off below
mean, std = train_df["TRAVEL_TIME"].mean(), train_df["TRAVEL_TIME"].std()

# First n samples to analyze. Set to -1 to use all data
end = -1

# Translate the -1 sentinel into an open-ended slice. Slicing with a literal [:-1]
# would silently drop the last row instead of keeping every row.
sample = slice(None) if end == -1 else slice(end)

outlier_threshold = 3

# Upper bound on TRAVEL_TIME: `outlier_threshold` standard deviations above the mean
cutoff = mean + outlier_threshold * std

# "Choose all data, where the trip length is less than 3 standard deviations above the mean"
# This is to remove outliers. Otherwise, our plots would look very squished (since there are some
# very long taxi trips in the dataset)
df_trimmed = train_df[train_df["TRAVEL_TIME"] < cutoff]

# Because our y-values only take on multiples of 15, we want just enough buckets in a histogram
# such that each bucket counts one value's frequency (e.g. one bucket counts how many 15s trips,
# another how many 30s trips, etc.)
buckets = int(cutoff) // 15

print(f"Using: {len(df_trimmed)}/{len(train_df)}")

# Now, we visualize some features that we think might be useful: one panel per feature
features = ["QTRHR", "WK", "WKYR"]
fig, axs = plt.subplots(nrows=1, ncols=len(features), figsize=(18,5))

for ax, v in zip(axs, features):

    # Remove any rows with invalid values. `subset` is passed as a list because
    # older pandas versions reject a bare column label here.
    df_subset = df_trimmed.dropna(subset=[v])

    # 2D histogram of feature value (x) against travel time (y): 120 bins on x,
    # one bin per 15 s step on y
    ax.hist2d(df_subset[v][sample], df_subset["TRAVEL_TIME"][sample], cmap="CMRmap", bins=(120, buckets))

    # Some stylistic things to make the graphs look nice: pad the x-range by one unit on
    # each side and paint the padding black so it blends with the colormap's low end
    x_low, x_high = ax.get_xlim()
    ax.set_xlim(x_low - 1, x_high + 1)
    ax.set_facecolor("black")
    ax.set_ylabel("seconds", fontsize=18)
    ax.set_title(f"Feature: {v}", fontsize=20)

# Uncomment to export the figure:
#plt.savefig('time_features.png', dpi=600, bbox_inches='tight')

In [None]:
plt.figure(figsize=(18,6))

# Histogram of every trip's TRAVEL_TIME, normalised so each bin holds the fraction of all trips.
# NOTE(review): this uses the untrimmed `train_df`, while `buckets` was derived from the
# outlier-trimmed cut-off, so the bins here are wider than 15 s. Confirm whether
# `df_trimmed` was intended.
histogram, bin_boundary = np.histogram(train_df['TRAVEL_TIME'], bins=buckets)
histogram = histogram / len(train_df['TRAVEL_TIME'])

# Midpoint of each bin, used as the x position of that bin's value
bin_centers = (bin_boundary[:-1] + bin_boundary[1:]) / 2
plt.plot(bin_centers, histogram)
plt.rc('font', size=20) 
plt.xlabel('Taxi Trip Time (s)')
# The plotted values are per-bin fractions of trips, not percentiles
plt.ylabel('Fraction of Trips')
plt.title('Distribution of Travel Times')
plt.savefig('distribution.png', dpi=600, bbox_inches='tight')

In [None]:
# Overlay the normalised TRAVEL_TIME histogram of each call type on one figure.
plt.figure(figsize=(12,6))

for call_type in ('A', 'B', 'C'):
    # Travel times of the outlier-trimmed trips with this CALL_TYPE
    travel_times = df_trimmed.loc[df_trimmed['CALL_TYPE'] == call_type, 'TRAVEL_TIME']

    # Each call type is binned over its own value range and normalised by its own trip count,
    # so every curve shows the fraction of that call type's trips per bin
    counts, edges = np.histogram(travel_times, bins=buckets)
    fractions = counts / len(travel_times)

    # Bin midpoints serve as x positions
    centers = (edges[:-1] + edges[1:]) / 2
    plt.plot(centers, fractions, label=call_type)

plt.rc('font', size=20) 
plt.xlabel('Taxi Trip Time (s)')
# The plotted values are per-bin fractions of trips, not percentiles
plt.ylabel('Fraction of Trips')
plt.title('Distribution of Travel Times By Call Type')
plt.legend()
#plt.savefig('distribution.png', dpi=600, bbox_inches='tight')

In [None]:
# Make NumPy summarise (elide the middle of) any printed array with more than 10 elements,
# which keeps the unique-value dumps in the following cells short.
np.set_printoptions(threshold=10)
# Last expression of the cell: displays the column index of the training frame
train_df.columns

In [None]:
# Distinct CALL_TYPE values, followed by how many there are
call_type_values = train_df['CALL_TYPE'].unique()
print(np.array(call_type_values))
print(len(call_type_values))

In [None]:
# Distinct ORIGIN_CALL values in ascending order, followed by how many there are
origin_call_values = sorted(train_df['ORIGIN_CALL'].unique())
print(np.array(origin_call_values))
print(len(origin_call_values))

In [None]:
# Distinct ORIGIN_STAND values in ascending order, followed by how many there are
origin_stand_values = sorted(train_df['ORIGIN_STAND'].unique())
print(np.array(origin_stand_values))
print(len(origin_stand_values))

In [None]:
# Distinct TAXI_ID values in ascending order, followed by how many there are
taxi_id_values = sorted(train_df['TAXI_ID'].unique())
print(np.array(taxi_id_values))
print(len(taxi_id_values))

In [None]:
# Distinct QTRHR values in ascending order, followed by how many there are
qtrhr_values = sorted(train_df['QTRHR'].unique())
print(np.array(qtrhr_values))
print(len(qtrhr_values))

In [None]:
# Distinct WK values in ascending order, followed by how many there are
wk_values = sorted(train_df['WK'].unique())
print(np.array(wk_values))
print(len(wk_values))

In [None]:
# Distinct WKYR values in ascending order, followed by how many there are
wkyr_values = sorted(train_df['WKYR'].unique())
print(np.array(wkyr_values))
print(len(wkyr_values))

In [None]:
# Distinct HOLIDAY values in ascending order, followed by how many there are
holiday_values = sorted(train_df['HOLIDAY'].unique())
print(np.array(holiday_values))
print(len(holiday_values))

In [None]:
# Taxi-stand lookup table (stand ID plus Latitude / Longitude). The 'Descricao' column
# is discarded right after loading because nothing below reads it.
metadata = (
    pd.read_csv("kaggle_data/metaData_taxistandsID_name_GPSlocation.csv")
    .drop(columns='Descricao')
)

In [None]:
# Trips whose ORIGIN_STAND is non-zero (presumably 0 marks "no stand" - confirm against the
# preprocessing step). Taking an explicit copy makes this an independent frame, so adding
# columns to it later does not raise SettingWithCopyWarning or depend on view/copy semantics.
trips_from_stand = train_df[train_df['ORIGIN_STAND'] != 0].copy()

In [None]:
# Import retained so any other cell that relies on `tqdm` being in scope keeps working;
# the vectorised lookup below no longer needs a progress bar.
from tqdm import tqdm

# Build the stand ID -> (Latitude, Longitude) lookup once, instead of scanning `metadata`
# with a boolean mask for every single trip.
stand_coords = metadata.set_index('ID')[['Latitude', 'Longitude']].astype(float)

# Stay as strict as a per-row `.item()` lookup: every referenced stand must resolve to
# exactly one metadata row, otherwise fail loudly rather than produce NaN coordinates.
if not stand_coords.index.is_unique:
    raise ValueError("metadata contains duplicate stand IDs")
unknown_stand = ~trips_from_stand['ORIGIN_STAND'].isin(stand_coords.index)
if unknown_stand.any():
    raise ValueError(
        f"{int(unknown_stand.sum())} trips reference an ORIGIN_STAND that is missing from metadata"
    )

# `.assign` returns a new frame, so this neither writes into a slice of `train_df`
# (no SettingWithCopyWarning) nor breaks when the cell is re-run.
trips_from_stand = trips_from_stand.assign(
    OS_LAT=trips_from_stand['ORIGIN_STAND'].map(stand_coords['Latitude']),
    OS_LONG=trips_from_stand['ORIGIN_STAND'].map(stand_coords['Longitude']),
)

In [None]:
def count_trips(x):
    """Return how many rows of `trips_from_stand` have ORIGIN_STAND equal to this row's ID."""
    starts_at_stand = trips_from_stand['ORIGIN_STAND'] == x['ID']
    return int(starts_at_stand.sum())

# One count per metadata row, stored alongside the stand it belongs to
stand_ids = metadata[["ID"]]
metadata["COUNT"] = stand_ids.apply(count_trips, axis=1, result_type="expand")

In [None]:
# Sanity check: the per-stand counts must add up to the number of stand-originated trips.
# This fails if some trip's ORIGIN_STAND has no row in `metadata` (sum too small) or if a
# stand ID appears more than once in `metadata` (sum too large).
assert len(trips_from_stand) == metadata["COUNT"].sum()

In [None]:
# Create a 2D histogram of trip starting points. np.histogram2d bins its first argument
# (latitude) along axis 0 and its second (longitude) along axis 1.
heatmap, xedges, yedges = np.histogram2d(trips_from_stand['OS_LAT'], trips_from_stand['OS_LONG'], bins=20)

# Normalise the counts to the fraction of stand-originated trips falling in each cell
heatmap = heatmap / len(trips_from_stand)

# Create a heatmap using Seaborn. Seaborn draws array rows down the y-axis and columns along
# the x-axis, so the matrix is transposed to put latitude on x and longitude on y, matching
# the tick labels and axis labels set below.
sns.heatmap(heatmap.T, cmap='Blues', cbar_kws={'label': 'Fraction of Taxi Trips'})

# Bin edges formatted to three decimals for use as tick labels
x_bin_edges = [f'{value:.3f}' for value in xedges]
y_bin_edges = [f'{value:.3f}' for value in yedges]

# Heatmap cell boundaries sit at integer positions 0..bins, one per bin edge
plt.xticks(range(len(x_bin_edges)), x_bin_edges, rotation=45)
plt.yticks(range(len(y_bin_edges)), y_bin_edges, rotation=0)


# Set the axis labels
plt.xlabel('Latitude')
plt.ylabel('Longitude')

plt.title('Distribution of Taxi Trip Starting Points')
plt.savefig('heatmap_starting.png', dpi=600, bbox_inches='tight')
# Display the plot
plt.show()

In [None]:
bins=20
# Calculate the 2D histogram sums and counts. np.histogram2d bins its first argument
# (latitude) along axis 0 and its second (longitude) along axis 1.
hist_sum, xedges, yedges = np.histogram2d(
    trips_from_stand['OS_LAT'],
    trips_from_stand['OS_LONG'],
    bins=bins,
    weights=trips_from_stand['TRAVEL_TIME']
)
hist_count, _, _ = np.histogram2d(trips_from_stand['OS_LAT'], trips_from_stand['OS_LONG'], bins=bins)

# Bin edges formatted to three decimals for use as tick labels
x_bin_edges = [f'{value:.3f}' for value in xedges]
y_bin_edges = [f'{value:.3f}' for value in yedges]

# Calculate the average for each bin. Cells with no trips are pre-filled with NaN: without an
# explicit `out`, np.divide leaves the entries skipped by `where` uninitialised, which would
# show up as arbitrary values in the plot. Seaborn masks NaN cells automatically.
hist_average = np.divide(
    hist_sum,
    hist_count,
    out=np.full(hist_sum.shape, np.nan),
    where=hist_count != 0,
)

# Create a heatmap using Seaborn. Seaborn draws array rows down the y-axis and columns along
# the x-axis, so the matrix is transposed to put latitude on x and longitude on y, matching
# the tick labels and axis labels set below.
sns.heatmap(hist_average.T, cmap='Purples', cbar_kws={'label': 'Taxi Travel Time (s)'})

# Set the x-axis tick labels (one per bin edge, at the cell boundaries)
plt.xticks(range(bins + 1), x_bin_edges, rotation=45)

# Set the y-axis tick labels
plt.yticks(range(bins + 1), y_bin_edges, rotation=0)

# Set the axis labels
plt.xlabel('Latitude')
plt.ylabel('Longitude')

# Set the title
plt.title('Average Taxi Travel Times')
plt.savefig('heatmap_average.png', dpi=600, bbox_inches='tight')
# Display the plot
plt.show()