In [None]:
from matplotlib import pyplot as plt
import sweetviz as sv
import seaborn as sns
import pandas as pd
import numpy as np

from helper_functions import plot_density_graph

# Import files

In [None]:
# Source CSVs, given relative to the notebook's working directory
trips_csv = "data//original_cleaned_nyc_taxi_data_2018.csv"
zones_csv = "data//taxi_zone_geo.csv"

# df: trip records; df_geo: taxi-zone lookup (zone_id -> borough)
df = pd.read_csv(trips_csv)
df_geo = pd.read_csv(zones_csv)

# Data cleaning

In [None]:
# Attach the pickup and dropoff boroughs by looking up each location id in the
# taxi-zone table. Both lookups are identical apart from the column names.
# NOTE(review): assumes zone_id is unique in df_geo; a repeated zone_id would
# duplicate trips in the left join. Confirm.
zone_lookup = df_geo[['zone_id', 'borough']]

for location_col, borough_col in (
    ('pickup_location_id', 'pickup_borough'),
    ('dropoff_location_id', 'dropoff_borough'),
):
    df = (
        df.merge(zone_lookup, how='left', left_on=location_col, right_on='zone_id')
        .rename(columns={'borough': borough_col})
        .drop(columns=['zone_id'])
    )

# Keep only trips where both ends resolved to a borough
has_both_boroughs = df['pickup_borough'].notna() & df['dropoff_borough'].notna()
df = df[has_both_boroughs]

In [None]:
# Keep only trips whose recorded total differs from the recomputed total.
# NOTE(review): presumably this selects trips where a tip was recorded; confirm
# what 'calculated_total_amount' includes.
df = df[df['total_amount'] != df['calculated_total_amount']]

# Create a field for the tip%: tip as a fraction of the fare (0.15 == 15%).
# A zero fare produces inf/NaN here; such rows match none of the bands below.
df['tip_percent'] = df["tip_amount"] / df["fare_amount"]

# Create the target field: bucket tip_percent into bands that are closed on the
# left and open on the right. The class label is the band's position in the list.
tip_bands = [
    (0, 0.1),
    (0.1, 0.2),
    (0.2, 0.3),
    (0.3, 0.4),
    (0.4, 100_000),
]
# Rows matching no band (negative, NaN, infinite or >= 100_000) keep NaN as class.
df['tip_percent_class'] = np.select(
    [df['tip_percent'].between(low, high, inclusive='left') for low, high in tip_bands],
    list(range(len(tip_bands))),
    default=np.nan,
)

# Temporary dataframe: snapshot taken before any columns are dropped.
# The call parentheses matter: `df.copy` alone binds the method, not a copy.
df_pct = df.copy()

# Dropping columns
# 'tip_percent' is intentionally kept: the density graph of tip_percent reads it
# from df_short.
# NOTE(review): it is derived from the same data as the target, so exclude it
# before any modelling.
df = df.drop(columns=[
    'Unnamed: 0', 'year', 'calculated_total_amount',
    'imp_surcharge', 'payment_type', 'store_and_fwd_flag',
    'tip_amount'
])

# Creating a dataframe with only 500K to save time for some validations
# (first rows in file order, not a random sample)
df_short = df.head(500_000)

In [None]:
# Density graph for 'tip_percent'
# Plotted from df_short (the first 500K rows) rather than the full frame.
# NOTE(review): quantile_upper=0.95 presumably trims the plot at the 95th
# percentile; confirm against helper_functions.plot_density_graph.
plot_density_graph(df_short, 'tip_percent', quantile_upper=0.95)

In [None]:
# Validating nan: count the missing cells per column, then total them.
# The final expression is the cell's displayed output.
nan_per_column = df.isna().sum()
nan_per_column.sum()

# EDA

In [None]:
# Show correlation matrix
# Correlate only numeric/boolean columns. The frame also carries non-numeric
# columns (e.g. the merged borough names, presumably text), and DataFrame.corr()
# raises on those in pandas >= 2.0, where numeric_only defaults to False.
# select_dtypes is used instead of corr(numeric_only=True) so that this also
# runs on pandas versions that predate that keyword.
corr = df.select_dtypes(include=['number', 'bool']).corr()

fig, ax = plt.subplots(figsize=(10, 10))

# Draw on the axes created above instead of relying on the implicit current axes
ax = sns.heatmap(
    corr.round(2),
    cmap='YlGnBu',
    annot=True,
    linewidths=0.5,
    ax=ax
)

In [None]:
# Show density graph (KDE) for numeric fields
# NOTE(review): pickup_location_id / dropoff_location_id are zone identifiers
# (they are joined against zone_id during cleaning), so a density curve over
# them may not be meaningful. Confirm they belong in this list.
var_list = [
    'trip_distance', 'fare_amount', 'tolls_amount',
    'total_amount', 'pickup_location_id', 'dropoff_location_id',
    'trip_duration'
]

# One plot per field, drawn from the 500K-row subset.
# NOTE(review): the 3rd positional argument is presumably the grouping column
# and the 4th the upper quantile (passed as quantile_upper= in the tip_percent
# plot); confirm against helper_functions.plot_density_graph.
for var in var_list:
    plot_density_graph(df_short, var, 'tip_percent_class', 0.95)

# SweetViz report

In [None]:
# Ask SweetViz to treat the target as a numeric feature.
feature_config = sv.FeatureConfig(force_num = ['tip_percent_class'])
# Profile the 500K-row subset against the tip_percent_class target.
# NOTE(review): tip_percent_class can be NaN for rows outside every tip band;
# verify that sweetviz accepts missing values in the target feature.
sv_report = sv.analyze(df_short, target_feat='tip_percent_class', feat_cfg=feature_config)
# Render the report to HTML.
sv_report.show_html()