This notebook contains:
    
1) commands limiting the data to trips costing up to $50, so as not to skew the analysis

2) creating the new variables, not saved in the previous notebook (grouping trips by length and fare amount)

3) converting date and time variables from 'object' to date and time values

4) exporting the updated dataframe to a CSV file


In [1]:
# Importing analytical and visualisation libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from datetime import datetime 

Creating a path

In [2]:
# Project root folder. The default is the author's local folder; set the environment variable
# UBER_PROJECT_PATH to run this notebook on another machine without editing the cell.
path = os.environ.get(
    'UBER_PROJECT_PATH',
    r'C:\Users\Paola\Desktop\Shaul\Data\CF\Data Immersion\Achievement 6\Uber October-2023 Basket Analysis',
)

In [3]:
# Displaying the path to confirm it was stored correctly
path

'C:\\Users\\Paola\\Desktop\\Shaul\\Data\\CF\\Data Immersion\\Achievement 6\\Uber October-2023 Basket Analysis'

In [4]:
# Loading the prepared dataset 'uber_checked2.csv' from the project's 'Prepared Data' folder.
# index_col = False stops pandas from using the first column as the index.
input_file = os.path.join(path, '2-Data', 'Prepared Data', 'uber_checked2.csv')
df = pd.read_csv(input_file, index_col = False)

In [5]:
# Checking the dataset: number of rows and columns right after loading
df.shape

(99990, 21)

In [6]:
# Listing the column names of the loaded dataset
df.columns

Index(['VendorID', 'pickup_date', 'pickup_time', 'dropff_date', 'dropoff_time',
       'passenger_count', 'trip_distance', 'pickup_longitude',
       'pickup_latitude', 'RatecodeID', 'store_and_fwd_flag',
       'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount',
       'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount'],
      dtype='object')

In [7]:
# There are 2921 trips that exceed $50 dollars. From those, 64 exceed $100 dollars.
# From the latter, 26 exceed $150 dollars. Lastly, from those, 10 exceed $200 dollars.
# Only four trips exceed $300 dollars. I am limiting the visual analysis to trips costing up to $50,
# to avoid skewing the data unnecessarily.
# NOTE: the cut-off is `< 51`, so fares between $50 and $51 are retained as well.
# .copy() makes the filtered frame independent of the original, so that adding new columns
# in the following cells does not raise SettingWithCopyWarning.
df = df.loc[df['fare_amount'] < 51].copy()

In [8]:
# Checking how many rows remain after the fare filter
df.shape

(97106, 21)

In [9]:
# NOTE: these derived variables were not saved in the previous notebook, so they are rebuilt here.
# 'Trip category' groups trips by distance; trips with trip_distance below 5 are 'Short trip'.
short_mask = df['trip_distance'].lt(5)
df.loc[short_mask, 'Trip category'] = 'Short trip'

In [10]:
# Trips with 5 <= trip_distance < 10 are labelled 'Normal trip'
normal_mask = df['trip_distance'].ge(5) & df['trip_distance'].lt(10)
df.loc[normal_mask, 'Trip category'] = 'Normal trip'

In [11]:
# Trips with trip_distance of 10 or more are labelled 'Longer trip'
longer_mask = df['trip_distance'].ge(10)
df.loc[longer_mask, 'Trip category'] = 'Longer trip'

In [12]:
# Checking value counts for the new 'Trip category' variable.
# dropna = False also counts missing values, so any row left without a category would show up here.
df['Trip category'].value_counts(dropna = False)

Short trip     84137
Normal trip     9372
Longer trip     3597
Name: Trip category, dtype: int64

In [13]:
# NOTE: this variable is rebuilt here for the same reason as 'Trip category' (it was not saved previously).
# 'Fare category' groups trips by fare amount; trips with fare_amount below 10 are 'Cheap trip'.
cheap_mask = df['fare_amount'].lt(10)
df.loc[cheap_mask, 'Fare category'] = 'Cheap trip'

In [14]:
# Trips with 10 <= fare_amount < 20 are labelled 'Midprice trip'
midprice_mask = df['fare_amount'].ge(10) & df['fare_amount'].lt(20)
df.loc[midprice_mask, 'Fare category'] = 'Midprice trip'

In [15]:
# Trips with fare_amount of 20 or more are labelled 'Expensive trip'
expensive_mask = df['fare_amount'].ge(20)
df.loc[expensive_mask, 'Fare category'] = 'Expensive trip'

In [16]:
# Checking value counts for the new 'Fare category' variable.
# dropna = False also counts missing values, so any row left without a category would show up here.
df['Fare category'].value_counts(dropna = False)

Cheap trip        50349
Midprice trip     33757
Expensive trip    13000
Name: Fare category, dtype: int64

Dealing with the date and time variables

In [35]:
# Summarising the raw 'pickup_date' column before converting it
df.pickup_date.describe()

count                97106
unique                   2
top       10/03/2016 00:00
freq                 74823
Name: pickup_date, dtype: object

In [40]:
# Fixing the misspelled column name: 'dropff_date' becomes 'dropoff_date'
column_fixes = {'dropff_date' : 'dropoff_date'}
df = df.rename(columns = column_fixes)

In [41]:
# Creating four new variables: 'pickup_time_new', 'dropoff_time_new', 'pickup_date_new', 'dropoff_date_new'.
# This cell was originally edited and re-run once per variable, so only the last version
# (dropoff_date_new) was kept. All four conversions are written out here so that the cells
# below still work after "Restart Kernel -> Run All".

# NOTE(review): the raw layout of 'pickup_time' / 'dropoff_time' is not shown in this notebook,
# so pandas is left to infer it - TODO confirm and pass an explicit `format`.
df['pickup_time_new'] = pd.to_datetime(df['pickup_time']).dt.time
df['dropoff_time_new'] = pd.to_datetime(df['dropoff_time']).dt.time

# The date columns hold text such as '10/03/2016 00:00' (day/month/year hour:minute);
# .dt.date keeps only the calendar date.
date_format = '%d/%m/%Y %H:%M'
df['pickup_date_new'] = pd.to_datetime(df['pickup_date'], format = date_format).dt.date
df['dropoff_date_new'] = pd.to_datetime(df['dropoff_date'], format = date_format).dt.date

count        97106
unique         916
top       08:29:00
freq           224
Name: dropoff_time_new, dtype: object

In [47]:
# Summarising the converted pickup dates.
# `datetime_is_numeric` was removed in pandas 2.0 (passing it raises TypeError there) and has
# no effect on this object-dtype column, so it is no longer passed.
df['pickup_date_new'].describe()

count          97106
unique             2
top       2016-03-10
freq           74823
Name: pickup_date_new, dtype: object

In [42]:
# Summarising the converted dropoff dates.
# `datetime_is_numeric` was removed in pandas 2.0 (passing it raises TypeError there) and has
# no effect on this object-dtype column, so it is no longer passed.
df['dropoff_date_new'].describe()

count          97106
unique             4
top       2016-03-10
freq           74664
Name: dropoff_date_new, dtype: object

In [48]:
# Summarising the converted pickup times.
# `datetime_is_numeric` was removed in pandas 2.0 (passing it raises TypeError there) and has
# no effect on this object-dtype column, so it is no longer passed.
df['pickup_time_new'].describe()

count        97106
unique         819
top       08:20:00
freq           238
Name: pickup_time_new, dtype: object

In [49]:
# Summarising the converted dropoff times.
# `datetime_is_numeric` was removed in pandas 2.0 (passing it raises TypeError there) and has
# no effect on this object-dtype column, so it is no longer passed.
df['dropoff_time_new'].describe()

count        97106
unique         916
top       08:29:00
freq           224
Name: dropoff_time_new, dtype: object

In [43]:
# Listing all columns to confirm the new category and date/time variables are present
df.columns

Index(['VendorID', 'pickup_date', 'pickup_time', 'dropoff_date',
       'dropoff_time', 'passenger_count', 'trip_distance', 'pickup_longitude',
       'pickup_latitude', 'RatecodeID', 'store_and_fwd_flag',
       'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount',
       'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount', 'Trip category',
       'Fare category', 'pickup_time_new', 'dropoff_time_new',
       'pickup_date_new', 'dropoff_date_new'],
      dtype='object')

In [50]:
# Exporting 'df' to csv in the 'Prepared Data' folder; index=False leaves the row index out of the file
output_file = os.path.join(path, '2-Data', 'Prepared Data', 'uber_limited_to_usd50_trips.csv')
df.to_csv(output_file, index=False)