In [1]:
#Dependencies and Setup
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress

#Relative path of the raw cab-rides CSV
cab_data_path = "Resources/Kaggle_Data/cab_rides.csv"

#Load the whole ride table into a DataFrame
cab_df = pd.read_csv(filepath_or_buffer=cab_data_path)

#Preview the raw frame (last expression of the cell, so it is rendered)
cab_df

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL
...,...,...,...,...,...,...,...,...,...,...
693066,1.00,Uber,1543708385534,North End,West End,13.0,1.0,616d3611-1820-450a-9845-a9ff304a4842,6f72dfc5-27f1-42e8-84db-ccc7a75f6969,UberXL
693067,1.00,Uber,1543708385534,North End,West End,9.5,1.0,633a3fc3-1f86-4b9e-9d48-2b7132112341,55c66225-fbe7-4fd5-9072-eab1ece5e23e,UberX
693068,1.00,Uber,1543708385534,North End,West End,,1.0,64d451d0-639f-47a4-9b7c-6fd92fbd264f,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,Taxi
693069,1.00,Uber,1543708385534,North End,West End,27.0,1.0,727e5f07-a96b-4ad1-a2c7-9abc3ad55b4e,6d318bcc-22a3-4af6-bddd-b409bfce1546,Black SUV


In [4]:
#Clean Data

#Convert the 13-digit epoch timestamp (milliseconds) to a date-time format.
#unit='ms' converts the integer column directly; dividing by 1000 and using
#unit='s' would route the values through floats and can add rounding error.
#NOTE(review): the resulting timestamps are timezone-naive and presumably UTC,
#while the source/destination names look like Boston locations -- confirm
#whether 'Day of Week' and 'Date' should be derived from local time instead.
cab_df['Time Stamp'] = pd.to_datetime(cab_df['time_stamp'], unit='ms')

#Day of the week as an integer (pandas convention: Monday=0 ... Sunday=6)
cab_df['Day of Week'] = cab_df['Time Stamp'].dt.dayofweek

#Print the unique dates on which data was collected
cab_df['Date'] = cab_df['Time Stamp'].dt.date
cab_df['Date'].unique()

array([datetime.date(2018, 12, 16), datetime.date(2018, 11, 27),
       datetime.date(2018, 11, 28), datetime.date(2018, 11, 30),
       datetime.date(2018, 11, 29), datetime.date(2018, 12, 17),
       datetime.date(2018, 11, 26), datetime.date(2018, 12, 2),
       datetime.date(2018, 12, 3), datetime.date(2018, 12, 13),
       datetime.date(2018, 12, 14), datetime.date(2018, 12, 1),
       datetime.date(2018, 12, 18), datetime.date(2018, 12, 15),
       datetime.date(2018, 12, 4), datetime.date(2018, 12, 10),
       datetime.date(2018, 12, 9)], dtype=object)

In [5]:
#Continue cleaning data

#Build the analysis table in one pipeline:
#  1. keep only the columns that are applicable to the analysis
#  2. rename the retained raw columns to presentation-style labels
#  3. drop every row that has at least one missing value
cleaned_cab_df = (
    cab_df[['distance', 'cab_type', 'destination', 'source', 'price', 'Day of Week']]
    .rename(columns={
        'distance': 'Distance',
        'cab_type': 'Company',
        'destination': 'Destination',
        'source': 'Source',
        'price': 'Price',
    })
    .dropna(how='any')
)

#Write the cleaned table to disk as CSV (header row kept, index omitted)
cleaned_cab_df.to_csv("Resources/cleaned_Kaggle_data.csv", index=False, header=True)

#Show the cleaned dataframe
cleaned_cab_df

Unnamed: 0,Distance,Company,Destination,Source,Price,Day of Week
0,0.44,Lyft,North Station,Haymarket Square,5.0,6
1,0.44,Lyft,North Station,Haymarket Square,11.0,1
2,0.44,Lyft,North Station,Haymarket Square,7.0,2
3,0.44,Lyft,North Station,Haymarket Square,26.0,4
4,0.44,Lyft,North Station,Haymarket Square,9.0,3
...,...,...,...,...,...,...
693065,1.00,Uber,North End,West End,9.5,5
693066,1.00,Uber,North End,West End,13.0,5
693067,1.00,Uber,North End,West End,9.5,5
693069,1.00,Uber,North End,West End,27.0,5
