In [2]:
import numpy as np
import pandas as pd
import sqlite3

In [3]:

# import the  data

app_downloads = pd.read_csv('metrocar_data/app_downloads.csv')
reviews = pd.read_csv('metrocar_data/reviews.csv')
ride_requests = pd.read_csv('metrocar_data/ride_requests.csv')
signups = pd.read_csv('metrocar_data/signups.csv')
transactions = pd.read_csv('metrocar_data/transactions.csv')


  ride_requests = pd.read_csv('metrocar_data/ride_requests.csv')


In [4]:
# create the SQLite connection

cnn = sqlite3.connect('cars.db')

In [5]:
# Save the data to SQLite

app_downloads.to_sql('app_downloads', cnn, if_exists='replace')
reviews.to_sql('reviews', cnn, if_exists='replace')
ride_requests.to_sql('ride_requests', cnn, if_exists='replace')
signups.to_sql('signups', cnn, if_exists='replace')
transactions.to_sql('transactions', cnn, if_exists='replace')

223652

In [6]:
# Load the ipython-sql extension
%load_ext sql

# Connect to the SQLite database
%sql sqlite:///cars.db

'Connected: @cars.db'

In [7]:
%%sql

SELECT COUNT(*)
FROM app_downloads;

 * sqlite:///cars.db
Done.


COUNT(*)
23608


### Dataset structure

Description of each table and its columns.

#### app_downloads: contains information about app downloads
* app_download_key: unique id of an app download
* platform: ios, android or web
* download_ts: download timestamp


#### signups: contains information about new user signups

* user_id: primary id for a user
* session_id: id of app download
* signup_ts: signup timestamp


#### ride_requests: contains information about rides


* ride_id: primary id for a ride
* user_id: foreign key to user (requester)
* driver_id: foreign key to driver
* request_ts: ride request timestamp
* accept_ts: driver accept timestamp
* pickup_location: pickup coordinates
* destination_location: destination coordinates
* pickup_ts: pickup timestamp
* dropoff_ts: dropoff timestamp
* cancel_ts: ride cancel timestamp (accept, pickup and dropoff timestamps may be null)


#### transactions: contains information about financial transactions based on completed rides:


* ride_id: foreign key to ride
* purchase_amount_usd: purchase amount in USD
* charge_status: approved, cancelled
* transaction_ts: transaction timestamp


#### reviews: contains information about driver reviews once rides are completed


* review_id: primary id of review
* ride_id: foreign key to ride
* driver_id: foreign key to driver
* user_id: foreign key to user (requester)
* rating: rating from 0 to 5
* free_response: text response given by user/requester

How many times was the app downloaded?


In [8]:
%%sql

SELECT COUNT(*) Number_of_downloads
FROM app_downloads;

 * sqlite:///cars.db
Done.


Number_of_downloads
23608


How many users signed up on the app?

In [9]:
%%sql

SELECT COUNT (DISTINCT user_id) AS total_user_signups
FROM signups

 * sqlite:///cars.db
Done.


total_user_signups
17623


How many rides were requested through the app?


In [10]:
%%sql

SELECT COUNT(request_ts) AS total_requests
FROM ride_requests

 * sqlite:///cars.db
Done.


total_requests
385477


How many rides were requested and completed through the app?


In [11]:
%%sql

SELECT COUNT(dropoff_ts) AS total_completed_requests
FROM ride_requests

 * sqlite:///cars.db
Done.


total_completed_requests
223652


How many rides were requested and how many unique users requested a ride?


In [12]:
%%sql

SELECT COUNT(ride_id) total_ride_requests, COUNT(DISTINCT user_id) total_users
FROM ride_requests

 * sqlite:///cars.db
Done.


total_ride_requests,total_users
385477,12406


What is the average time of a ride from pick up to drop off?


In [13]:
ride_requests.head()

Unnamed: 0,ride_id,user_id,driver_id,request_ts,accept_ts,pickup_location,dropoff_location,pickup_ts,dropoff_ts,cancel_ts
0,3080556,108995,116266.0,2021-07-26 09:01:00,2021-07-26 09:19:00,40.72216744 -73.96212375,40.72262277 -73.87869592,,,2021-07-26 09:25:00
1,3081967,110902,106286.0,2021-08-23 16:42:00,2021-08-23 16:59:00,40.74316496 -73.90019974,40.79460742 -73.81512246,,,2021-08-23 17:09:00
2,3088174,114998,116029.0,2021-11-13 17:06:00,2021-11-13 17:23:00,40.76639545 -73.877075,40.75548354 -73.9505886,,,2021-11-13 17:33:00
3,3180652,112421,109517.0,2021-10-16 15:28:00,2021-10-16 15:47:00,40.76499488 -73.8481648,40.86129334 -73.89895067,,,2021-10-16 15:52:00
4,3191244,104964,106628.0,2021-05-07 19:31:00,2021-05-07 19:51:00,40.73396733 -73.79521329,40.88348849 -74.03331643,,,2021-05-07 19:58:00


In [14]:
%%sql

SELECT ROUND(AVG((strftime('%s', dropoff_ts) - strftime('%s', pickup_ts)) / 60),2) AS average_duration_mins
FROM ride_requests

 * sqlite:///cars.db
Done.


average_duration_mins
52.61


How many rides were accepted by a driver?


In [15]:
%%sql

SELECT COUNT(accept_ts) AS total_accepted_requests
FROM ride_requests

 * sqlite:///cars.db
Done.


total_accepted_requests
248379


How many rides did we successfully collect payments and how much was collected?


In [28]:
%%sql

SELECT COUNT(transaction_ts) AS total_transations, ROUND(SUM(purchase_amount_usd), 2) AS total_usd_collected
FROM transactions
WHERE charge_status LIKE 'approved'

 * sqlite:///cars.db
Done.


total_transations,total_usd_collected
212628,4251667.61


How many ride requests happened on each platform?


In [17]:
%%sql

SELECT platform, COUNT(request_ts) total_requests
FROM app_downloads a
LEFT JOIN signups s
ON a.app_download_key = s.session_id
LEFT JOIN ride_requests r
ON r.user_id = s.user_id
GROUP BY 1

 * sqlite:///cars.db
Done.


platform,total_requests
android,112317
ios,234693
web,38467


What is the drop-off from users signing up to users requesting a ride?

Reference: https://popsql.com/sql-templates/marketing/running-a-funnel-analysis#calculating-drop-off-at-each-step

In [35]:
%%sql

WITH steps AS (
    SELECT 'Sign Up' as step, COUNT(DISTINCT user_id) as count FROM signups
    UNION
    SELECT 'Ride Request' as step, COUNT(request_ts) as count FROM ride_requests
)


SELECT step, 
        count, 
        lag(count, 1) OVER() as previous_count,
        ROUND((1.0 - count * 1.0 / lag(count) OVER (ORDER BY step)), 2) AS drop_off


from steps;

 * sqlite:///cars.db
Done.


step,count,previous_count,drop_off
Ride Request,385477,,
Sign Up,17623,385477.0,0.95


In [40]:
app_downloads['download_ts'].min(), app_downloads['download_ts'].max()

('2021-01-01 00:05:59', '2021-12-31 23:52:27')

In [36]:
%%sql

-- VISITORS (DEFINES THE GROUP WE FOLLOW THROUGH THE FUNNEL)
with visitors as (
  select
    distinct_id, -- effectively a user_id
    min(time) as min_time -- gets the earliest Visit for each person
  from events
  where name = 'View Landing Page'
  group by 1
  having min(time) between '2020-04-01' and '2020-05-31' -- selects people whose first visit is in this time range
),

-- SIGN-UPS (FROM THE VISITORS ABOVE)
sign_ups as (
  select
    distinct e.distinct_id
  from visitors v -- ensures we only look at the Visitors defined above
  inner join events e on e.distinct_id = v.distinct_id
  where e.name = 'Sign Up' -- an internal event that defines sign-up
),

-- ACTIVATIONS (FROM THE SIGN-UPS ABOVE)
activations as (
  select
    distinct e.distinct_id
  from sign_ups s  -- ensures we only look at the Signups defined above
  inner join events e on e.distinct_id = s.distinct_id
  where e.name = 'New Canvas'
),

-- PURCHASES (FROM THE ACTIVATIONS ABOVE)
purchases as (
  select
    distinct e.distinct_id
  from activations a  -- ensures we only look at the Activations defined above
  inner join events e on e.distinct_id = a.distinct_id
  where e.name = 'Start Subscription'   
)


-- add a CTE called steps so we can do one further analysis
steps as (
  select 'Visit' as step, COUNT(*) from visitors
    union
   select 'Sign Up' as step, COUNT(*) from sign_ups
    union
  select 'Activate' as step, COUNT(*) from activations
    union
  select 'Purchase' as step, COUNT(*) from purchases
  order by count desc
)
select
  step,
  count,
  lag(count, 1) over (),
    round((1.0 - count::numeric/lag(count, 1) over ()),2) as drop_off

from steps;


 * sqlite:///cars.db
(sqlite3.OperationalError) no such table: events
[SQL: -- VISITORS (DEFINES THE GROUP WE FOLLOW THROUGH THE FUNNEL)
with visitors as (
  select
    distinct_id, -- effectively a user_id
    min(time) as min_time -- gets the earliest Visit for each person
  from events
  where name = 'View Landing Page'
  group by 1
  having min(time) between '2020-04-01' and '2020-05-31' -- selects people whose first visit is in this time range
),

-- SIGN-UPS (FROM THE VISITORS ABOVE)
sign_ups as (
  select
    distinct e.distinct_id
  from visitors v -- ensures we only look at the Visitors defined above
  inner join events e on e.distinct_id = v.distinct_id
  where e.name = 'Sign Up' -- an internal event that defines sign-up
),

-- ACTIVATIONS (FROM THE SIGN-UPS ABOVE)
activations as (
  select
    distinct e.distinct_id
  from sign_ups s  -- ensures we only look at the Signups defined above
  inner join events e on e.distinct_id = s.distinct_id
  where e.name = 'New Canvas'
),

