In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sqlalchemy import create_engine, text

In [3]:
database_name = 'scooters'    

connection_string = f"postgresql://postgres:postgres@localhost:5432/{database_name}"

In [4]:
engine = create_engine(connection_string)

SQL Query for trips with data cleaned

In [5]:
trips = '''
SELECT sumdid, companyname, startdate, starttime, enddate, endtime
FROM trips
WHERE tripduration >= 1.0
    AND tripduration <= (1440)
    OR (tripdistance >= 0 AND tripduration >= 5.0 AND tripduration <=1440);
'''

with engine.connect() as connection: result = connection.execute(text(trips))

In [6]:
result.fetchone()

('PoweredJREQVMOESVGFU', 'Lime', datetime.date(2019, 5, 1), datetime.time(22, 42, 47), datetime.date(2019, 5, 1), datetime.time(22, 44, 23))

In [7]:
with engine.connect() as connection:    
    scooters = pd.read_sql(text(trips), con = connection)

In [8]:
trips = scooters

In [9]:
trips

Unnamed: 0,sumdid,companyname,startdate,starttime,enddate,endtime
0,PoweredJREQVMOESVGFU,Lime,2019-05-01,22:42:47,2019-05-01,22:44:23
1,PoweredQOW23KD6JPIJZ,Lime,2019-05-01,22:51:15,2019-05-01,22:53:33
2,PoweredFG5SZ3AVI3KHG,Lime,2019-05-01,22:41:57,2019-05-01,22:47:55
3,PoweredQLO2NN7JJYYFK,Lime,2019-05-01,22:44:17,2019-05-01,22:56:48
4,PoweredQPTPXL34FTX5V,Lime,2019-05-01,22:44:32,2019-05-01,22:47:11
...,...,...,...,...,...,...
549425,PoweredIFL2C43JQRDWM,Lime,2019-05-01,22:39:09,2019-05-01,22:44:31
549426,Powered6WBVA6O5VSQ6Q,Lime,2019-05-01,22:42:03,2019-05-01,23:00:45
549427,PoweredUWPZODDQ4NYUB,Lime,2019-05-01,22:58:42,2019-05-01,23:04:21
549428,Powered25UE3EUVBN6RU,Lime,2019-05-01,22:53:06,2019-05-01,22:58:53


In [10]:
trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549430 entries, 0 to 549429
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   sumdid       549430 non-null  object
 1   companyname  549430 non-null  object
 2   startdate    549430 non-null  object
 3   starttime    549430 non-null  object
 4   enddate      549430 non-null  object
 5   endtime      549430 non-null  object
dtypes: object(6)
memory usage: 25.2+ MB


The goal of Metro Nashville is to have each scooter used a minimum of 3 times per day. Based on the data, what is the average number of trips per scooter per day? Make sure to consider the days that a scooter was available. How does this vary by company?

In [11]:
trips['startdate'] = pd.to_datetime(trips['startdate'])

In [12]:
trips['startdate'] = pd.to_datetime(trips['startdate'])
trips['enddate'] = pd.to_datetime(trips['enddate'])
trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549430 entries, 0 to 549429
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   sumdid       549430 non-null  object        
 1   companyname  549430 non-null  object        
 2   startdate    549430 non-null  datetime64[ns]
 3   starttime    549430 non-null  object        
 4   enddate      549430 non-null  datetime64[ns]
 5   endtime      549430 non-null  object        
dtypes: datetime64[ns](2), object(4)
memory usage: 25.2+ MB


In [23]:
trips['startdate'] = pd.to_datetime(trips['startdate'], format = '%m/%d/%Y %I:%M:%S %p')
trips.head()

Unnamed: 0,sumdid,companyname,startdate,starttime,enddate,endtime,start_datetime,end_datetime,month,day,hour
0,PoweredJREQVMOESVGFU,Lime,2019-05-01,22:42:47,2019-05-01,22:44:23,2019-05-01 22:42:00,2019-05-01 22:44:00,5,1,22
1,PoweredQOW23KD6JPIJZ,Lime,2019-05-01,22:51:15,2019-05-01,22:53:33,2019-05-01 22:51:00,2019-05-01 22:53:00,5,1,22
2,PoweredFG5SZ3AVI3KHG,Lime,2019-05-01,22:41:57,2019-05-01,22:47:55,2019-05-01 22:41:00,2019-05-01 22:47:00,5,1,22
3,PoweredQLO2NN7JJYYFK,Lime,2019-05-01,22:44:17,2019-05-01,22:56:48,2019-05-01 22:44:00,2019-05-01 22:56:00,5,1,22
4,PoweredQPTPXL34FTX5V,Lime,2019-05-01,22:44:32,2019-05-01,22:47:11,2019-05-01 22:44:00,2019-05-01 22:47:00,5,1,22


In [27]:
trips.assign(month = trips['startdate'].dt.month_name()).month.value_counts(sort = False)

May     220004
June    199470
July    129956
Name: month, dtype: int64

In [34]:
(trips
 .assign(date = trips['startdate'].dt.date, 
         hour = trips['startdate'].dt.hour)     # Create a date and hour column so that we can group
 .groupby(['date'])
 ['sumdid']
 .count()
 .reset_index()
)

Unnamed: 0,date,sumdid
0,2019-05-01,4501
1,2019-05-02,4234
2,2019-05-03,7542
3,2019-05-04,10018
4,2019-05-05,8016
...,...,...
87,2019-07-27,7134
88,2019-07-28,5210
89,2019-07-29,3554
90,2019-07-30,3118


Number of scooters used on trips per day

In [17]:
trip_count = trips[['companyname', 'startdate', 'sumdid']]\
.groupby(['companyname', 'startdate']).count()

trip_count

Unnamed: 0_level_0,Unnamed: 1_level_0,sumdid
companyname,startdate,Unnamed: 2_level_1
Bird,2019-05-01,1088
Bird,2019-05-02,1222
Bird,2019-05-03,1962
Bird,2019-05-04,2668
Bird,2019-05-05,1952
...,...,...
SPIN,2019-07-27,608
SPIN,2019-07-28,422
SPIN,2019-07-29,292
SPIN,2019-07-30,227


Number of scooters used per company

In [49]:
trip_count = trips[['companyname', 'startdate', 'sumdid']]\
.groupby(['companyname', 'startdate'])

trip_count

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000243E21CD9A0>

In [18]:
trip_count.reset_index(inplace = True)

trip_count

Unnamed: 0,companyname,startdate,sumdid
0,Bird,2019-05-01,1088
1,Bird,2019-05-02,1222
2,Bird,2019-05-03,1962
3,Bird,2019-05-04,2668
4,Bird,2019-05-05,1952
...,...,...,...
526,SPIN,2019-07-27,608
527,SPIN,2019-07-28,422
528,SPIN,2019-07-29,292
529,SPIN,2019-07-30,227


fixed index

In [50]:
trips_average = '''
SELECT TO_CHAR(startdate, 'YYYY') AS YEAR,
    TO_CHAR(startdate, 'MM') AS MONTH,
	TO_CHAR(startdate, 'DD') AS DAY,
     COUNT( DISTINCT sumdid) /  COUNT(DISTINCT startdate) avg_trips_per_day
    FROM trips
	WHERE (tripdistance > 0 
	AND tripduration >= 5.0)
	AND tripduration >= 1.0
	AND tripduration <= 1440
	GROUP BY startdate;
'''

with engine.connect() as connection: result = connection.execute(text(trips_average))

In [54]:
with engine.connect() as connection:    
    trips_average = pd.read_sql(text(trips_average), con = connection)

In [55]:
trips_average

Unnamed: 0,year,month,day,avg_trips_per_day
0,2019,05,01,1573
1,2019,05,02,1616
2,2019,05,03,2253
3,2019,05,04,2373
4,2019,05,05,2141
...,...,...,...,...
87,2019,07,27,1921
88,2019,07,28,1625
89,2019,07,29,1238
90,2019,07,30,1082


Average number of trips per day but need to get per scooter per company

In [57]:
trips_average = '''
SELECT TO_CHAR(startdate, 'YYYY') AS YEAR,
    TO_CHAR(startdate, 'MM') AS MONTH,
	TO_CHAR(startdate, 'DD') AS DAY,
COUNT (DISTINCT sumdid) AS scooters, COUNT(starttime) AS number_start_times, COUNT(starttime)/COUNT(DISTINCT sumdid) AS avg_scooters_per_day_used
FROM trips
WHERE (tripdistance > 0 
	AND tripduration >= 5.0)
	AND tripduration >= 1.0
	AND tripduration <= 1440
GROUP BY startdate;
'''

with engine.connect() as connection: result = connection.execute(text(trips_average))

In [58]:
with engine.connect() as connection:    
    trips_average = pd.read_sql(text(trips_average), con = connection)

In [59]:
trips_average

Unnamed: 0,year,month,day,scooters,number_start_times,avg_scooters_per_day_used
0,2019,05,01,1573,3013,1
1,2019,05,02,1616,2872,1
2,2019,05,03,2253,5698,2
3,2019,05,04,2373,7590,3
4,2019,05,05,2141,6129,2
...,...,...,...,...,...,...
87,2019,07,27,1921,5193,2
88,2019,07,28,1625,3757,2
89,2019,07,29,1238,2401,1
90,2019,07,30,1082,2012,1


In [61]:
trips_average = '''
SELECT companyname, TO_CHAR(startdate, 'YYYY') AS YEAR,
    TO_CHAR(startdate, 'MM') AS MONTH,
	TO_CHAR(startdate, 'DD') AS DAY,
COUNT (DISTINCT sumdid) AS scooters, COUNT(starttime) AS number_start_times, COUNT(starttime)/COUNT(DISTINCT sumdid) AS avg_scooters_per_day_used
FROM trips
WHERE (tripdistance > 0 
	AND tripduration >= 5.0)
	AND tripduration >= 1.0
	AND tripduration <= 1440
GROUP BY startdate, companyname
ORDER BY companyname;
'''

with engine.connect() as connection: result = connection.execute(text(trips_average))

In [62]:
with engine.connect() as connection:    
    trips_average = pd.read_sql(text(trips_average), con = connection)

In [63]:
trips_average

Unnamed: 0,companyname,year,month,day,scooters,number_start_times,avg_scooters_per_day_used
0,Bird,2019,05,01,557,759,1
1,Bird,2019,05,02,609,848,1
2,Bird,2019,05,03,813,1417,1
3,Bird,2019,05,04,914,1799,1
4,Bird,2019,05,05,758,1452,1
...,...,...,...,...,...,...,...
525,SPIN,2019,07,27,231,555,2
526,SPIN,2019,07,28,178,364,2
527,SPIN,2019,07,29,125,239,1
528,SPIN,2019,07,30,111,186,1


average number of scooters used per day by company. I divided the total number of start times per day divided by number of scooters used that day. 

The question asks to make sure to use scooters was available but I figured by using data from trips table, the scooters were avilable to use. 