In [1]:
import requests
import pandas as pd
import sys
from pandas.io.json import json_normalize
from datetime import datetime
import gc
from google.oauth2 import credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from google.cloud import bigquery

In [2]:
# Set the scopes for the desired permissions
SCOPES = ['https://www.googleapis.com/auth/bigquery']

# Create the flow for authorization
flow = InstalledAppFlow.from_client_secrets_file(
    '...', scopes=SCOPES
)
credentials = flow.run_local_server(port=0)

project_id = '...'

# Use the obtained credentials for authentication
client = bigquery.Client(credentials=credentials, project=project_id) #

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=853561264369-cg46opisk56diiv2ee4nblkb6jmtland.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A51024%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=ddeRKyOcgQ1yPufk3n8ua8CFuR70DK&access_type=offline


In [3]:
# Construct a reference to the "crypto_bitcoin" dataset
dataset_ref = client.dataset("chicago_taxi_trips", project="bigquery-public-data")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)


In [4]:
# List all the tables in the "hacker_news" dataset
tables = list(client.list_tables(dataset))

# Print names of all tables in the dataset (there are four!)
for table in tables:  
    print(table.table_id)

taxi_trips


In [5]:
# Construct a reference to the "transactions" table
table_ref = dataset_ref.table("taxi_trips")

# API request - fetch the table
table = client.get_table(table_ref)

# Preview the first five lines of the "transactions" table
client.list_rows(table, max_results=5).to_dataframe()

Unnamed: 0,unique_key,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,...,extras,trip_total,payment_type,company,pickup_latitude,pickup_longitude,pickup_location,dropoff_latitude,dropoff_longitude,dropoff_location
0,90b5366f38e43094c5d0cebd1e492edf83bac474,d80bc0dd36a85306ac35402f5f62ff5f6775e93b06bda0...,2017-10-05 10:30:00+00:00,2017-10-05 10:45:00+00:00,1080,1.2,,,,,...,0.0,9.5,Cash,Dispatch Taxi Affiliation,,,,,,
1,d0dcdc1363e23f50594bb1202e1aa719af4fdf2b,4e04293a1ab799479725ae45fabd3c04bb33c9e46ad0de...,2017-09-26 22:30:00+00:00,2017-09-26 22:30:00+00:00,360,1.4,,,,,...,1.0,8.0,Cash,Dispatch Taxi Affiliation,,,,,,
2,33b6c9683fbdbf29970b8145b6d82ae0f7cf168c,3fb54b9070b703bd28358e704311d7fe8afb6950881654...,2017-09-27 12:15:00+00:00,2017-09-27 13:00:00+00:00,2340,18.0,,,,,...,5.0,63.6,Credit Card,KOAM Taxi Association,,,,,,
3,55d64e49c22169aa5b47ad5310744ae522addfb2,4097b390ef36dd0909677975a8e6237b7a6947dfc23e24...,2016-04-05 21:30:00+00:00,2016-04-05 21:30:00+00:00,222,1.4,,,,,...,0.0,5.0,Cash,Metro Group,,,,,,
4,8c7542e9342b23da2dc382a7a11b305292e7824f,7fb7fcd6d964e468d888284215533053c6b7aa59b72cda...,2017-09-27 18:30:00+00:00,2017-09-27 18:30:00+00:00,0,0.0,,,,,...,0.0,3.25,Cash,Choice Taxi Association,,,,,,


### Determining how old is the data set and checking if it is applicable to current traffic situation

In [6]:
query = """
        SELECT EXTRACT(YEAR FROM trip_start_timestamp) AS year, 
                COUNT(1) AS num_trips
        FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
        GROUP BY year
        ORDER BY year
        """

In [8]:
query_job = client.query(query)
rides_per_year = query_job.to_dataframe()

In [9]:
rides_per_year

Unnamed: 0,year,num_trips
0,2013,27217300
1,2014,37395079
2,2015,32385527
3,2016,31756403
4,2017,24979611
5,2018,20731105
6,2019,16476440
7,2020,3888831
8,2021,3947677
9,2022,6382071


In [10]:
# Your code goes here
query2 = """
        SELECT EXTRACT(MONTH FROM trip_start_timestamp) AS month, 
                COUNT(1) AS num_trips
        FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
        where EXTRACT(YEAR FROM trip_start_timestamp) = 2017
        GROUP BY month
        ORDER BY month
        """

In [12]:
query_job2 = client.query(query2)
rides_per_month = query_job2.to_dataframe()

In [13]:
rides_per_month

Unnamed: 0,month,num_trips
0,1,1971708
1,2,1909479
2,3,2361634
3,4,2194344
4,5,2322631
5,6,2323672
6,7,2053471
7,8,2078623
8,9,1949427
9,10,2140326


In [14]:
query3 = """
        WITH RelevantRides AS
        (
            SELECT EXTRACT(HOUR FROM trip_start_timestamp) AS hour_of_day, trip_miles, trip_seconds
            FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
            WHERE trip_start_timestamp > '2017-01-01' AND 
                    trip_start_timestamp < '2017-07-01' AND 
                    trip_seconds > 0 AND 
                    trip_miles > 0
        )
        SELECT hour_of_day, 
                COUNT(1) AS num_trips, 
                3600 * SUM(trip_miles) / SUM(trip_seconds) AS avg_mph
        FROM RelevantRides
        GROUP BY hour_of_day
        ORDER BY hour_of_day
                        """

In [15]:
query_job3 = client.query(query3)
speeds_query = query_job3.to_dataframe()

In [16]:
speeds_query

Unnamed: 0,hour_of_day,num_trips,avg_mph
0,0,319326,20.231735
1,1,266525,18.937962
2,2,210147,18.77707
3,3,159666,20.158374
4,4,122183,26.736014
5,5,119311,30.769411
6,6,182736,24.58856
7,7,358403,17.736209
8,8,541745,15.081015
9,9,565455,16.547683
