# Week 1. Docker / SQL. Solution

```bash
pip install -r requirements.txt
```

In [1]:
import pandas as pd
from sqlalchemy import create_engine

In [2]:
# SQLAlchemy URL of the Postgres database that the queries in this notebook use.
# NOTE(review): the credentials are hard-coded, and the connection opened on the
# last line is never explicitly closed; acceptable for a local sandbox database,
# worth revisiting anywhere else.
DB_URL = 'postgresql://root:root@localhost:5432/ny_taxi'
engine = create_engine(DB_URL)
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7fa120560640>

### Build docker image
```bash
docker build -t data_ingestion .
```

### Ingest data into DB
```bash
docker run --network week_1_docker_sql_taxi data_ingestion \
    --user=root \
    --password=root \
    --host=pgdatabase \
    --port=5432 \
    --db=ny_taxi \
    --trips_url="https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-01.csv.gz" \
    --zones_url="https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv"
```

### How many taxi trips were made in total on January 15?

In [5]:
# Count the trips whose pickup datetime falls on 2019-01-15. The half-open
# range [2019-01-15, 2019-01-16) covers the whole day without applying any
# function to the column. Only the pickup time is filtered, so trips that were
# dropped off after midnight are still counted.
trips_on_jan_15_sql = """
SELECT count(*)
FROM yellow_taxi_data ytd
WHERE lpep_pickup_datetime >= '2019-01-15'
  AND lpep_pickup_datetime < '2019-01-16';
"""

pd.read_sql(sql=trips_on_jan_15_sql, con=engine)

Unnamed: 0,count
0,20689


### Which was the day with the largest trip distance?

In [6]:
# Fetch the single trip with the greatest trip_distance; the day in question
# can be read off that row's pickup datetime.
# PostgreSQL places NULLs first under ORDER BY ... DESC, so without NULLS LAST
# a row with a missing trip_distance would be returned as the "largest" trip.
query = """
SELECT *
FROM yellow_taxi_data ytd
ORDER BY trip_distance DESC NULLS LAST
LIMIT 1;
"""

pd.read_sql(query, con=engine)

Unnamed: 0,index,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,...,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,297377,2,2019-01-15 19:27:58,2019-01-15 22:59:01,N,1,221,265,1,117.99,...,1.0,0.5,0.0,10.5,,0.3,339.2,2,1,


### On 2019-01-01, how many trips had 2 passengers and how many had 3 passengers?

In [8]:
# Trip counts per passenger_count for pickups on 2019-01-01 (half-open range
# covering the whole day); the rows for 2 and 3 passengers answer the question.
# GROUP BY alone guarantees no particular row order, so sort explicitly to keep
# the displayed table stable between runs.
query = """
SELECT count(*),
       passenger_count
FROM yellow_taxi_data ytd
WHERE lpep_pickup_datetime >= '2019-01-01'
  AND lpep_pickup_datetime < '2019-01-02'
GROUP BY passenger_count
ORDER BY passenger_count;
"""

pd.read_sql(query, con=engine)

Unnamed: 0,count,passenger_count
0,21,0
1,12415,1
2,1282,2
3,254,3
4,129,4
5,616,5
6,273,6


### For the passengers picked up in the Astoria zone, which drop-off zone had the largest tip?

In [9]:
# Inner query: among trips picked up in the 'Astoria' zone, keep the one with
# the highest tip. Outer query: translate that trip's drop-off location id
# into a zone name.
# - NULLS LAST: PostgreSQL places NULLs first under ORDER BY ... DESC, so a
#   trip with a missing tip_amount would otherwise win the LIMIT 1.
# - The subquery projects only the two columns the outer query needs instead
#   of SELECT * over the join.
query = """
SELECT z2."Zone",
       max_tip.tip_amount
FROM
  (SELECT ytd."DOLocationID",
          ytd.tip_amount
   FROM yellow_taxi_data ytd
   JOIN zones z ON z."LocationID" = ytd."PULocationID"
   WHERE z."Zone" = 'Astoria'
   ORDER BY ytd.tip_amount DESC NULLS LAST
   LIMIT 1) AS max_tip
JOIN zones z2 ON max_tip."DOLocationID" = z2."LocationID";
"""

pd.read_sql(query, con=engine)

Unnamed: 0,Zone,tip_amount
0,Long Island City/Queens Plaza,88.0
