# Data Manipulation in Python
## INFORMS 2020 Python Workshop
### Amy Newman

## Import Packages

In [235]:
import pandas as pd
import numpy as np
import json

## Loading / Saving JSON 
- JSON (Javascript Object Notation)
    - it's a standardized format to pass data around; it's easy for machines and humans to use
    - it's a form of text data written in Javascript notation and can be sent to and from a server
    - Twitter and Facebook are examples where you would retrieve data in JSON format

In [118]:
#--JSON: kind of looks like a dictionary, huh?

json = { "firstName": "Alan", "lastName": "Doe", "hobbies": ["running", "climbing", "dancing"], 
        "age": 35, "children": [ { "firstName": "Sam", "age": 6 }, { "firstName": "Jamie", "age": 8 } ] }



#--Let's make it a bit easier to see the structure--

json = {
    "firstName": "Alan",
    "lastName": "Doe",
    "hobbies": ["running", "climbing", "dancing"],
    "age": 35,
    "children": [
        {
            "firstName": "Sam",
            "age": 6
        },
        {
            "firstName": "Jamie",
            "age": 8
        }
    ]
}

In [119]:
json['firstName']

'Alan'

In [123]:
json['children'][0]['firstName']

'Sam'

In [65]:
#--Loading JSON Tweet data--
with open('data/tweets_from_nyc_taxis.json', 'r') as f:
    tweets = f.readlines()

In [60]:
len(data)

13269

In [127]:
tw_lst = []
for elem in tweets:
    tw_lst.append(elem)

In [131]:
tw_lst[0]

'{"created_at": "Tue May 01 14:18:26 +0000 2018", "id": 991320947700523009, "id_str": "991320947700523009", "text": "@NeilECollins @nyc311 Thank you for reading, Mr. Collins.  Having a TLC driver license is a privilege.  Taxi passen\\u2026 https://t.co/78C3glo1yl", "truncated": true, "entities": {"hashtags": [], "symbols": [], "user_mentions": [{"screen_name": "NeilECollins", "name": "Neil Collins", "id": 2813412205, "id_str": "2813412205", "indices": [0, 13]}, {"screen_name": "nyc311", "name": "New York City 311", "id": 37683414, "id_str": "37683414", "indices": [14, 21]}], "urls": [{"url": "https://t.co/78C3glo1yl", "expanded_url": "https://twitter.com/i/web/status/991320947700523009", "display_url": "twitter.com/i/web/status/9\\u2026", "indices": [117, 140]}]}, "source": "<a href=\\"http://twitter.com\\" rel=\\"nofollow\\">Twitter Web Client</a>", "in_reply_to_status_id": 991311021905207298, "in_reply_to_status_id_str": "991311021905207298", "in_reply_to_user_id": 2813412205, "in_re

In [125]:
tweets[0]

'{"created_at": "Tue May 01 14:18:26 +0000 2018", "id": 991320947700523009, "id_str": "991320947700523009", "text": "@NeilECollins @nyc311 Thank you for reading, Mr. Collins.  Having a TLC driver license is a privilege.  Taxi passen\\u2026 https://t.co/78C3glo1yl", "truncated": true, "entities": {"hashtags": [], "symbols": [], "user_mentions": [{"screen_name": "NeilECollins", "name": "Neil Collins", "id": 2813412205, "id_str": "2813412205", "indices": [0, 13]}, {"screen_name": "nyc311", "name": "New York City 311", "id": 37683414, "id_str": "37683414", "indices": [14, 21]}], "urls": [{"url": "https://t.co/78C3glo1yl", "expanded_url": "https://twitter.com/i/web/status/991320947700523009", "display_url": "twitter.com/i/web/status/9\\u2026", "indices": [117, 140]}]}, "source": "<a href=\\"http://twitter.com\\" rel=\\"nofollow\\">Twitter Web Client</a>", "in_reply_to_status_id": 991311021905207298, "in_reply_to_status_id_str": "991311021905207298", "in_reply_to_user_id": 2813412205, "in_re

In [71]:
#--Pretty print the first tweet to get a sense of the data--
parsed = json.loads(tweets[0])
print(json.dumps(parsed, indent=4, sort_keys=True))

{
    "contributors": null,
    "coordinates": null,
    "created_at": "Tue May 01 14:18:26 +0000 2018",
    "entities": {
        "hashtags": [],
        "symbols": [],
        "urls": [
            {
                "display_url": "twitter.com/i/web/status/9\u2026",
                "expanded_url": "https://twitter.com/i/web/status/991320947700523009",
                "indices": [
                    117,
                    140
                ],
                "url": "https://t.co/78C3glo1yl"
            }
        ],
        "user_mentions": [
            {
                "id": 2813412205,
                "id_str": "2813412205",
                "indices": [
                    0,
                    13
                ],
                "name": "Neil Collins",
                "screen_name": "NeilECollins"
            },
            {
                "id": 37683414,
                "id_str": "37683414",
                "indices": [
                    14,
                    21
   

In [74]:
your_json = '["foo", {"bar":["baz", null, 1.0, 2]}]'
parsed = json.loads(your_json)
print(json.dumps(parsed, indent=4, sort_keys=True))

[
    "foo",
    {
        "bar": [
            "baz",
            null,
            1.0,
            2
        ]
    }
]


In [98]:
#--Loading and saving the JSON data--
with open('data/tweets_from_nyc_taxis.json', 'r', encoding='utf-8') as f:
    data = f.read()

In [99]:
type(data)

str

In [100]:
#--Loading and saving the JSON data--
with open('data/tweets_from_nyc_taxis.json', 'r') as f:
    data = f.readlines()

In [102]:
type(data)

list

In [117]:
#--Loading JSON Twitter data--
taxi_tweets = 'data/tweets_from_nyc_taxis.json'

with open(taxi_tweets) as f:
    data = json.loads(f.read())

JSONDecodeError: Extra data: line 2 column 1 (char 3082)

In [109]:
#--Saving JSON data to a file--
file_name = "data/data_file.json" 

with open(file_name, "w") as write_file:
    json.dump(data, write_file)

## Loading / Saving pandas

In [303]:
#--Loading a .csv file to pandas--
zones = pd.read_csv('data/taxi_zones_simple.csv')

In [304]:
#--taking a look at the first 5 rows to see if it loaded ok--
zones.head()

Unnamed: 0,locationid,zone,borough
0,1,Newark Airport,EWR
1,2,Jamaica Bay,Queens
2,3,Allerton/Pelham Gardens,Bronx
3,4,Alphabet City,Manhattan
4,5,Arden Heights,Staten Island


In [222]:
trips = pd.read_csv('data/clean_taxi_sample.tsv.gz', sep='\t')

In [223]:
trips.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2018-01-01 00:00:00,2018-01-01 00:00:00,1,9.14,1,N,229,17,2,27.0,0.0,0.5,0.0,0.0,0.3,27.8
1,2,2018-01-01 00:00:02,2018-01-01 00:08:48,1,1.36,1,N,68,249,2,7.5,0.5,0.5,0.0,0.0,0.3,8.8
2,1,2018-01-01 00:00:03,2018-01-01 00:21:06,1,6.1,1,N,255,236,2,20.5,0.5,0.5,0.0,0.0,0.3,21.8
3,2,2018-01-01 00:00:03,2018-01-01 00:03:52,3,0.99,1,N,236,43,2,5.5,0.5,0.5,0.0,0.0,0.3,6.8
4,1,2018-01-01 00:00:04,2018-01-01 00:13:24,1,3.6,1,N,37,157,2,13.5,0.5,0.5,0.0,0.0,0.3,14.8


In [170]:
counts = pd.read_csv('data/daily_taxi_counts_2018.tsv', sep='\t')

In [172]:
counts.head()

Unnamed: 0,date,n_trips
0,2018-03-01,332802
1,2018-03-02,326944
2,2018-03-03,316823
3,2018-03-04,286326
4,2018-03-05,290631


## Viewing / Selecting pandas
- .head( )
    - allows you to view the first 5 rows in your data, including column names
    - it's helpful to do this anytime you create a dataframe
- .shape
    - allows you to see the number of rows and columns within your dataframe
    - it's helpful to do this anytime you create a dataframe
- .loc[ ]
    - gets rows (or columns) with specific labels from the index
- .iloc[ ]
    - gets rows (or columns) at specific positions in the index (so it only takes integers)
- df [ ['column_name'] ]
    - returns the named column of data

In [157]:
#--Is this the shape you expected for your dataframe?--
trips.shape

(20000, 17)

In [136]:
#--Viewing your data--
trips.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2018-01-01 00:00:00,2018-01-01 00:00:00,1,9.14,1,N,229,17,2,27.0,0.0,0.5,0.0,0.0,0.3,27.8
1,2,2018-01-01 00:00:02,2018-01-01 00:08:48,1,1.36,1,N,68,249,2,7.5,0.5,0.5,0.0,0.0,0.3,8.8
2,1,2018-01-01 00:00:03,2018-01-01 00:21:06,1,6.1,1,N,255,236,2,20.5,0.5,0.5,0.0,0.0,0.3,21.8
3,2,2018-01-01 00:00:03,2018-01-01 00:03:52,3,0.99,1,N,236,43,2,5.5,0.5,0.5,0.0,0.0,0.3,6.8
4,1,2018-01-01 00:00:04,2018-01-01 00:13:24,1,3.6,1,N,37,157,2,13.5,0.5,0.5,0.0,0.0,0.3,14.8


In [137]:
#--Seeing more than 5 rows of your data--
trips.head(7)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2018-01-01 00:00:00,2018-01-01 00:00:00,1,9.14,1,N,229,17,2,27.0,0.0,0.5,0.0,0.0,0.3,27.8
1,2,2018-01-01 00:00:02,2018-01-01 00:08:48,1,1.36,1,N,68,249,2,7.5,0.5,0.5,0.0,0.0,0.3,8.8
2,1,2018-01-01 00:00:03,2018-01-01 00:21:06,1,6.1,1,N,255,236,2,20.5,0.5,0.5,0.0,0.0,0.3,21.8
3,2,2018-01-01 00:00:03,2018-01-01 00:03:52,3,0.99,1,N,236,43,2,5.5,0.5,0.5,0.0,0.0,0.3,6.8
4,1,2018-01-01 00:00:04,2018-01-01 00:13:24,1,3.6,1,N,37,157,2,13.5,0.5,0.5,0.0,0.0,0.3,14.8
5,2,2018-01-01 00:00:04,2018-01-01 00:08:13,1,1.59,1,N,141,161,2,8.0,0.5,0.5,0.0,0.0,0.3,9.3
6,1,2018-01-01 00:00:06,2018-01-01 00:24:34,1,6.9,1,N,162,49,1,23.5,0.5,0.5,4.95,0.0,0.3,29.75


In [139]:
#--Choosing rows based on a condition in a column--
id2 = trips.loc[trips['VendorID'] == 2]
id2.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2018-01-01 00:00:00,2018-01-01 00:00:00,1,9.14,1,N,229,17,2,27.0,0.0,0.5,0.0,0.0,0.3,27.8
1,2,2018-01-01 00:00:02,2018-01-01 00:08:48,1,1.36,1,N,68,249,2,7.5,0.5,0.5,0.0,0.0,0.3,8.8
3,2,2018-01-01 00:00:03,2018-01-01 00:03:52,3,0.99,1,N,236,43,2,5.5,0.5,0.5,0.0,0.0,0.3,6.8
5,2,2018-01-01 00:00:04,2018-01-01 00:08:13,1,1.59,1,N,141,161,2,8.0,0.5,0.5,0.0,0.0,0.3,9.3
12,2,2018-01-01 00:00:15,2018-01-01 00:14:17,1,2.06,1,N,90,137,1,10.5,0.5,0.5,2.36,0.0,0.3,14.16


In [142]:
#--Choosing rows by Combining multiple conditions--
trips.loc[(trips['trip_distance'] >= 5) & (trips['trip_distance'] <= 10)] #--want all trips beteween 5 and 10 miles

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2018-01-01 00:00:00,2018-01-01 00:00:00,1,9.14,1,N,229,17,2,27.0,0.0,0.5,0.00,0.0,0.3,27.80
2,1,2018-01-01 00:00:03,2018-01-01 00:21:06,1,6.10,1,N,255,236,2,20.5,0.5,0.5,0.00,0.0,0.3,21.80
6,1,2018-01-01 00:00:06,2018-01-01 00:24:34,1,6.90,1,N,162,49,1,23.5,0.5,0.5,4.95,0.0,0.3,29.75
22,1,2018-01-01 00:00:25,2018-01-01 00:20:37,1,7.40,1,N,237,116,2,24.0,0.5,0.5,0.00,0.0,0.3,25.30
40,2,2018-01-01 00:00:39,2018-01-01 00:28:19,1,5.33,1,N,238,142,2,22.0,0.5,0.5,0.00,0.0,0.3,23.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19977,1,2018-01-01 01:10:49,2018-01-01 01:41:24,2,5.30,1,N,148,143,1,23.0,0.5,0.5,4.85,0.0,0.3,29.15
19987,2,2018-01-01 01:10:51,2018-01-01 01:24:32,1,5.40,1,N,231,233,1,17.0,0.5,0.5,0.00,0.0,0.3,18.30
19991,1,2018-01-01 01:10:52,2018-01-01 01:40:14,1,9.40,1,N,229,181,1,30.0,0.5,0.5,9.35,0.0,0.3,40.65
19998,1,2018-01-01 01:10:54,2018-01-01 01:35:00,1,8.00,1,N,141,255,1,26.5,0.5,0.5,5.55,0.0,0.3,33.35


In [144]:
#--Choosing rows by excluding a condition--
trips.loc[trips['passenger_count'] != 1]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
3,2,2018-01-01 00:00:03,2018-01-01 00:03:52,3,0.99,1,N,236,43,2,5.5,0.5,0.5,0.0,0.0,0.3,6.8
11,1,2018-01-01 00:00:15,2018-01-01 00:08:21,2,1.20,1,N,232,107,2,7.0,0.5,0.5,0.0,0.0,0.3,8.3
15,1,2018-01-01 00:00:18,2018-01-01 00:29:50,4,13.20,1,N,138,72,2,39.0,0.5,0.5,0.0,0.0,0.3,40.3
17,1,2018-01-01 00:00:20,2018-01-01 00:06:23,2,1.10,1,N,164,162,3,6.5,0.5,0.5,0.0,0.0,0.3,7.8
28,1,2018-01-01 00:00:27,2018-01-01 00:04:11,4,0.30,1,N,233,161,2,4.5,0.5,0.5,0.0,0.0,0.3,5.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19988,2,2018-01-01 01:10:51,2018-01-01 01:19:56,2,1.02,1,N,237,163,1,7.5,0.5,0.5,3.3,0.0,0.3,12.1
19992,1,2018-01-01 01:10:53,2018-01-01 01:29:19,3,3.70,1,N,50,231,2,15.0,0.5,0.5,0.0,0.0,0.3,16.3
19994,2,2018-01-01 01:10:53,2018-01-01 01:18:56,2,1.20,1,N,166,238,1,7.5,0.5,0.5,2.2,0.0,0.3,11.0
19996,1,2018-01-01 01:10:53,2018-01-01 01:34:04,4,1.90,1,N,246,249,1,15.0,0.5,0.5,0.0,0.0,0.3,16.3


In [166]:
#--Choosing data by column--
trips.iloc[:,:4] #--This chooses all of the rows in the dataframe from column 0 to 3

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count
0,2,2018-01-01 00:00:00,2018-01-01 00:00:00,1
1,2,2018-01-01 00:00:02,2018-01-01 00:08:48,1
2,1,2018-01-01 00:00:03,2018-01-01 00:21:06,1
3,2,2018-01-01 00:00:03,2018-01-01 00:03:52,3
4,1,2018-01-01 00:00:04,2018-01-01 00:13:24,1
...,...,...,...,...
19995,2,2018-01-01 01:10:53,2018-01-01 01:32:37,1
19996,1,2018-01-01 01:10:53,2018-01-01 01:34:04,4
19997,2,2018-01-01 01:10:53,2018-01-01 01:24:39,1
19998,1,2018-01-01 01:10:54,2018-01-01 01:35:00,1


In [None]:
#--Selecting certain columns from the entire dataframe by name, making a new dataframe--
trip_sub = trips[['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'total_amount']]

## Checking / Changing Column Types
- Especially pd.to_datetime() for the date data
- I (Jeff) used this in a talk a while ago and will use it on Saturday morning again to talk about optimization. I don’t think we really need to go into all these options in this lesson. It might be good to flag that there are many ways to do the same thing, such as making a column an integer, though, and that in most cases they’re all equally good choices. (look at Lesson_content_outline in Google Docs for image)


In [None]:
#--Converting column to int or float--

In [215]:
trips.dtypes

VendorID                   int64
tpep_pickup_datetime      object
tpep_dropoff_datetime     object
passenger_count            int64
trip_distance            float64
RatecodeID                 int64
store_and_fwd_flag        object
PULocationID               int64
DOLocationID               int64
payment_type               int64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
fair_tip_amt             float64
tip_percent              float64
dtype: object

In [217]:
trips.store_and_fwd_flag.unique()

array(['N', 'Y'], dtype=object)

In [255]:
zones.dtypes

locationid     int64
zone          object
borough       object
dtype: object

In [256]:
zones.head()

Unnamed: 0,locationid,zone,borough
0,1,Newark Airport,EWR
1,2,Jamaica Bay,Queens
2,3,Allerton/Pelham Gardens,Bronx
3,4,Alphabet City,Manhattan
4,5,Arden Heights,Staten Island


In [177]:
counts = pd.read_csv('data/daily_taxi_counts_2018.tsv', sep='\t')
counts.head()

Unnamed: 0,date,n_trips
0,2018-03-01,332802
1,2018-03-02,326944
2,2018-03-03,316823
3,2018-03-04,286326
4,2018-03-05,290631


In [176]:
counts.dtypes

date       object
n_trips     int64
dtype: object

In [174]:
#--Convert date column to datetime to make data more accessible--
counts['date'] = pd.to_datetime(counts.date)
counts.dtypes

date       datetime64[ns]
n_trips             int64
dtype: object

In [179]:
#--extract day only--

AttributeError: 'Series' object has no attribute 'days'

## Summarizing

In [153]:
#-- df.describe() returns statistical details like percentile, mean, std etc. of a data frame or a series of numeric values--
trips.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,1.565,1.7354,2.749124,1.031,158.82775,159.07455,1.40945,12.58336,0.49379,0.496975,1.621925,0.105998,0.29946,15.61176
std,0.495769,1.277289,2.961542,0.326411,71.059017,74.392553,0.518859,9.87974,0.060857,0.043052,2.437912,0.895354,0.017226,11.354625
min,1.0,0.0,0.0,1.0,3.0,1.0,1.0,-52.0,-0.5,-0.5,0.0,0.0,-0.3,-52.8
25%,1.0,1.0,1.0,1.0,107.0,95.0,1.0,6.5,0.5,0.5,0.0,0.0,0.3,8.76
50%,2.0,1.0,1.8,1.0,158.0,161.0,1.0,10.0,0.5,0.5,1.06,0.0,0.3,12.36
75%,2.0,2.0,3.4,1.0,234.0,234.0,2.0,15.5,0.5,0.5,2.45,0.0,0.3,18.96
max,2.0,6.0,37.82,6.0,265.0,265.0,4.0,250.0,1.0,0.5,80.0,26.02,0.3,250.8


## Replacing

In [187]:
zones.head()

Unnamed: 0,locationid,zone,borough
0,1,Newark Airport,EWR
1,2,Jamaica Bay,Queens
2,3,Allerton/Pelham Gardens,Bronx
3,4,Alphabet City,Manhattan
4,5,Arden Heights,Staten Island


In [259]:
# this will replace instanceas of "Newark Airport" with "Newark Liberty International Airport"
zones = zones.replace(to_replace =["Newark Airport"],  
                            value = "Newark Liberty International Airport") 

In [261]:
zones.head()

Unnamed: 0,locationid,zone,borough
0,1,Newark Liberty International Airport,EWR
1,2,Jamaica Bay,Queens
2,3,Allerton/Pelham Gardens,Bronx
3,4,Alphabet City,Manhattan
4,5,Arden Heights,Staten Island


In [None]:
#--Replace nonetype with.....






In [None]:
# # Making data frame from the csv file 
# df = pd.read_csv("nba.csv") 
  
# # will replace  Nan value in dataframe with value 0
# df.replace(to_replace = np.nan, value =0) 

## Make New Columns (vector ops)

In [224]:
trips.shape

(20000, 17)

In [229]:
#--tip percentage per trip--
trips['tip_percent'] = trips['tip_amount']/trips['total_amount']

In [227]:
trips.shape

(20000, 18)

In [230]:
trips.head(10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,tip_percent
0,2,2018-01-01 00:00:00,2018-01-01 00:00:00,1,9.14,1,N,229,17,2,27.0,0.0,0.5,0.0,0.0,0.3,27.8,0.0
1,2,2018-01-01 00:00:02,2018-01-01 00:08:48,1,1.36,1,N,68,249,2,7.5,0.5,0.5,0.0,0.0,0.3,8.8,0.0
2,1,2018-01-01 00:00:03,2018-01-01 00:21:06,1,6.1,1,N,255,236,2,20.5,0.5,0.5,0.0,0.0,0.3,21.8,0.0
3,2,2018-01-01 00:00:03,2018-01-01 00:03:52,3,0.99,1,N,236,43,2,5.5,0.5,0.5,0.0,0.0,0.3,6.8,0.0
4,1,2018-01-01 00:00:04,2018-01-01 00:13:24,1,3.6,1,N,37,157,2,13.5,0.5,0.5,0.0,0.0,0.3,14.8,0.0
5,2,2018-01-01 00:00:04,2018-01-01 00:08:13,1,1.59,1,N,141,161,2,8.0,0.5,0.5,0.0,0.0,0.3,9.3,0.0
6,1,2018-01-01 00:00:06,2018-01-01 00:24:34,1,6.9,1,N,162,49,1,23.5,0.5,0.5,4.95,0.0,0.3,29.75,0.166387
7,1,2018-01-01 00:00:11,2018-01-01 00:06:05,1,1.7,1,N,238,143,1,7.0,0.5,0.5,1.65,0.0,0.3,9.95,0.165829
8,1,2018-01-01 00:00:13,2018-01-01 00:07:03,1,0.9,1,N,144,249,1,6.0,0.5,0.5,1.8,0.0,0.3,9.1,0.197802
9,1,2018-01-01 00:00:14,2018-01-01 00:11:38,1,4.0,1,N,170,232,2,14.0,0.5,0.5,0.0,0.0,0.3,15.3,0.0


In [None]:
#--TODO: Add another example, then add an exercise here--

## Applying Functions to Columns
- Applying a function directly to a column
    - Example: The tip_percent column we created above looks a bit messy, lets fix it with the round() function 
- Using .apply()
    - Allows you to apply a function along an axis in a dataframe

In [231]:
#-- Rounding values: this will change all of the values in the trips['tip_percent'] column --
trips['tip_percent'] = round(trips['tip_percent'], 4) #--change the value from a 4 to a 2 and see what happens--

In [233]:
trips.head(10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,tip_percent
0,2,2018-01-01 00:00:00,2018-01-01 00:00:00,1,9.14,1,N,229,17,2,27.0,0.0,0.5,0.0,0.0,0.3,27.8,0.0
1,2,2018-01-01 00:00:02,2018-01-01 00:08:48,1,1.36,1,N,68,249,2,7.5,0.5,0.5,0.0,0.0,0.3,8.8,0.0
2,1,2018-01-01 00:00:03,2018-01-01 00:21:06,1,6.1,1,N,255,236,2,20.5,0.5,0.5,0.0,0.0,0.3,21.8,0.0
3,2,2018-01-01 00:00:03,2018-01-01 00:03:52,3,0.99,1,N,236,43,2,5.5,0.5,0.5,0.0,0.0,0.3,6.8,0.0
4,1,2018-01-01 00:00:04,2018-01-01 00:13:24,1,3.6,1,N,37,157,2,13.5,0.5,0.5,0.0,0.0,0.3,14.8,0.0
5,2,2018-01-01 00:00:04,2018-01-01 00:08:13,1,1.59,1,N,141,161,2,8.0,0.5,0.5,0.0,0.0,0.3,9.3,0.0
6,1,2018-01-01 00:00:06,2018-01-01 00:24:34,1,6.9,1,N,162,49,1,23.5,0.5,0.5,4.95,0.0,0.3,29.75,0.1664
7,1,2018-01-01 00:00:11,2018-01-01 00:06:05,1,1.7,1,N,238,143,1,7.0,0.5,0.5,1.65,0.0,0.3,9.95,0.1658
8,1,2018-01-01 00:00:13,2018-01-01 00:07:03,1,0.9,1,N,144,249,1,6.0,0.5,0.5,1.8,0.0,0.3,9.1,0.1978
9,1,2018-01-01 00:00:14,2018-01-01 00:11:38,1,4.0,1,N,170,232,2,14.0,0.5,0.5,0.0,0.0,0.3,15.3,0.0


In [239]:
#--Using .apply() to find the sum of all taxi ride totals in this dataset--
trips[['total_amount']].apply(np.sum, axis=0)

total_amount    312235.2
dtype: float64

In [249]:
zones.head()

Unnamed: 0,locationid,zone,borough
0,1,Newark Airport,EWR
1,2,Jamaica Bay,Queens
2,3,Allerton/Pelham Gardens,Bronx
3,4,Alphabet City,Manhattan
4,5,Arden Heights,Staten Island


In [250]:
#--Applying .lower() to columns in the zones df--
zones['zone'] = zones['zone'].str.lower()

In [251]:
zones.head()

Unnamed: 0,locationid,zone,borough
0,1,newark airport,EWR
1,2,jamaica bay,Queens
2,3,allerton/pelham gardens,Bronx
3,4,alphabet city,Manhattan
4,5,arden heights,Staten Island


## Renaming Things
- The rename() method can be used to rename any index, column, or row

In [316]:
zones.columns

Index(['LOCATIONID', 'ZONE', 'BOROUGH'], dtype='object')

In [309]:
#--Renaming 1 column at a time--
zones.rename(columns={'zone': 'ZONE'}, inplace=True)

In [310]:
zones.columns

Index(['LOCATIONID', 'ZONE', 'BOROUGH'], dtype='object')

In [305]:
#--Renaming multiple columns at a time: 3 different approaches--

# zones.rename(columns={'locationid': 'LOCATIONID', 'borough': 'BOROUGH'}, inplace=True) #--Where inplace=True causes the name change to stay with the dataframe zones
# zones.rename({'locationid': 'LOCATIONID', 'borough': 'BOROUGH'},  axis='columns')
zones.rename({'locationid': 'LOCATIONID','borough': 'BOROUGH'}, inplace=True, axis= 1)

In [311]:
zones.columns

Index(['LOCATIONID', 'ZONE', 'BOROUGH'], dtype='object')

In [315]:
#--Renaming rows and index--





## groupby and unstack

## Resample and timedelta

In [321]:
#--timedelta
#-- https://docs.python.org/3/library/datetime.html
from datetime import timedelta

In [319]:
hours = pd.read_csv('data/hourly_taxi_counts_2018.tsv', sep='\t')

In [320]:
hours.head()

Unnamed: 0,date,n_trips
0,2018-03-01 00:00:00,7568
1,2018-03-01 01:00:00,4330
2,2018-03-01 02:00:00,2753
3,2018-03-01 03:00:00,1844
4,2018-03-01 04:00:00,1921
