# Data Manipulation in Python
## INFORMS 2020 Python Workshop
### Amy Newman

## Import Packages

In [235]:
import pandas as pd
import numpy as np
import json

## Loading / Saving JSON 
- JSON (Javascript Object Notation)
    - it's a standardized format to pass data around; it's easy for machines and humans to use
    - it's a form of text data written in Javascript notation and can be sent to and from a server
    - Twitter and Facebook are examples where you would retrieve data in JSON format
- JSON Viewer: can give a better picture of the structure of your data. Copy and paste the JSON into it. 
    - https://codebeautify.org/jsonviewer or https://jsonformatter.org/json-pretty-print

In [444]:
#--JSON: kind of looks like a dictionary, huh?
json1 = { "firstName": "Alan", "lastName": "Doe", "hobbies": ["running", "climbing", "dancing"], 
        "age": 35, "children": [ { "firstName": "Sam", "age": 6 }, { "firstName": "Jamie", "age": 8 } ] }



#--If you put json1 in the JSON Code Beautifier, it will give a structure that looks like json2--
json2 = {
    "firstName": "Alan",
    "lastName": "Doe",
    "hobbies": ["running", "climbing", "dancing"],
    "age": 35,
    "children": [
        {
            "firstName": "Sam",
            "age": 6
        },
        {
            "firstName": "Jamie",
            "age": 8
        }
    ]
}

In [447]:
#--Accessing data within the json file is a little like accessing data in a Python dictionary--
json1['firstName']

'Alan'

In [448]:
#--Notice that to get to 'Sam', we have to go through a dictionary, a list, and then a dictionary--
json1['children'][0]['firstName']

'Sam'

In [446]:
#--Using json.dumps() to convert a Python object and create a string--
pdict = {'team': 'Brazil', 'position': 1, 'host': True, 'lastGame': 'Win'}
dictToJson = json.dumps(pdict)
print(type(dictToJson))
print(dictToJson)

<class 'str'>
{"team": "Brazil", "position": 1, "host": true, "lastGame": "Win"}


#### Loading JSON Files
- json.loads() takes a file-like object, reads the data from that object, and uses that string to create a json object
- json.dumps() takes an json object and produces a string

In [377]:
#--Loading JSON Tweet data: Full data set--WORKS--
with open('data/tweets_from_nyc_taxis.json', 'r') as f:
    tweets = f.readlines()

In [378]:
len(tweets)

13269

In [None]:
filename = 'data/mini_tweets_nyc_taxi.json'

In [438]:
with open(filename, 'r') as f:
    read_tweet = f.read()

str

In [411]:
#--Puts your Twitter data into a dataframe--WORKS
df = pd.read_json(filename, orient = 'records', lines = True)

In [408]:
with open(filename, "r") as read_file:
    data = json.load(read_file)

AttributeError: 'dict' object has no attribute 'load'

In [409]:
filename = 'data/mini_tweets_nyc_taxi.json' #--WORKS--
with open(filename, 'r') as f:
    tweets = f.readlines()

In [434]:
type(tweets)

list

In [432]:
jsonTodict = json.loads(tweets[0]).items()

AttributeError: 'dict' object has no attribute 'loads'

In [427]:
tweet_lst = []

with open(filename, 'r') as f:
    for line in f.readlines():
        print(type(line))
#         tweet_lst.append(line)

<class 'str'>
<class 'str'>
<class 'str'>


AttributeError: 'dict' object has no attribute 'loads'

In [410]:
tweets

['{"created_at": "Tue May 01 14:18:26 +0000 2018", "id": 991320947700523009, "id_str": "991320947700523009", "text": "@NeilECollins @nyc311 Thank you for reading, Mr. Collins.  Having a TLC driver license is a privilege.  Taxi passen\\u2026 https://t.co/78C3glo1yl", "truncated": true, "entities": {"hashtags": [], "symbols": [], "user_mentions": [{"screen_name": "NeilECollins", "name": "Neil Collins", "id": 2813412205, "id_str": "2813412205", "indices": [0, 13]}, {"screen_name": "nyc311", "name": "New York City 311", "id": 37683414, "id_str": "37683414", "indices": [14, 21]}], "urls": [{"url": "https://t.co/78C3glo1yl", "expanded_url": "https://twitter.com/i/web/status/991320947700523009", "display_url": "twitter.com/i/web/status/9\\u2026", "indices": [117, 140]}]}, "source": "<a href=\\"http://twitter.com\\" rel=\\"nofollow\\">Twitter Web Client</a>", "in_reply_to_status_id": 991311021905207298, "in_reply_to_status_id_str": "991311021905207298", "in_reply_to_user_id": 2813412205, "in_r

In [464]:
# with open(filename) as tweet_data:
#     json_data = json.load(tweet_data.readlines())

In [465]:
# tweets[0].dumps()

In [387]:
#--Loading JSON Tweet data: mini data set--
filename = 'data/mini_tweets_nyc_taxi.json'
f = open(filename, 'r', encoding = 'UTF-8', errors = "replace")

tweet_lst = []

for line in f.readlines():
#     print(line)
    tweet_lst.append(line)
#     lst = line.split('\n')


In [127]:
tw_lst = []
for elem in tweets:
    tw_lst.append(elem)

In [131]:
tw_lst[0]

'{"created_at": "Tue May 01 14:18:26 +0000 2018", "id": 991320947700523009, "id_str": "991320947700523009", "text": "@NeilECollins @nyc311 Thank you for reading, Mr. Collins.  Having a TLC driver license is a privilege.  Taxi passen\\u2026 https://t.co/78C3glo1yl", "truncated": true, "entities": {"hashtags": [], "symbols": [], "user_mentions": [{"screen_name": "NeilECollins", "name": "Neil Collins", "id": 2813412205, "id_str": "2813412205", "indices": [0, 13]}, {"screen_name": "nyc311", "name": "New York City 311", "id": 37683414, "id_str": "37683414", "indices": [14, 21]}], "urls": [{"url": "https://t.co/78C3glo1yl", "expanded_url": "https://twitter.com/i/web/status/991320947700523009", "display_url": "twitter.com/i/web/status/9\\u2026", "indices": [117, 140]}]}, "source": "<a href=\\"http://twitter.com\\" rel=\\"nofollow\\">Twitter Web Client</a>", "in_reply_to_status_id": 991311021905207298, "in_reply_to_status_id_str": "991311021905207298", "in_reply_to_user_id": 2813412205, "in_re

In [125]:
tweets[0]

'{"created_at": "Tue May 01 14:18:26 +0000 2018", "id": 991320947700523009, "id_str": "991320947700523009", "text": "@NeilECollins @nyc311 Thank you for reading, Mr. Collins.  Having a TLC driver license is a privilege.  Taxi passen\\u2026 https://t.co/78C3glo1yl", "truncated": true, "entities": {"hashtags": [], "symbols": [], "user_mentions": [{"screen_name": "NeilECollins", "name": "Neil Collins", "id": 2813412205, "id_str": "2813412205", "indices": [0, 13]}, {"screen_name": "nyc311", "name": "New York City 311", "id": 37683414, "id_str": "37683414", "indices": [14, 21]}], "urls": [{"url": "https://t.co/78C3glo1yl", "expanded_url": "https://twitter.com/i/web/status/991320947700523009", "display_url": "twitter.com/i/web/status/9\\u2026", "indices": [117, 140]}]}, "source": "<a href=\\"http://twitter.com\\" rel=\\"nofollow\\">Twitter Web Client</a>", "in_reply_to_status_id": 991311021905207298, "in_reply_to_status_id_str": "991311021905207298", "in_reply_to_user_id": 2813412205, "in_re

In [71]:
#--Pretty print the first tweet to get a sense of the data--
parsed = json.loads(tweets[0])
print(json.dumps(parsed, indent=4, sort_keys=True))

{
    "contributors": null,
    "coordinates": null,
    "created_at": "Tue May 01 14:18:26 +0000 2018",
    "entities": {
        "hashtags": [],
        "symbols": [],
        "urls": [
            {
                "display_url": "twitter.com/i/web/status/9\u2026",
                "expanded_url": "https://twitter.com/i/web/status/991320947700523009",
                "indices": [
                    117,
                    140
                ],
                "url": "https://t.co/78C3glo1yl"
            }
        ],
        "user_mentions": [
            {
                "id": 2813412205,
                "id_str": "2813412205",
                "indices": [
                    0,
                    13
                ],
                "name": "Neil Collins",
                "screen_name": "NeilECollins"
            },
            {
                "id": 37683414,
                "id_str": "37683414",
                "indices": [
                    14,
                    21
   

In [74]:
your_json = '["foo", {"bar":["baz", null, 1.0, 2]}]'
parsed = json.loads(your_json)
print(json.dumps(parsed, indent=4, sort_keys=True))

[
    "foo",
    {
        "bar": [
            "baz",
            null,
            1.0,
            2
        ]
    }
]


In [98]:
#--Loading and saving the JSON data--
with open('data/tweets_from_nyc_taxis.json', 'r', encoding='utf-8') as f:
    data = f.read()

In [99]:
type(data)

str

In [100]:
#--Loading and saving the JSON data--
with open('data/tweets_from_nyc_taxis.json', 'r') as f:
    data = f.readlines()

In [102]:
type(data)

list

In [117]:
#--Loading JSON Twitter data--
taxi_tweets = 'data/tweets_from_nyc_taxis.json'

with open(taxi_tweets) as f:
    data = json.loads(f.read())

JSONDecodeError: Extra data: line 2 column 1 (char 3082)

In [109]:
#--Saving JSON data to a file--
file_name = "data/data_file.json" 

with open(file_name, "w") as write_file:
    json.dump(data, write_file)

## Loading / Saving pandas

In [456]:
#--Loading a .csv file to pandas--
zones = pd.read_csv('data/taxi_zones_simple.csv')

#--Loading a .tsv.gz to pandas--
trips = pd.read_csv('data/clean_taxi_sample.tsv.gz', sep='\t')

In [466]:
#--taking a look at the first 5 rows to see if it loaded ok--
zones.head()

In [467]:
zones.shape

In [None]:
# trips.head()

In [None]:
# trips.shape()

In [462]:
#--Saving the zones dataframe to a .csv file--
zones.to_csv("data/zones_output.csv")
# zones.to_csv("data/zones_output.csv", index=False) #--to get rid of the index use this

#--Saving the zones dataframe to a .xls file--
zones.to_excel("data/zones_output.xls")

### *Exercise: Loading/Saving pandas

In [531]:
#--1. Load 'data/daily_taxi_counts_2018.tsv' using pandas. Assign the loaded data to the variable counts. 

counts = pd.read_csv('data/daily_taxi_counts_2018.tsv', sep='\t')

In [357]:
#--2. View your counts data and take a look at the shape--
# print(counts.shape())
# counts.head()

In [None]:
#--3. Save the dataframe to a file named 'counts_output.csv'
# counts.to_csv("data/counts_output.csv")

## Viewing / Selecting pandas
- .head()
    - allows you to view the first 5 rows in your data, including column names
- .sample(5)
    - this will give a random sample of 5 rows from the data (you can change from 5 to another number for more or less samples)
- .shape
    - allows you to see the number of rows and columns within your dataframe
- .loc[ ]
    - gets rows (or columns) with specific labels from the index
- .iloc[ ]
    - gets rows (or columns) at specific positions in the index (so it only takes integers)

In [468]:
#--Viewing the first 5 rows of your data--
trips.head()

#--Viewing the first 7 rows of your data: you can change from 7 to whatever number you need--
# trips.head(7) 

In [None]:
#--Gets a sample of rows from your dataframe. You can change the number to whatever you need here--
trips.sample(5)

In [476]:
#--Getting some basic information about the size and shape of the DataFrame:
print("The number of rows of the dataset is: ", len(trips))
print("The number of columns of the dataset is: ", len(trips.columns))
print("The shape of the dataset is: ", trips.shape)

In [475]:
#--Seeing all of your dataframe columns in list format--
trips.columns

In [469]:
trips['passenger_count'] #--extracting one column in Series format

#--This will give the same result as trips['passenger_count'] but uses dot notation:
# trips.passenger_count

In [373]:
#--Column Extraction: double square brackets b/c the column names are the list
dist_cost = trips[['PULocationID','trip_distance', 'total_amount']] 

In [470]:
#--Choosing rows based on a condition in a column--
id2 = trips.loc[trips['VendorID'] == 2]
id2.head()

In [474]:
#--Choosing rows by Combining multiple conditions--
trips.loc[(trips['trip_distance'] >= 5) & (trips['trip_distance'] <= 10)] #--want all trips beteween 5 and 10 miles

In [472]:
#--Choosing rows by excluding a condition--
trips.loc[trips['passenger_count'] != 1]

In [473]:
#--Choosing data by column--
trips.iloc[:,:4] #--This chooses all of the rows in the dataframe from column 0 to 3

In [None]:
#--Selecting certain columns from the entire dataframe by name, making a new dataframe--
trip_sub = trips[['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'total_amount']]

### *Exercise: Viewing/Selecting pandas 

In [None]:
#--1. View the data in the zones dataframe--

In [486]:
#--2. How many columns does the zones dataframe have?--
# len(zones.columns)

In [485]:
#--3. Create a subset of the zones dataframe called queens, where all of the data only comes from the borough queens--
# queens = zones.loc[zones['borough']=='Queens']
# queens

In [484]:
#--4. Find all of the data in the trips dataframe where the fare_amount is between 100 and 200--
# trips.loc[(trips['fare_amount'] >= 100) & (trips['fare_amount'] <= 200)]

## Checking / Changing Column Types
- Viewing column data
- Column data can be converted to different types including, but not limited to: str, float, and int
- Converting date data with dates and times to more usable formats

In [512]:
#--Converting column to int or float--
trips.head()

In [502]:
#--Check all the data types of the trips dataframe--
trips.dtypes

In [541]:
#--all the unique values in the payment_type column of the trips dataframe--
trips.payment_type.unique()

In [496]:
#--Converting all of the values in the payment_type column from int to float--
trips.payment_type = trips.payment_type.astype(float) #--This is one way to do it, there are options as well
trips.dtypes

In [530]:
#--Convert date column to datetime to make data more accessible--
counts['date'] = pd.to_datetime(counts.date)

In [534]:
#--Create new columns from datetime data, one with month data and one with day of week data--
counts['month'] = pd.DatetimeIndex(counts['date']).month
counts['day_of_week'] = pd.DatetimeIndex(counts['date']).dayofweek #--Sunday thru Saturday in numerical form
counts.head()

### *Exercise: Checking/Changing Column Types

In [None]:
#--1. Convert the payment_type column from trips from float back to int--
# trips.payment_type = trips.payment_type.astype(int)
# trips.dtypes

In [554]:
#--2. Create a column called 'day' in trips, where its values are the day number of the month--
# counts['day'] = pd.DatetimeIndex(counts['date']).day
counts.head()

Unnamed: 0,date,n_trips,month,day_of_week,day
0,2018-03-01,332802,3,3,1
1,2018-03-02,326944,3,4,2
2,2018-03-03,316823,3,5,3
3,2018-03-04,286326,3,6,4
4,2018-03-05,290631,3,0,5


## Summarizing
- descriptive statistics
    - df.describe()
    - df['column_name'].max()
    - df['column_name'].sum()
    - and more
- Any interesting findings in the trips dataframe?

In [153]:
#-- df.describe() returns statistical details like percentile, mean, std etc. of a data frame or a series of numeric values--
trips.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,1.565,1.7354,2.749124,1.031,158.82775,159.07455,1.40945,12.58336,0.49379,0.496975,1.621925,0.105998,0.29946,15.61176
std,0.495769,1.277289,2.961542,0.326411,71.059017,74.392553,0.518859,9.87974,0.060857,0.043052,2.437912,0.895354,0.017226,11.354625
min,1.0,0.0,0.0,1.0,3.0,1.0,1.0,-52.0,-0.5,-0.5,0.0,0.0,-0.3,-52.8
25%,1.0,1.0,1.0,1.0,107.0,95.0,1.0,6.5,0.5,0.5,0.0,0.0,0.3,8.76
50%,2.0,1.0,1.8,1.0,158.0,161.0,1.0,10.0,0.5,0.5,1.06,0.0,0.3,12.36
75%,2.0,2.0,3.4,1.0,234.0,234.0,2.0,15.5,0.5,0.5,2.45,0.0,0.3,18.96
max,2.0,6.0,37.82,6.0,265.0,265.0,4.0,250.0,1.0,0.5,80.0,26.02,0.3,250.8


### *Exercise: Summarizing

In [555]:
#--1. Find the maximum number of trips in the counts dataframe?--
counts.describe()

Unnamed: 0,n_trips,month,day_of_week,day
count,365.0,365.0,365.0,365.0
mean,281636.715068,6.526027,2.991781,15.720548
std,37644.05442,3.452584,2.006155,8.808321
min,122222.0,1.0,0.0,1.0
25%,261619.0,4.0,1.0,8.0
50%,286326.0,7.0,3.0,16.0
75%,308094.0,10.0,5.0,23.0
max,349840.0,12.0,6.0,31.0


In [558]:
#--2. What's the total number of trips in March in the counts dataframe?--
counts.loc[counts['month'] == 3].sum()

date           2018-03-012018-03-022018-03-032018-03-042018-0...
n_trips                                                  9429041
month                                                         93
day_of_week                                                   96
day                                                          496
dtype: object

## Replacing

In [187]:
zones.head()

Unnamed: 0,locationid,zone,borough
0,1,Newark Airport,EWR
1,2,Jamaica Bay,Queens
2,3,Allerton/Pelham Gardens,Bronx
3,4,Alphabet City,Manhattan
4,5,Arden Heights,Staten Island


In [259]:
# this will replace instanceas of "Newark Airport" with "Newark Liberty International Airport"
zones = zones.replace(to_replace =["Newark Airport"],  
                            value = "Newark Liberty International Airport") 

In [261]:
zones.head()

Unnamed: 0,locationid,zone,borough
0,1,Newark Liberty International Airport,EWR
1,2,Jamaica Bay,Queens
2,3,Allerton/Pelham Gardens,Bronx
3,4,Alphabet City,Manhattan
4,5,Arden Heights,Staten Island


In [None]:
#--Replace nonetype with.....






In [None]:
# # Making data frame from the csv file 
# df = pd.read_csv("nba.csv") 
  
# # will replace  Nan value in dataframe with value 0
# df.replace(to_replace = np.nan, value =0) 

### *Exercise: Replacing

In [None]:
#--1. 

## Make New Columns (vector ops)

In [561]:
trips.shape

In [229]:
#--tip percentage per trip--
trips['tip_percent'] = trips['tip_amount']/trips['total_amount']

In [562]:
trips.shape

In [563]:
# trips.head(10)

In [None]:
#--TODO: Add another example, then add an exercise here--

### *Exercise: Making New Columns with vector operations

In [None]:
#--1. 

## Applying Functions to Columns
- Applying a function directly to a column
    - Example: The tip_percent column we created above looks a bit messy, lets fix it with the round() function 
- Using .apply()
    - Allows you to apply a function along an axis in a dataframe

In [231]:
#-- Rounding values: this will change all of the values in the trips['tip_percent'] column --
trips['tip_percent'] = round(trips['tip_percent'], 4) #--change the value from a 4 to a 2 and see what happens--

In [233]:
trips.head(10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,tip_percent
0,2,2018-01-01 00:00:00,2018-01-01 00:00:00,1,9.14,1,N,229,17,2,27.0,0.0,0.5,0.0,0.0,0.3,27.8,0.0
1,2,2018-01-01 00:00:02,2018-01-01 00:08:48,1,1.36,1,N,68,249,2,7.5,0.5,0.5,0.0,0.0,0.3,8.8,0.0
2,1,2018-01-01 00:00:03,2018-01-01 00:21:06,1,6.1,1,N,255,236,2,20.5,0.5,0.5,0.0,0.0,0.3,21.8,0.0
3,2,2018-01-01 00:00:03,2018-01-01 00:03:52,3,0.99,1,N,236,43,2,5.5,0.5,0.5,0.0,0.0,0.3,6.8,0.0
4,1,2018-01-01 00:00:04,2018-01-01 00:13:24,1,3.6,1,N,37,157,2,13.5,0.5,0.5,0.0,0.0,0.3,14.8,0.0
5,2,2018-01-01 00:00:04,2018-01-01 00:08:13,1,1.59,1,N,141,161,2,8.0,0.5,0.5,0.0,0.0,0.3,9.3,0.0
6,1,2018-01-01 00:00:06,2018-01-01 00:24:34,1,6.9,1,N,162,49,1,23.5,0.5,0.5,4.95,0.0,0.3,29.75,0.1664
7,1,2018-01-01 00:00:11,2018-01-01 00:06:05,1,1.7,1,N,238,143,1,7.0,0.5,0.5,1.65,0.0,0.3,9.95,0.1658
8,1,2018-01-01 00:00:13,2018-01-01 00:07:03,1,0.9,1,N,144,249,1,6.0,0.5,0.5,1.8,0.0,0.3,9.1,0.1978
9,1,2018-01-01 00:00:14,2018-01-01 00:11:38,1,4.0,1,N,170,232,2,14.0,0.5,0.5,0.0,0.0,0.3,15.3,0.0


In [239]:
#--Using .apply() to find the sum of all taxi ride totals in this dataset--
trips[['total_amount']].apply(np.sum, axis=0)

total_amount    312235.2
dtype: float64

In [249]:
zones.head()

Unnamed: 0,locationid,zone,borough
0,1,Newark Airport,EWR
1,2,Jamaica Bay,Queens
2,3,Allerton/Pelham Gardens,Bronx
3,4,Alphabet City,Manhattan
4,5,Arden Heights,Staten Island


In [250]:
#--Applying .lower() to columns in the zones df--
zones['zone'] = zones['zone'].str.lower()

In [251]:
zones.head()

Unnamed: 0,locationid,zone,borough
0,1,newark airport,EWR
1,2,jamaica bay,Queens
2,3,allerton/pelham gardens,Bronx
3,4,alphabet city,Manhattan
4,5,arden heights,Staten Island


## Renaming Things
- The rename() method can be used to rename any index, column, or row

In [316]:
zones.columns

Index(['LOCATIONID', 'ZONE', 'BOROUGH'], dtype='object')

In [309]:
#--Renaming 1 column at a time--
zones.rename(columns={'zone': 'ZONE'}, inplace=True)

In [310]:
zones.columns

Index(['LOCATIONID', 'ZONE', 'BOROUGH'], dtype='object')

In [305]:
#--Renaming multiple columns at a time: 3 different approaches--

# zones.rename(columns={'locationid': 'LOCATIONID', 'borough': 'BOROUGH'}, inplace=True) #--Where inplace=True causes the name change to stay with the dataframe zones
# zones.rename({'locationid': 'LOCATIONID', 'borough': 'BOROUGH'},  axis='columns')
zones.rename({'locationid': 'LOCATIONID','borough': 'BOROUGH'}, inplace=True, axis= 1)

In [311]:
zones.columns

Index(['LOCATIONID', 'ZONE', 'BOROUGH'], dtype='object')

In [315]:
#--Renaming rows and index--





## groupby and unstack
- We use the groupby() function to split data into different groups based on some criteria
- ___FIX THIS: unstack() is a pandas reshaping function, that pivots a level of the index labels__

In [325]:
#--find dataset for groupby--

Unnamed: 0,LOCATIONID,ZONE,BOROUGH
0,1,Newark Airport,EWR
1,2,Jamaica Bay,Queens
2,3,Allerton/Pelham Gardens,Bronx
3,4,Alphabet City,Manhattan
4,5,Arden Heights,Staten Island


In [375]:
by_borough = zones.groupby('BOROUGH')
by_borough.groups

{'Bronx': Int64Index([  2,  17,  19,  30,  31,  45,  46,  50,  57,  58,  59,  68,  77,
              80,  93, 118, 125, 135, 146, 158, 166, 167, 168, 173, 181, 182,
             183, 184, 198, 199, 207, 211, 212, 219, 234, 239, 240, 241, 246,
             247, 249, 253, 258],
            dtype='int64'),
 'Brooklyn': Int64Index([ 10,  13,  16,  20,  21,  24,  25,  28,  32,  33,  34,  35,  36,
              38,  39,  48,  51,  53,  54,  60,  61,  62,  64,  65,  66,  70,
              71,  75,  76,  79,  84,  88,  90,  96, 105, 107, 110, 111, 122,
             132, 148, 149, 153, 154, 164, 176, 177, 180, 187, 188, 189, 194,
             209, 216, 221, 224, 226, 227, 254, 255, 256],
            dtype='int64'),
 'EWR': Int64Index([0], dtype='int64'),
 'Manhattan': Int64Index([  3,  11,  12,  23,  40,  41,  42,  44,  47,  49,  67,  73,  74,
              78,  86,  87,  89,  99, 102, 103, 104, 106, 112, 113, 115, 119,
             124, 126, 127, 136, 139, 140, 141, 142, 143, 147, 150, 151, 15

In [328]:
#--groupby() on boroughs--
zones.loc[zones.BOROUGH == 'Queens']

Unnamed: 0,LOCATIONID,ZONE,BOROUGH
1,2,Jamaica Bay,Queens
6,7,Astoria,Queens
7,8,Astoria Park,Queens
8,9,Auburndale,Queens
9,10,Baisley Park,Queens
...,...,...,...
225,226,Sunnyside,Queens
251,252,Whitestone,Queens
252,253,Willets Point,Queens
257,258,Woodhaven,Queens


## Resample and timedelta

In [321]:
#--timedelta: https://docs.python.org/3/library/datetime.html
from datetime import timedelta

In [319]:
hours = pd.read_csv('data/hourly_taxi_counts_2018.tsv', sep='\t')

In [320]:
hours.head()

Unnamed: 0,date,n_trips
0,2018-03-01 00:00:00,7568
1,2018-03-01 01:00:00,4330
2,2018-03-01 02:00:00,2753
3,2018-03-01 03:00:00,1844
4,2018-03-01 04:00:00,1921


## Other useful functions
- .unique()
- .sort_values()

In [353]:
#--Find unique values in a certain column--
zones.BOROUGH.unique()

array(['EWR', 'Queens', 'Bronx', 'Manhattan', 'Staten Island', 'Brooklyn'],
      dtype=object)

In [346]:
len(zones.ZONE.unique())

260

In [545]:
# .drop_duplicates()

In [355]:
#--Sort your dataframe by total amount pad for taxi trip, from greatest to least--
sort_bytotal = trips.sort_values('total_amount', ascending=False)
sort_bytotal.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,tip_percent
17881,2,2018-01-01 01:04:29,2018-01-01 01:04:32,2,0.0,5,N,68,68,1,250.0,0.0,0.5,0.0,0.0,0.3,250.8,0.0
12056,2,2018-01-01 00:46:06,2018-01-01 00:48:11,1,0.0,5,N,265,265,1,215.0,0.0,0.5,0.0,25.0,0.3,240.8,0.0
4021,2,2018-01-01 00:20:02,2018-01-01 01:21:02,1,4.56,5,N,158,265,2,175.0,0.0,0.5,0.0,10.5,0.3,186.3,0.0
14585,2,2018-01-01 00:54:08,2018-01-01 01:34:38,1,36.01,4,N,132,265,1,165.0,0.5,0.5,17.0,0.0,0.3,183.3,0.0927
19413,1,2018-01-01 01:09:04,2018-01-01 01:47:43,2,28.0,5,N,170,265,1,136.0,0.0,0.0,21.23,5.76,0.3,163.29,0.13
