# Pandas - Data Analysis Library

Pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool,
built on top of the Python programming language.

## Importing Pandas

In [1]:
import pandas as pd

In [2]:
pd.__version__

'1.2.1'

## Series

In [4]:
a = pd.Series(data = [1,2,3,4])
a

0    1
1    2
2    3
3    4
dtype: int64

In [5]:
type(a)

pandas.core.series.Series

In [6]:
b = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
b

a    1
b    2
c    3
d    4
dtype: int64

In [8]:
data_dict = {
    'k1': 1, 
    'k2': 2,
    'k3': 3,
    'k4': 4
}

pd.Series(data_dict)

k1    1
k2    2
k3    3
k4    4
dtype: int64

## DataFrame

In [10]:
b = pd.DataFrame(data= {"Nama" : ["Selly", "Emir"], "Umur": [12, 13]})
b

Unnamed: 0,Nama,Umur
0,Selly,12
1,Emir,13


In [11]:
type(b)

pandas.core.frame.DataFrame

![](pandas/series-and-dataframe.width-1200.png)

## Creating DataFrame from dictionary

In [12]:
data_dict = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}

In [13]:
df0 = pd.DataFrame(data_dict)
df0

Unnamed: 0,col_1,col_2
0,3,a
1,2,b
2,1,c
3,0,d


Specify orient='index' to create the DataFrame using dictionary keys as rows:

In [14]:
df1 = pd.DataFrame.from_dict(data_dict, orient='index')
df1

Unnamed: 0,0,1,2,3
col_1,3,2,1,0
col_2,a,b,c,d


When using the ‘index’ orientation, the column names can be specified manually:

In [15]:
df2 = pd.DataFrame.from_dict(data_dict, orient='index',
                       columns=['A', 'B', 'C', 'D'])
df2

Unnamed: 0,A,B,C,D
col_1,3,2,1,0
col_2,a,b,c,d


We can change the columns' name

In [16]:
df2.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [17]:
df2.columns = ["first", "second", "third", "fourth"]

In [18]:
df2

Unnamed: 0,first,second,third,fourth
col_1,3,2,1,0
col_2,a,b,c,d


In [20]:
df2.columns.values[3]

'fourth'

In [21]:
df2.columns.values[0] = "zero"

In [22]:
df2

Unnamed: 0,zero,second,third,fourth
col_1,3,2,1,0
col_2,a,b,c,d


### Exercise 1

1. Create the following dataframe
![](pandas/ex00.png)

In [23]:
data_dict = {
    'Age': [24, 13, 53],
    'Location': ['New York', 'Paris', 'Berlin'],
    'Name': ['John', 'Anna', 'Peter']
}

data_dict

{'Age': [24, 13, 53],
 'Location': ['New York', 'Paris', 'Berlin'],
 'Name': ['John', 'Anna', 'Peter']}

In [24]:
df = pd.DataFrame(data_dict)
df

Unnamed: 0,Age,Location,Name
0,24,New York,John
1,13,Paris,Anna
2,53,Berlin,Peter


2. change "Location" into "City"

In [25]:
df.columns.values[1] = 'City'
df

Unnamed: 0,Age,City,Name
0,24,New York,John
1,13,Paris,Anna
2,53,Berlin,Peter


# Open CSV file

We will be using data of Uber drive in 2016. The data can be obtained from Kaggle (https://www.kaggle.com/zusmani/uberdrives)

In [26]:
data = pd.read_csv("My Uber Drives - 2016.csv")
data

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
0,1/1/2016 21:11,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,1/2/2016 1:25,1/2/2016 1:37,Business,Fort Pierce,Fort Pierce,5.0,
2,1/2/2016 20:25,1/2/2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,1/5/2016 17:31,1/5/2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,1/6/2016 14:42,1/6/2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit
...,...,...,...,...,...,...,...
1151,12/31/2016 13:24,12/31/2016 13:42,Business,Kar?chi,Unknown Location,3.9,Temporary Site
1152,12/31/2016 15:03,12/31/2016 15:38,Business,Unknown Location,Unknown Location,16.2,Meeting
1153,12/31/2016 21:32,12/31/2016 21:50,Business,Katunayake,Gampaha,6.4,Temporary Site
1154,12/31/2016 22:08,12/31/2016 23:51,Business,Gampaha,Ilukwatta,48.2,Temporary Site


### Basic Operation

In [29]:
data.head(3)

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
0,1/1/2016 21:11,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,1/2/2016 1:25,1/2/2016 1:37,Business,Fort Pierce,Fort Pierce,5.0,
2,1/2/2016 20:25,1/2/2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies


In [30]:
data.tail()

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
1151,12/31/2016 13:24,12/31/2016 13:42,Business,Kar?chi,Unknown Location,3.9,Temporary Site
1152,12/31/2016 15:03,12/31/2016 15:38,Business,Unknown Location,Unknown Location,16.2,Meeting
1153,12/31/2016 21:32,12/31/2016 21:50,Business,Katunayake,Gampaha,6.4,Temporary Site
1154,12/31/2016 22:08,12/31/2016 23:51,Business,Gampaha,Ilukwatta,48.2,Temporary Site
1155,Totals,,,,,12204.7,


In [34]:
brs, klm = data.shape
print(f'banyak baris data = {brs:,}')
print(f'banyak kolom data = {klm}')

banyak baris data = 1,156
banyak kolom data = 7


In [35]:
data.dtypes

START_DATE*     object
END_DATE*       object
CATEGORY*       object
START*          object
STOP*           object
MILES*         float64
PURPOSE*        object
dtype: object

In [36]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1156 entries, 0 to 1155
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   START_DATE*  1156 non-null   object 
 1   END_DATE*    1155 non-null   object 
 2   CATEGORY*    1155 non-null   object 
 3   START*       1155 non-null   object 
 4   STOP*        1155 non-null   object 
 5   MILES*       1156 non-null   float64
 6   PURPOSE*     653 non-null    object 
dtypes: float64(1), object(6)
memory usage: 63.3+ KB


### Convert data type

It can be seen that the START_DATE* and END_DATE* is object type data. While in fact, it is a date

In [37]:
data1 = pd.DataFrame({"Cost":["5","5","7"],"Amount":[11,12,13],"Date": ["11-10-2020","12-10-2020","13-10-2020"]})
data1

Unnamed: 0,Cost,Amount,Date
0,5,11,11-10-2020
1,5,12,12-10-2020
2,7,13,13-10-2020


In [38]:
data1.dtypes

Cost      object
Amount     int64
Date      object
dtype: object

In [39]:
data1["Date"] = pd.to_datetime(data1["Date"])

In [40]:
data1["Cost"] = pd.to_numeric(data1["Cost"])

In [41]:
data1

Unnamed: 0,Cost,Amount,Date
0,5,11,2020-11-10
1,5,12,2020-12-10
2,7,13,2020-10-13


In [42]:
data1.dtypes

Cost               int64
Amount             int64
Date      datetime64[ns]
dtype: object

#### Apply to our dataframe

In [45]:
data.tail(3)

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
1153,12/31/2016 21:32,12/31/2016 21:50,Business,Katunayake,Gampaha,6.4,Temporary Site
1154,12/31/2016 22:08,12/31/2016 23:51,Business,Gampaha,Ilukwatta,48.2,Temporary Site
1155,Totals,,,,,12204.7,


In [44]:
# convert data to datetime format
pd.to_datetime(data["START_DATE*"], format='%m/%d/%Y %H:%M')

ValueError: time data 'Totals' does not match format '%m/%d/%Y %H:%M' (match)

In [46]:
pd.to_datetime(data["START_DATE*"],format='%m/%d/%Y %H:%M', errors = 'coerce')

0      2016-01-01 21:11:00
1      2016-01-02 01:25:00
2      2016-01-02 20:25:00
3      2016-01-05 17:31:00
4      2016-01-06 14:42:00
               ...        
1151   2016-12-31 13:24:00
1152   2016-12-31 15:03:00
1153   2016-12-31 21:32:00
1154   2016-12-31 22:08:00
1155                   NaT
Name: START_DATE*, Length: 1156, dtype: datetime64[ns]

In [47]:
data.dtypes

START_DATE*     object
END_DATE*       object
CATEGORY*       object
START*          object
STOP*           object
MILES*         float64
PURPOSE*        object
dtype: object

Why the `START_DATA*` is still object? because it is not changed in the data frame

In [48]:
data["START_DATE*"] = pd.to_datetime(data["START_DATE*"],format='%m/%d/%Y %H:%M', errors = 'coerce')

In [49]:
data.dtypes

START_DATE*    datetime64[ns]
END_DATE*              object
CATEGORY*              object
START*                 object
STOP*                  object
MILES*                float64
PURPOSE*               object
dtype: object

In [50]:
data["END_DATE*"] = pd.to_datetime(data["START_DATE*"],format='%m/%d/%Y %H:%M', errors = 'coerce')

In [51]:
data.dtypes

START_DATE*    datetime64[ns]
END_DATE*      datetime64[ns]
CATEGORY*              object
START*                 object
STOP*                  object
MILES*                float64
PURPOSE*               object
dtype: object

### Dataset summarization

In [52]:
data.describe()

Unnamed: 0,MILES*
count,1156.0
mean,21.115398
std,359.299007
min,0.5
25%,2.9
50%,6.0
75%,10.4
max,12204.7


In [53]:
data.describe(include='all')

  data.describe(include='all')
  data.describe(include='all')


Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
count,1155,1155,1155,1155,1155,1156.0,653
unique,1154,1154,2,177,188,,10
top,2016-06-28 23:34:00,2016-06-28 23:34:00,Business,Cary,Cary,,Meeting
freq,2,2,1078,201,203,,187
first,2016-01-01 21:11:00,2016-01-01 21:11:00,,,,,
last,2016-12-31 22:08:00,2016-12-31 22:08:00,,,,,
mean,,,,,,21.115398,
std,,,,,,359.299007,
min,,,,,,0.5,
25%,,,,,,2.9,


In [54]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1156 entries, 0 to 1155
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   START_DATE*  1155 non-null   datetime64[ns]
 1   END_DATE*    1155 non-null   datetime64[ns]
 2   CATEGORY*    1155 non-null   object        
 3   START*       1155 non-null   object        
 4   STOP*        1155 non-null   object        
 5   MILES*       1156 non-null   float64       
 6   PURPOSE*     653 non-null    object        
dtypes: datetime64[ns](2), float64(1), object(4)
memory usage: 63.3+ KB


In [57]:
data['START*'].nunique()

177

In [60]:
data['START*'].unique()

array(['Fort Pierce', 'West Palm Beach', 'Cary', 'Jamaica', 'New York',
       'Elmhurst', 'Midtown', 'East Harlem', 'Flatiron District',
       'Midtown East', 'Hudson Square', 'Lower Manhattan',
       "Hell's Kitchen", 'Downtown', 'Gulfton', 'Houston', 'Eagan Park',
       'Morrisville', 'Durham', 'Farmington Woods', 'Whitebridge',
       'Lake Wellingborough', 'Fayetteville Street', 'Raleigh',
       'Hazelwood', 'Fairmont', 'Meredith Townes', 'Apex', 'Chapel Hill',
       'Northwoods', 'Edgehill Farms', 'Tanglewood', 'Preston',
       'Eastgate', 'East Elmhurst', 'Jackson Heights', 'Long Island City',
       'Katunayaka', 'Unknown Location', 'Colombo', 'Nugegoda',
       'Islamabad', 'R?walpindi', 'Noorpur Shahan', 'Heritage Pines',
       'Westpark Place', 'Waverly Place', 'Wayne Ridge', 'Weston',
       'East Austin', 'West University', 'South Congress', 'The Drag',
       'Congress Ave District', 'Red River District', 'Georgian Acres',
       'North Austin', 'Coxville', 'Conven

In [58]:
# count of unique start locations
data["START*"].value_counts()

Cary                  201
Unknown Location      148
Morrisville            85
Whitebridge            68
Islamabad              57
                     ... 
Meredith                1
Connecticut Avenue      1
Red River District      1
Hayesville              1
Ingleside               1
Name: START*, Length: 177, dtype: int64

### > Exercise 2

1. Create the following dataframe with “Umur” is object type and convert it into integer
![](pandas/ex1.png)

In [61]:
df = pd.DataFrame({
    'Nama': ['Ahmad', 'Joko', 'Adi'],
    'Umur': ['12', '13', 15],
    'Kelas': [6, 7, 8]
})

df

Unnamed: 0,Nama,Umur,Kelas
0,Ahmad,12,6
1,Joko,13,7
2,Adi,15,8


In [62]:
df.dtypes

Nama     object
Umur     object
Kelas     int64
dtype: object

In [63]:
df['Umur'] = pd.to_numeric(df['Umur'])
df

Unnamed: 0,Nama,Umur,Kelas
0,Ahmad,12,6
1,Joko,13,7
2,Adi,15,8


In [64]:
df.dtypes

Nama     object
Umur      int64
Kelas     int64
dtype: object

2. Go to Kaggle, download the Titanic data and do the data basic exploration.\
head, tail, describe, info, size, shape

In [65]:
df = pd.read_csv('train.csv')

In [66]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [67]:
df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [68]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [70]:
df.size

10692

In [71]:
df.shape

(891, 12)

## Data Manipulation Tasks

There are five common data manipulations tasks:
1. Selecting/Indexing
2. Filtering
3. Sorting
4. Mutating/conditionally adding columns
5. Groupby/summarize

## 1. Selecting/Indexing

### `loc` and `iloc`

![](pandas/loc.png)

In [72]:
data.head()

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
0,2016-01-01 21:11:00,2016-01-01 21:11:00,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,2016-01-02 01:25:00,2016-01-02 01:25:00,Business,Fort Pierce,Fort Pierce,5.0,
2,2016-01-02 20:25:00,2016-01-02 20:25:00,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,2016-01-05 17:31:00,2016-01-05 17:31:00,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,2016-01-06 14:42:00,2016-01-06 14:42:00,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit


### Positional indexing

In [73]:
data.iloc[0:3, [1,3]]

Unnamed: 0,END_DATE*,START*
0,2016-01-01 21:11:00,Fort Pierce
1,2016-01-02 01:25:00,Fort Pierce
2,2016-01-02 20:25:00,Fort Pierce


In [74]:
data.iloc[:, 3:6]

Unnamed: 0,START*,STOP*,MILES*
0,Fort Pierce,Fort Pierce,5.1
1,Fort Pierce,Fort Pierce,5.0
2,Fort Pierce,Fort Pierce,4.8
3,Fort Pierce,Fort Pierce,4.7
4,Fort Pierce,West Palm Beach,63.7
...,...,...,...
1151,Kar?chi,Unknown Location,3.9
1152,Unknown Location,Unknown Location,16.2
1153,Katunayake,Gampaha,6.4
1154,Gampaha,Ilukwatta,48.2


In [75]:
data.iloc[1:3, 3:6]

Unnamed: 0,START*,STOP*,MILES*
1,Fort Pierce,Fort Pierce,5.0
2,Fort Pierce,Fort Pierce,4.8


### Label indexing

In [76]:
data.loc[0:5, :"START*"]

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*
0,2016-01-01 21:11:00,2016-01-01 21:11:00,Business,Fort Pierce
1,2016-01-02 01:25:00,2016-01-02 01:25:00,Business,Fort Pierce
2,2016-01-02 20:25:00,2016-01-02 20:25:00,Business,Fort Pierce
3,2016-01-05 17:31:00,2016-01-05 17:31:00,Business,Fort Pierce
4,2016-01-06 14:42:00,2016-01-06 14:42:00,Business,Fort Pierce
5,2016-01-06 17:15:00,2016-01-06 17:15:00,Business,West Palm Beach


In [77]:
data.loc[:, ["START_DATE*", "MILES*"]].head()

Unnamed: 0,START_DATE*,MILES*
0,2016-01-01 21:11:00,5.1
1,2016-01-02 01:25:00,5.0
2,2016-01-02 20:25:00,4.8
3,2016-01-05 17:31:00,4.7
4,2016-01-06 14:42:00,63.7


In [78]:
data.loc[:1154, ["START_DATE*", "MILES*"]]

Unnamed: 0,START_DATE*,MILES*
0,2016-01-01 21:11:00,5.1
1,2016-01-02 01:25:00,5.0
2,2016-01-02 20:25:00,4.8
3,2016-01-05 17:31:00,4.7
4,2016-01-06 14:42:00,63.7
...,...,...
1150,2016-12-31 01:07:00,0.7
1151,2016-12-31 13:24:00,3.9
1152,2016-12-31 15:03:00,16.2
1153,2016-12-31 21:32:00,6.4


In [79]:
a = data.loc[:, "START*"]

In [80]:
type(a)

pandas.core.series.Series

In [87]:
b = data.loc[:, ["START*", 'STOP*']].head()
b

Unnamed: 0,START*,STOP*
0,Fort Pierce,Fort Pierce
1,Fort Pierce,Fort Pierce
2,Fort Pierce,Fort Pierce
3,Fort Pierce,Fort Pierce
4,Fort Pierce,West Palm Beach


In [82]:
type(b)

pandas.core.frame.DataFrame

##### All function work in df, not in series

### > Exercise 3

1. Select columns: `START_DATE*, START*, STOP*`

In [84]:
df = data.loc[:, ['START_DATE*', 'START*', 'STOP*']]
df

Unnamed: 0,START_DATE*,START*,STOP*
0,2016-01-01 21:11:00,Fort Pierce,Fort Pierce
1,2016-01-02 01:25:00,Fort Pierce,Fort Pierce
2,2016-01-02 20:25:00,Fort Pierce,Fort Pierce
3,2016-01-05 17:31:00,Fort Pierce,Fort Pierce
4,2016-01-06 14:42:00,Fort Pierce,West Palm Beach
...,...,...,...
1151,2016-12-31 13:24:00,Kar?chi,Unknown Location
1152,2016-12-31 15:03:00,Unknown Location,Unknown Location
1153,2016-12-31 21:32:00,Katunayake,Gampaha
1154,2016-12-31 22:08:00,Gampaha,Ilukwatta


2. Extract the first & last 10 rows of the previous columns

In [85]:
df.head(10)

Unnamed: 0,START_DATE*,START*,STOP*
0,2016-01-01 21:11:00,Fort Pierce,Fort Pierce
1,2016-01-02 01:25:00,Fort Pierce,Fort Pierce
2,2016-01-02 20:25:00,Fort Pierce,Fort Pierce
3,2016-01-05 17:31:00,Fort Pierce,Fort Pierce
4,2016-01-06 14:42:00,Fort Pierce,West Palm Beach
5,2016-01-06 17:15:00,West Palm Beach,West Palm Beach
6,2016-01-06 17:30:00,West Palm Beach,Palm Beach
7,2016-01-07 13:27:00,Cary,Cary
8,2016-01-10 08:05:00,Cary,Morrisville
9,2016-01-10 12:17:00,Jamaica,New York


In [86]:
df.tail(10)

Unnamed: 0,START_DATE*,START*,STOP*
1146,2016-12-30 11:31:00,Kar?chi,Kar?chi
1147,2016-12-30 15:41:00,Kar?chi,Kar?chi
1148,2016-12-30 16:45:00,Kar?chi,Kar?chi
1149,2016-12-30 23:06:00,Kar?chi,Kar?chi
1150,2016-12-31 01:07:00,Kar?chi,Kar?chi
1151,2016-12-31 13:24:00,Kar?chi,Unknown Location
1152,2016-12-31 15:03:00,Unknown Location,Unknown Location
1153,2016-12-31 21:32:00,Katunayake,Gampaha
1154,2016-12-31 22:08:00,Gampaha,Ilukwatta
1155,NaT,,


## 2. Filtering

In [None]:
df1 = data.loc[data["MILES*"] > 10, ["MILES*"]]
df1

In [None]:
df1 = data.loc[data["MILES*"] > 10, ["START*"]]
df1

In [None]:
df2 = df1.loc[0:3]
df2

#### find all rides that is greater that 10 miles

In [None]:
data.loc[data["MILES*"] > 10]

#### find all rides from NY

In [None]:
data.loc[data["START*"] == "New York"]

In [None]:
data.loc[data["START*"] == "New York", ["MILES*", "STOP*"]]

#### Find out all rides from Cary & New York    

In [None]:
# match multiple condition
st = data[data["START*"].isin(["Cary","New York"])]
st.head(n = 10)

In [None]:
st.iloc[0:5, :]

In [None]:
st.loc[0:5, :]

In [None]:
st.reset_index(inplace = True, drop = True)

In [None]:
st

#### Find out all rides to Cary & Morrisville    

In [None]:
data[data["STOP*"].isin(["Cary","Morrisville"])]

### > Exercise 4

1. Find all trips that is greater than 10 miles and originated from New York and Morrisville

Hint: use `and`

## 3. Sorting

In [None]:
data.sort_values(by=["MILES*"], ascending=False)

In [None]:
data.sort_values(by=["START*"], ascending=True)

In [None]:
data.sort_values(by=["START*", "STOP*"], ascending=True)

## 4. Conditionally adding column

In [None]:
import numpy as np

In [None]:
data["DISTANCE"] = np.where(data["MILES*"] > 5, "Long Trip", "Short Trip")

In [None]:
data.head()

In [None]:
data["DISTANCE"].value_counts()

In [None]:
data["YEAR"] = np.array("2020")
data

In [None]:
data["TIME_CAT"] = np.where(data["START_DATE*"] > "1/5/2016", "New Trip", "Old Trip")

In [None]:
data

In [None]:
data['START*'].value_counts().head()

### Exercise 5

1. Create a new column with the following condition:\
    a) >10    : Long Trip\
    b) 5-10   : Medium Trip\
    c) <5     : Short Trip