### Reading tabular data

In [9]:
import pandas as pd

In [10]:
orders= pd.read_table('orders.tsv')
orders.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [11]:
user_cols = ['user id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('imdb_1000.csv', sep='|', header=None, names = user_cols)
users.head()

Unnamed: 0,user id,age,gender,occupation,zip_code
0,"star_rating,title,content_rating,genre,duratio...",,,,
1,"9.3,The Shawshank Redemption,R,Crime,142,""[u'T...",,,,
2,"9.2,The Godfather,R,Crime,175,""[u'Marlon Brand...",,,,
3,"9.1,The Godfather: Part II,R,Crime,200,""[u'Al ...",,,,
4,"9,The Dark Knight,PG-13,Action,152,""[u'Christi...",,,,


#### Reading only select columns

In [12]:
orders = pd.read_table('orders.tsv', usecols = ['item_name', 'item_price'])   # using column name
orders = pd.read_table('orders.tsv', usecols = [2,4])                         # using column index
orders.head()

Unnamed: 0,item_name,item_price
0,Chips and Fresh Tomato Salsa,$2.39
1,Izze,$3.39
2,Nantucket Nectar,$3.39
3,Chips and Tomatillo-Green Chili Salsa,$2.39
4,Chicken Bowl,$16.98


#### Read only the first number of rows

In [13]:
orders = pd.read_table('orders.tsv', nrows = 3)

### Select a Pandas Series from a DataFrame

In [14]:
ufo = pd.read_csv('ufo.csv')

In [15]:
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


#### Selecting a column as Series can be done using either bracket notation or dot notation. However, dot notation would not work if the column name has space in it.

In [16]:
ufo['City']
ufo.City
ufo['Colors Reported']

0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
5           NaN
6           NaN
7           NaN
8           NaN
9           NaN
10          NaN
11          NaN
12          RED
13          NaN
14          NaN
15          NaN
16          NaN
17          NaN
18          NaN
19          RED
20          NaN
21          NaN
22          NaN
23          NaN
24          NaN
25          NaN
26          NaN
27          NaN
28          NaN
29          NaN
          ...  
18211       NaN
18212       NaN
18213     GREEN
18214       NaN
18215       NaN
18216    ORANGE
18217       NaN
18218       NaN
18219       NaN
18220      BLUE
18221       NaN
18222       NaN
18223       NaN
18224       NaN
18225       NaN
18226       NaN
18227       NaN
18228       NaN
18229       NaN
18230       NaN
18231       NaN
18232       NaN
18233       RED
18234       NaN
18235       NaN
18236       NaN
18237       NaN
18238       NaN
18239       RED
18240       NaN
Name: Colors Reported, L

In [17]:
ufo.shape

(18241, 5)

#### Pandas Series can be joined together using the + operator.
#### Pandas DataFrame can assign a new column using the bracket notation, but not the dot notation.

In [18]:
ufo['Location'] = ufo.City + ', ' + ufo.State

In [19]:
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time,Location
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,"Ithaca, NY"
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,"Willingboro, NJ"
2,Holyoke,,OVAL,CO,2/15/1931 14:00,"Holyoke, CO"
3,Abilene,,DISK,KS,6/1/1931 13:00,"Abilene, KS"
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,"New York Worlds Fair, NY"


### Common methods and attributes for DataFrame

In [20]:
movies = pd.read_csv('imdb_1000.csv')
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [21]:
movies.describe()

Unnamed: 0,star_rating,duration
count,979.0,979.0
mean,7.889785,120.979571
std,0.336069,26.21801
min,7.4,64.0
25%,7.6,102.0
50%,7.8,117.0
75%,8.1,134.0
max,9.3,242.0


In [22]:
movies.shape

(979, 6)

In [23]:
movies.dtypes

star_rating       float64
title              object
content_rating     object
genre              object
duration            int64
actors_list        object
dtype: object

In [24]:
movies.describe(include=['object'])    #includes only objects in the DataFrame

Unnamed: 0,title,content_rating,genre,actors_list
count,979,976,979,979
unique,975,12,16,969
top,Dracula,R,Drama,"[u'Daniel Radcliffe', u'Emma Watson', u'Rupert..."
freq,2,460,278,6


### Rename columns in DataFrame

In [25]:
ufo = pd.read_csv('ufo.csv')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [26]:
ufo.columns

Index(['City', 'Colors Reported', 'Shape Reported', 'State', 'Time'], dtype='object')

In [27]:
ufo.rename(columns = {'Colors Reported': 'Colors_Reported', 'Shape Reported': 'Shape_Reported'})

Unnamed: 0,City,Colors_Reported,Shape_Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00
5,Valley City,,DISK,ND,9/15/1934 15:30
6,Crater Lake,,CIRCLE,CA,6/15/1935 0:00
7,Alma,,DISK,MI,7/15/1936 0:00
8,Eklutna,,CIGAR,AK,10/15/1936 17:00
9,Hubbard,,CYLINDER,OR,6/15/1937 0:00


In [28]:
ufo.rename({'Colors Reported': 'Colors_Reported', 'Shape Reported': 'Shape_Reported'})

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00
5,Valley City,,DISK,ND,9/15/1934 15:30
6,Crater Lake,,CIRCLE,CA,6/15/1935 0:00
7,Alma,,DISK,MI,7/15/1936 0:00
8,Eklutna,,CIGAR,AK,10/15/1936 17:00
9,Hubbard,,CYLINDER,OR,6/15/1937 0:00


#### Alternatively:

In [29]:
ufo_cols = ['city', 'colors', 'shape', 'state', 'time']
ufo.columns = ufo_cols
ufo.head()

Unnamed: 0,city,colors,shape,state,time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


#### Also, columns can be renamed during reading of the file:

In [30]:
ufo = pd.read_csv('ufo.csv', names = ufo_cols, header=0)
ufo.head()

Unnamed: 0,city,colors,shape,state,time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


#### Alternatively, str method can be used to quickly change column names:

In [31]:
ufo.columns = ufo.columns.str.replace(' ', '_')

### Remove columns from DataFrame

In [32]:
ufo = pd.read_csv('ufo.csv')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [33]:
ufo.drop('Colors Reported', axis = 1, inplace=True)    # axis = 0 for row, axis = 1 for column

In [34]:
ufo.head()

Unnamed: 0,City,Shape Reported,State,Time
0,Ithaca,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,OTHER,NJ,6/30/1930 20:00
2,Holyoke,OVAL,CO,2/15/1931 14:00
3,Abilene,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,LIGHT,NY,4/18/1933 19:00


#### Passing column names in a list to remove multiple columns at once:

In [35]:
ufo.drop(['City', 'State'], axis = 1) #alternatively,
ufo.drop(columns=['City', 'State'],inplace=True)
ufo.head()

Unnamed: 0,Shape Reported,Time
0,TRIANGLE,6/1/1930 22:00
1,OTHER,6/30/1930 20:00
2,OVAL,2/15/1931 14:00
3,DISK,6/1/1931 13:00
4,LIGHT,4/18/1933 19:00


#### Similarly, set axis=0 and pass row index to drop entire rows from the DataFrame:

In [36]:
ufo.drop( ufo.index[1] , axis=0, inplace=True)
ufo.head()

Unnamed: 0,Shape Reported,Time
0,TRIANGLE,6/1/1930 22:00
2,OVAL,2/15/1931 14:00
3,DISK,6/1/1931 13:00
4,LIGHT,4/18/1933 19:00
5,DISK,9/15/1934 15:30


### Sorting DataFrame or Series

In [37]:
movies = pd.read_csv('imdb_1000.csv')
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


#### Sorting a column and returning just the column as a Series:

In [38]:
movies.title.sort_values()      #using dot notation
movies['title'].sort_values()   #using bracket notation

542                   (500) Days of Summer
5                             12 Angry Men
201                       12 Years a Slave
698                              127 Hours
110                  2001: A Space Odyssey
910                                   2046
596                               21 Grams
624                              25th Hour
708                       28 Days Later...
60                                3 Idiots
225                                 3-Iron
570                                    300
555                           3:10 to Yuma
427           4 Months, 3 Weeks and 2 Days
824                                     42
597                                  50/50
203                                  8 1/2
170                       A Beautiful Mind
941                       A Bridge Too Far
571                           A Bronx Tale
266                      A Christmas Story
86                      A Clockwork Orange
716                         A Few Good Men
750        

#### Sorting the whole table by a column and returning a DataFrame:

In [39]:
movies.sort_values('title')

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
542,7.8,(500) Days of Summer,PG-13,Comedy,95,"[u'Zooey Deschanel', u'Joseph Gordon-Levitt', ..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
201,8.1,12 Years a Slave,R,Biography,134,"[u'Chiwetel Ejiofor', u'Michael Kenneth Willia..."
698,7.6,127 Hours,R,Adventure,94,"[u'James Franco', u'Amber Tamblyn', u'Kate Mara']"
110,8.3,2001: A Space Odyssey,G,Mystery,160,"[u'Keir Dullea', u'Gary Lockwood', u'William S..."
910,7.5,2046,R,Drama,129,"[u'Tony Chiu Wai Leung', u'Ziyi Zhang', u'Faye..."
596,7.7,21 Grams,R,Crime,124,"[u'Sean Penn', u'Benicio Del Toro', u'Naomi Wa..."
624,7.7,25th Hour,R,Crime,135,"[u'Edward Norton', u'Barry Pepper', u'Philip S..."
708,7.6,28 Days Later...,R,Horror,113,"[u'Cillian Murphy', u'Naomie Harris', u'Christ..."
60,8.5,3 Idiots,PG-13,Comedy,170,"[u'Aamir Khan', u'Madhavan', u'Mona Singh']"


#### Sorting by multiple columns by passing a list (sorted in the order of the list):

In [40]:
movies.sort_values(['content_rating', 'duration'])

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
713,7.6,The Jungle Book,APPROVED,Animation,78,"[u'Phil Harris', u'Sebastian Cabot', u'Louis P..."
513,7.8,Invasion of the Body Snatchers,APPROVED,Horror,80,"[u'Kevin McCarthy', u'Dana Wynter', u'Larry Ga..."
272,8.1,The Killing,APPROVED,Crime,85,"[u'Sterling Hayden', u'Coleen Gray', u'Vince E..."
703,7.6,Dracula,APPROVED,Horror,85,"[u'Bela Lugosi', u'Helen Chandler', u'David Ma..."
612,7.7,A Hard Day's Night,APPROVED,Comedy,87,"[u'John Lennon', u'Paul McCartney', u'George H..."
58,8.5,Paths of Glory,APPROVED,Drama,88,"[u'Kirk Douglas', u'Ralph Meeker', u'Adolphe M..."
210,8.1,Laura,APPROVED,Film-Noir,88,"[u'Gene Tierney', u'Dana Andrews', u'Clifton W..."
656,7.7,Snow White and the Seven Dwarfs,APPROVED,Animation,88,"[u'Adriana Caselotti', u'Harry Stockwell', u'L..."
844,7.5,Pinocchio,APPROVED,Animation,88,"[u'Dickie Jones', u'Christian Rub', u'Mel Blanc']"
233,8.1,The Night of the Hunter,APPROVED,Crime,92,"[u'Robert Mitchum', u'Shelley Winters', u'Lill..."


### Filter rows by column value:

In [41]:
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


#### first create a mask:

In [42]:
duration_mask = []
for length in movies.duration:
    if length >= 200:
        duration_mask.append(True)
    else:
        duration_mask.append(False)

#### Convert the mask into a Pandas Series and apply to DataFrame:

In [43]:
is_long = pd.Series(duration_mask)
movies[is_long]

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
7,8.9,The Lord of the Rings: The Return of the King,PG-13,Adventure,201,"[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK..."
17,8.7,Seven Samurai,UNRATED,Drama,207,"[u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K..."
78,8.4,Once Upon a Time in America,R,Crime,229,"[u'Robert De Niro', u'James Woods', u'Elizabet..."
85,8.4,Lawrence of Arabia,PG,Adventure,216,"[u""Peter O'Toole"", u'Alec Guinness', u'Anthony..."
142,8.3,Lagaan: Once Upon a Time in India,PG,Adventure,224,"[u'Aamir Khan', u'Gracy Singh', u'Rachel Shell..."
157,8.2,Gone with the Wind,G,Drama,238,"[u'Clark Gable', u'Vivien Leigh', u'Thomas Mit..."
204,8.1,Ben-Hur,G,Adventure,212,"[u'Charlton Heston', u'Jack Hawkins', u'Stephe..."
445,7.9,The Ten Commandments,APPROVED,Adventure,220,"[u'Charlton Heston', u'Yul Brynner', u'Anne Ba..."
476,7.8,Hamlet,PG-13,Drama,242,"[u'Kenneth Branagh', u'Julie Christie', u'Dere..."


#### Alternatively, this code can be much shorter:

In [44]:
is_long = movies.duration >= 200
movies[is_long]

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
7,8.9,The Lord of the Rings: The Return of the King,PG-13,Adventure,201,"[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK..."
17,8.7,Seven Samurai,UNRATED,Drama,207,"[u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K..."
78,8.4,Once Upon a Time in America,R,Crime,229,"[u'Robert De Niro', u'James Woods', u'Elizabet..."
85,8.4,Lawrence of Arabia,PG,Adventure,216,"[u""Peter O'Toole"", u'Alec Guinness', u'Anthony..."
142,8.3,Lagaan: Once Upon a Time in India,PG,Adventure,224,"[u'Aamir Khan', u'Gracy Singh', u'Rachel Shell..."
157,8.2,Gone with the Wind,G,Drama,238,"[u'Clark Gable', u'Vivien Leigh', u'Thomas Mit..."
204,8.1,Ben-Hur,G,Adventure,212,"[u'Charlton Heston', u'Jack Hawkins', u'Stephe..."
445,7.9,The Ten Commandments,APPROVED,Adventure,220,"[u'Charlton Heston', u'Yul Brynner', u'Anne Ba..."
476,7.8,Hamlet,PG-13,Drama,242,"[u'Kenneth Branagh', u'Julie Christie', u'Dere..."


#### Alternatively, to make the code even shorter:

In [45]:
movies[movies.duration >= 200]

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
7,8.9,The Lord of the Rings: The Return of the King,PG-13,Adventure,201,"[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK..."
17,8.7,Seven Samurai,UNRATED,Drama,207,"[u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K..."
78,8.4,Once Upon a Time in America,R,Crime,229,"[u'Robert De Niro', u'James Woods', u'Elizabet..."
85,8.4,Lawrence of Arabia,PG,Adventure,216,"[u""Peter O'Toole"", u'Alec Guinness', u'Anthony..."
142,8.3,Lagaan: Once Upon a Time in India,PG,Adventure,224,"[u'Aamir Khan', u'Gracy Singh', u'Rachel Shell..."
157,8.2,Gone with the Wind,G,Drama,238,"[u'Clark Gable', u'Vivien Leigh', u'Thomas Mit..."
204,8.1,Ben-Hur,G,Adventure,212,"[u'Charlton Heston', u'Jack Hawkins', u'Stephe..."
445,7.9,The Ten Commandments,APPROVED,Adventure,220,"[u'Charlton Heston', u'Yul Brynner', u'Anne Ba..."
476,7.8,Hamlet,PG-13,Drama,242,"[u'Kenneth Branagh', u'Julie Christie', u'Dere..."


#### To get a column after filtering:

In [46]:
movies[movies.duration >= 200]['genre']         # Using bracket notation
movies[movies.duration >= 200].genre            # Using dot notation
movies.loc[movies.duration >= 200, 'genre']     # Best practice: using .loc method

2          Crime
7      Adventure
17         Drama
78         Crime
85     Adventure
142    Adventure
157        Drama
204    Adventure
445    Adventure
476        Drama
630    Biography
767       Action
Name: genre, dtype: object

### Multiple filter criteria

In [47]:
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


#### Filter that satisfies ALL conditions across multiple columns:

In [48]:
movies[(movies.duration >= 200) & (movies.genre == 'Drama')]    # Must use '&' as the 'and' operator

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
17,8.7,Seven Samurai,UNRATED,Drama,207,"[u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K..."
157,8.2,Gone with the Wind,G,Drama,238,"[u'Clark Gable', u'Vivien Leigh', u'Thomas Mit..."
476,7.8,Hamlet,PG-13,Drama,242,"[u'Kenneth Branagh', u'Julie Christie', u'Dere..."


#### Filter that satisfies ANY conditions across multiple columns:

In [49]:
movies[(movies.duration >= 200) | (movies.genre == 'Drama')]    # Must use '|' as the 'or' operator

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
7,8.9,The Lord of the Rings: The Return of the King,PG-13,Adventure,201,"[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."
13,8.8,Forrest Gump,PG-13,Drama,142,"[u'Tom Hanks', u'Robin Wright', u'Gary Sinise']"
16,8.7,One Flew Over the Cuckoo's Nest,R,Drama,133,"[u'Jack Nicholson', u'Louise Fletcher', u'Mich..."
17,8.7,Seven Samurai,UNRATED,Drama,207,"[u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K..."
22,8.7,It's a Wonderful Life,APPROVED,Drama,130,"[u'James Stewart', u'Donna Reed', u'Lionel Bar..."
24,8.7,Se7en,R,Drama,127,"[u'Morgan Freeman', u'Brad Pitt', u'Kevin Spac..."
27,8.6,The Silence of the Lambs,R,Drama,118,"[u'Jodie Foster', u'Anthony Hopkins', u'Lawren..."


#### Filter that satisfies multiple conditions in one column:

In [50]:
movies[(movies.genre == 'Drama') | (movies.genre == 'Action')]            # works
movies[movies.genre.isin(['Crime', 'Drama'])]                             # cleander code

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."
13,8.8,Forrest Gump,PG-13,Drama,142,"[u'Tom Hanks', u'Robin Wright', u'Gary Sinise']"
16,8.7,One Flew Over the Cuckoo's Nest,R,Drama,133,"[u'Jack Nicholson', u'Louise Fletcher', u'Mich..."
17,8.7,Seven Samurai,UNRATED,Drama,207,"[u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K..."
21,8.7,City of God,R,Crime,130,"[u'Alexandre Rodrigues', u'Matheus Nachtergael..."


### Iterating through DataFrame

In [200]:
for index, row in movies.iterrows():
    print(index, row.title, row.star_rating)

star_rating                                                     9.3
title                                      The Shawshank Redemption
content_rating                                                    R
genre                                                         Crime
duration                                                        142
actors_list       [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...
Name: 0, dtype: object
star_rating                                                   9.2
title                                               The Godfather
content_rating                                                  R
genre                                                       Crime
duration                                                      175
actors_list       [u'Marlon Brando', u'Al Pacino', u'James Caan']
Name: 1, dtype: object
star_rating                                                     9.1
title                                        The Godfather: Part II
content_rating

Name: 349, dtype: object
star_rating                                                       8
title                                             Shadow of a Doubt
content_rating                                             APPROVED
genre                                                      Thriller
duration                                                        108
actors_list       [u'Teresa Wright', u'Joseph Cotten', u'Macdona...
Name: 350, dtype: object
star_rating                                                       8
title                                                  Frankenstein
content_rating                                              UNRATED
genre                                                        Horror
duration                                                         70
actors_list       [u'Colin Clive', u'Mae Clarke', u'Boris Karloff']
Name: 351, dtype: object
star_rating                                                      8
title                                     

Name: 665, dtype: object
star_rating                                                     7.7
title                                              Eastern Promises
content_rating                                                    R
genre                                                         Crime
duration                                                        100
actors_list       [u'Naomi Watts', u'Viggo Mortensen', u'Armin M...
Name: 666, dtype: object
star_rating                                                     7.7
title                                             Midnight in Paris
content_rating                                                PG-13
genre                                                        Comedy
duration                                                         94
actors_list       [u'Owen Wilson', u'Rachel McAdams', u'Kathy Ba...
Name: 667, dtype: object
star_rating                                                     7.7
title                                    

### Selecting data from DataFrame

In [52]:
ufo.sample(n=3, random_state=1)

Unnamed: 0,Shape Reported,Time
4384,FORMATION,7/17/1980 22:10
1925,OVAL,1/16/1970 19:00
2555,DISK,11/15/1973 17:30


In [53]:
train = ufo.sample(frac=0.75, random_state=99)
test = ufo.loc[~ufo.index.isin(train.index), :] # selecting only portion of the data not part of the train set

### Drop non-numeric columns from a DataFrame

In [54]:
drinks = pd.read_csv('drinks.csv')
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [55]:
import numpy as np
drinks.select_dtypes(include=[np.number]).dtypes

beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
dtype: object

### 'axis' parameter in Pandas

In [56]:
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [57]:
drinks.drop('continent', axis = 1).head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,Afghanistan,0,0,0,0.0
1,Albania,89,132,54,4.9
2,Algeria,25,0,14,0.7
3,Andorra,245,138,312,12.4
4,Angola,217,57,45,5.9


In [58]:
drinks.drop(2, axis = 0).head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa
5,Antigua & Barbuda,102,128,45,4.9,North America


In [59]:
drinks.mean(axis = 0)    #the parameter is by default axis=0

beer_servings                   106.160622
spirit_servings                  80.994819
wine_servings                    49.450777
total_litres_of_pure_alcohol      4.717098
dtype: float64

In [60]:
drinks.mean(axis = 1)

0        0.000
1       69.975
2        9.925
3      176.850
4       81.225
5       69.975
6      111.825
7       53.700
8      138.850
9      138.675
10      18.325
11      88.825
12      28.500
13       0.000
14      89.575
15     142.850
16     150.375
17      97.950
18      13.025
19       5.850
20      54.950
21      65.400
22      62.100
23     103.300
24       8.650
25     146.825
26      10.825
27      23.575
28      12.250
29      55.000
        ...   
163     79.650
164     24.675
165    101.300
166    143.800
167     14.250
168      4.325
169     91.100
170     55.725
171      1.525
172     14.575
173     15.775
174     91.600
175     18.825
176     20.350
177     31.050
178     14.250
179     15.575
180    124.225
181     39.700
182    137.600
183     12.175
184    124.925
185     94.150
186     34.100
187     12.725
188    110.925
189     29.000
190      1.525
191     14.375
192     22.675
Length: 193, dtype: float64

In [61]:
drinks.mean(axis='index')     #'index' is an alias for axis = 0

beer_servings                   106.160622
spirit_servings                  80.994819
wine_servings                    49.450777
total_litres_of_pure_alcohol      4.717098
dtype: float64

In [62]:
drinks.mean(axis = 'columns')     #'columns' is an alias for axis = 1

0        0.000
1       69.975
2        9.925
3      176.850
4       81.225
5       69.975
6      111.825
7       53.700
8      138.850
9      138.675
10      18.325
11      88.825
12      28.500
13       0.000
14      89.575
15     142.850
16     150.375
17      97.950
18      13.025
19       5.850
20      54.950
21      65.400
22      62.100
23     103.300
24       8.650
25     146.825
26      10.825
27      23.575
28      12.250
29      55.000
        ...   
163     79.650
164     24.675
165    101.300
166    143.800
167     14.250
168      4.325
169     91.100
170     55.725
171      1.525
172     14.575
173     15.775
174     91.600
175     18.825
176     20.350
177     31.050
178     14.250
179     15.575
180    124.225
181     39.700
182    137.600
183     12.175
184    124.925
185     94.150
186     34.100
187     12.725
188    110.925
189     29.000
190      1.525
191     14.375
192     22.675
Length: 193, dtype: float64

### string methods in Pandas

In [63]:
orders= pd.read_table('orders.tsv')
orders.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


#### string method is called upon by using .str

In [64]:
orders.item_name.str.upper()

0                CHIPS AND FRESH TOMATO SALSA
1                                        IZZE
2                            NANTUCKET NECTAR
3       CHIPS AND TOMATILLO-GREEN CHILI SALSA
4                                CHICKEN BOWL
5                                CHICKEN BOWL
6                               SIDE OF CHIPS
7                               STEAK BURRITO
8                            STEAK SOFT TACOS
9                               STEAK BURRITO
10                        CHIPS AND GUACAMOLE
11                       CHICKEN CRISPY TACOS
12                         CHICKEN SOFT TACOS
13                               CHICKEN BOWL
14                        CHIPS AND GUACAMOLE
15      CHIPS AND TOMATILLO-GREEN CHILI SALSA
16                            CHICKEN BURRITO
17                            CHICKEN BURRITO
18                                CANNED SODA
19                               CHICKEN BOWL
20                        CHIPS AND GUACAMOLE
21                           BARBA

### Filter column by using string methods

In [65]:
orders[orders.item_name.str.contains('Chicken')]

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
11,6,1,Chicken Crispy Tacos,"[Roasted Chili Corn Salsa, [Fajita Vegetables,...",$8.75
12,6,1,Chicken Soft Tacos,"[Roasted Chili Corn Salsa, [Rice, Black Beans,...",$8.75
13,7,1,Chicken Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...",$11.25
16,8,1,Chicken Burrito,"[Tomatillo-Green Chili Salsa (Medium), [Pinto ...",$8.49
17,9,1,Chicken Burrito,"[Fresh Tomato Salsa (Mild), [Black Beans, Rice...",$8.49
19,10,1,Chicken Bowl,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$8.75
23,12,1,Chicken Burrito,"[[Tomatillo-Green Chili Salsa (Medium), Tomati...",$10.98
26,13,1,Chicken Bowl,"[Roasted Chili Corn Salsa (Medium), [Pinto Bea...",$8.49


#### str methods can be chained together:

In [66]:
orders.choice_description.str.replace('[','').str.replace(']','')

0                                                     NaN
1                                              Clementine
2                                                   Apple
3                                                     NaN
4       Tomatillo-Red Chili Salsa (Hot), Black Beans, ...
5       Fresh Tomato Salsa (Mild), Rice, Cheese, Sour ...
6                                                     NaN
7       Tomatillo Red Chili Salsa, Fajita Vegetables, ...
8       Tomatillo Green Chili Salsa, Pinto Beans, Chee...
9       Fresh Tomato Salsa, Rice, Black Beans, Pinto B...
10                                                    NaN
11      Roasted Chili Corn Salsa, Fajita Vegetables, R...
12      Roasted Chili Corn Salsa, Rice, Black Beans, C...
13      Fresh Tomato Salsa, Fajita Vegetables, Rice, C...
14                                                    NaN
15                                                    NaN
16      Tomatillo-Green Chili Salsa (Medium), Pinto Be...
17      Fresh 

#### Also, regex works in str methods:

In [67]:
orders.choice_description.str.replace('[\[\]]','')

0                                                     NaN
1                                              Clementine
2                                                   Apple
3                                                     NaN
4       Tomatillo-Red Chili Salsa (Hot), Black Beans, ...
5       Fresh Tomato Salsa (Mild), Rice, Cheese, Sour ...
6                                                     NaN
7       Tomatillo Red Chili Salsa, Fajita Vegetables, ...
8       Tomatillo Green Chili Salsa, Pinto Beans, Chee...
9       Fresh Tomato Salsa, Rice, Black Beans, Pinto B...
10                                                    NaN
11      Roasted Chili Corn Salsa, Fajita Vegetables, R...
12      Roasted Chili Corn Salsa, Rice, Black Beans, C...
13      Fresh Tomato Salsa, Fajita Vegetables, Rice, C...
14                                                    NaN
15                                                    NaN
16      Tomatillo-Green Chili Salsa (Medium), Pinto Be...
17      Fresh 

### Change data type of a Pandas Series

In [68]:
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [69]:
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [70]:
drinks['beer_servings'] = drinks.beer_servings.astype(float)
drinks.dtypes

country                          object
beer_servings                   float64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

#### Alternatively, dtypes can be assigned when loading the file:

In [71]:
drinks = pd.read_csv('drinks.csv', dtype={'beer_servings': float})
drinks.dtypes

country                          object
beer_servings                   float64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

### Change data types of multiple DataFrame columns

In [72]:
drinks = pd.read_csv('drinks.csv')
drinks = drinks.astype({'beer_servings':'float', 'spirit_servings':'float'})

### Working with object types that Pandas does not recognize

In [73]:
orders.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [74]:
orders.dtypes

order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
dtype: object

#### we can use string method to get rid of the dollar sign, and then use astype method to change it to float:

In [75]:
orders.item_price.str.replace('$','').astype(float).mean()

7.464335785374397

#### Similiarly, boolean values can be changed to int for computing purpose (machine learning)

In [76]:
orders.item_name.str.contains('Chicken').astype(int).head()

0    0
1    0
2    0
3    0
4    1
Name: item_name, dtype: int64

### Groupby

In [203]:
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


#### Categorizing data in one column by value in another:

In [206]:
drinks.groupby('continent').beer_servings.mean()

continent
Africa            61.471698
Asia              37.045455
Europe           193.777778
North America    145.434783
Oceania           89.687500
South America    175.083333
Name: beer_servings, dtype: float64

In [79]:
drinks.groupby('continent').beer_servings.agg(['count', 'min', 'max', 'mean'])

Unnamed: 0_level_0,count,min,max,mean
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,53,0.0,376.0,61.471698
Asia,44,0.0,247.0,37.045455
Europe,45,0.0,361.0,193.777778
North America,23,1.0,285.0,145.434783
Oceania,16,0.0,306.0,89.6875
South America,12,93.0,333.0,175.083333


In [80]:
drinks.beer_servings.agg(['mean', 'min', 'max'])

mean    106.160622
min       0.000000
max     376.000000
Name: beer_servings, dtype: float64

In [81]:
drinks.agg(['mean', 'min', 'max'])

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
min,Afghanistan,0.0,0.0,0.0,0.0,Africa
max,Zimbabwe,376.0,438.0,370.0,14.4,South America
mean,,106.160622,80.994819,49.450777,4.717098,


### Pandas Series

In [82]:
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [83]:
movies.genre.describe()

count       979
unique       16
top       Drama
freq        278
Name: genre, dtype: object

#### Count value occurance in a column

In [84]:
movies.genre.value_counts()

Drama        278
Comedy       156
Action       136
Crime        124
Biography     77
Adventure     75
Animation     62
Horror        29
Mystery       16
Western        9
Thriller       5
Sci-Fi         5
Film-Noir      3
Family         2
Fantasy        1
History        1
Name: genre, dtype: int64

#### Value occurance frequency in a column

In [85]:
movies.genre.value_counts(normalize=True)

Drama        0.283963
Comedy       0.159346
Action       0.138917
Crime        0.126660
Biography    0.078652
Adventure    0.076609
Animation    0.063330
Horror       0.029622
Mystery      0.016343
Western      0.009193
Thriller     0.005107
Sci-Fi       0.005107
Film-Noir    0.003064
Family       0.002043
Fantasy      0.001021
History      0.001021
Name: genre, dtype: float64

#### Unique items in a column

In [86]:
movies.genre.unique()

array(['Crime', 'Action', 'Drama', 'Western', 'Adventure', 'Biography',
       'Comedy', 'Animation', 'Mystery', 'Horror', 'Film-Noir', 'Sci-Fi',
       'History', 'Thriller', 'Family', 'Fantasy'], dtype=object)

#### Number of unique items in a column

In [87]:
movies.genre.nunique()

16

#### Crosstab: Tally counts of each genre in each content_rating

In [88]:
pd.crosstab(movies.genre, movies.content_rating)

content_rating,APPROVED,G,GP,NC-17,NOT RATED,PASSED,PG,PG-13,R,TV-MA,UNRATED,X
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Action,3,1,1,0,4,1,11,44,67,0,3,0
Adventure,3,2,0,0,5,1,21,23,17,0,2,0
Animation,3,20,0,0,3,0,25,5,5,0,1,0
Biography,1,2,1,0,1,0,6,29,36,0,0,0
Comedy,9,2,1,1,16,3,23,23,73,0,4,1
Crime,6,0,0,1,7,1,6,4,87,0,11,1
Drama,12,3,0,4,24,1,25,55,143,1,9,1
Family,0,1,0,0,0,0,1,0,0,0,0,0
Fantasy,0,0,0,0,0,0,0,0,1,0,0,0
Film-Noir,1,0,0,0,1,0,0,0,0,0,1,0


### Handling missing values

In [89]:
ufo = pd.read_csv('ufo.csv')
ufo.tail()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
18236,Grant Park,,TRIANGLE,IL,12/31/2000 23:00
18237,Spirit Lake,,DISK,IA,12/31/2000 23:00
18238,Eagle River,,,WI,12/31/2000 23:45
18239,Eagle River,RED,LIGHT,WI,12/31/2000 23:45
18240,Ybor,,OVAL,FL,12/31/2000 23:59


In [90]:
ufo.isnull().tail()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
18236,False,True,False,False,False
18237,False,True,False,False,False
18238,False,True,True,False,False
18239,False,False,False,False,False
18240,False,True,False,False,False


In [91]:
ufo.notnull().tail()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
18236,True,False,True,True,True
18237,True,False,True,True,True
18238,True,False,False,True,True
18239,True,True,True,True,True
18240,True,False,True,True,True


#### Pandas treats true/false as 1/0 and can perform numerical operations on them:

In [92]:
ufo.isnull().sum()    # sum() by default has axis=0, which sums in the column direction

City                  25
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64

#### Drop an entire row if any of the columns has a nan value

In [93]:
ufo.dropna(how='any')

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
12,Belton,RED,SPHERE,SC,6/30/1939 20:00
19,Bering Sea,RED,OTHER,AK,4/30/1943 23:00
36,Portsmouth,RED,FORMATION,VA,7/10/1945 1:30
44,Blairsden,GREEN,SPHERE,CA,6/30/1946 19:00
82,San Jose,BLUE,CHEVRON,CA,7/15/1947 21:00
84,Modesto,BLUE,DISK,CA,8/8/1947 22:00
91,Scipio,RED,SPHERE,IN,5/10/1948 19:00
111,Tarrant City,ORANGE,CIRCLE,AL,8/15/1949 22:00
129,Napa,GREEN,DISK,CA,6/10/1950 0:00
138,Coeur d'Alene,ORANGE,CIGAR,ID,7/2/1950 13:00


#### Drop an entire row only of all of the columns has nan value

In [94]:
ufo.dropna(how='all')

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00
5,Valley City,,DISK,ND,9/15/1934 15:30
6,Crater Lake,,CIRCLE,CA,6/15/1935 0:00
7,Alma,,DISK,MI,7/15/1936 0:00
8,Eklutna,,CIGAR,AK,10/15/1936 17:00
9,Hubbard,,CYLINDER,OR,6/15/1937 0:00


#### Drop an entire row only if the specified columns has nan value

In [95]:
ufo.dropna(subset=['City', 'Shape Reported'], how='any')

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00
5,Valley City,,DISK,ND,9/15/1934 15:30
6,Crater Lake,,CIRCLE,CA,6/15/1935 0:00
7,Alma,,DISK,MI,7/15/1936 0:00
8,Eklutna,,CIGAR,AK,10/15/1936 17:00
9,Hubbard,,CYLINDER,OR,6/15/1937 0:00


### Filling missing values

In [96]:
ufo['Shape Reported'].fillna(value='VARIOUS', inplace=True)
ufo['Shape Reported'].value_counts(dropna=False)

VARIOUS      2977
LIGHT        2803
DISK         2122
TRIANGLE     1889
OTHER        1402
CIRCLE       1365
SPHERE       1054
FIREBALL     1039
OVAL          845
CIGAR         617
FORMATION     434
RECTANGLE     303
CYLINDER      294
CHEVRON       248
DIAMOND       234
EGG           197
FLASH         188
TEARDROP      119
CONE           60
CROSS          36
DELTA           7
CRESCENT        2
ROUND           2
FLARE           1
PYRAMID         1
DOME            1
HEXAGON         1
Name: Shape Reported, dtype: int64

### Pandas Indexing

In [97]:
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0.0,0.0,0,0.0,Asia
1,Albania,89.0,132.0,54,4.9,Europe
2,Algeria,25.0,0.0,14,0.7,Africa
3,Andorra,245.0,138.0,312,12.4,Europe
4,Angola,217.0,57.0,45,5.9,Africa


In [98]:
drinks.loc[23, 'beer_servings']    # get value from specific row, column

245.0

#### Select a column as index, and select data from the table:

In [99]:
drinks.set_index('country', inplace=True)
drinks.head()

Unnamed: 0_level_0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,0.0,0.0,0,0.0,Asia
Albania,89.0,132.0,54,4.9,Europe
Algeria,25.0,0.0,14,0.7,Africa
Andorra,245.0,138.0,312,12.4,Europe
Angola,217.0,57.0,45,5.9,Africa


In [100]:
drinks.loc['Brazil', 'beer_servings']

245.0

#### To put the index back into the column, first give it a name, and then reset index

In [101]:
drinks.index.name = 'country'
drinks.reset_index(inplace=True)
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0.0,0.0,0,0.0,Asia
1,Albania,89.0,132.0,54,4.9,Europe
2,Algeria,25.0,0.0,14,0.7,Africa
3,Andorra,245.0,138.0,312,12.4,Europe
4,Angola,217.0,57.0,45,5.9,Africa


#### Methods that return DataFrame can be used with loc method to select item from table based on some calculated attributes:

In [102]:
drinks.describe()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
count,193.0,193.0,193.0,193.0
mean,106.160622,80.994819,49.450777,4.717098
std,101.143103,88.284312,79.697598,3.773298
min,0.0,0.0,0.0,0.0
25%,20.0,4.0,1.0,1.3
50%,76.0,56.0,8.0,4.2
75%,188.0,128.0,59.0,7.2
max,376.0,438.0,370.0,14.4


In [103]:
drinks.describe().loc['25%', 'beer_servings']

20.0

In [104]:
drinks.continent.head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: object

#### Sort by column as index

In [212]:
drinks.set_index('country', inplace=True)    # Set a column as index
drinks.continent.value_counts().sort_index()

Africa           53
Asia             44
Europe           45
North America    23
Oceania          16
South America    12
Name: continent, dtype: int64

#### Series with index can multiply with the proper alignment: 

In [106]:
people = pd.Series([3000000, 85000], index = ['Albania', 'Andorra'] , name = 'population')
people

Albania    3000000
Andorra      85000
Name: population, dtype: int64

In [107]:
drinks.beer_servings * people

Afghanistan                     NaN
Albania                 267000000.0
Algeria                         NaN
Andorra                  20825000.0
Angola                          NaN
Antigua & Barbuda               NaN
Argentina                       NaN
Armenia                         NaN
Australia                       NaN
Austria                         NaN
Azerbaijan                      NaN
Bahamas                         NaN
Bahrain                         NaN
Bangladesh                      NaN
Barbados                        NaN
Belarus                         NaN
Belgium                         NaN
Belize                          NaN
Benin                           NaN
Bhutan                          NaN
Bolivia                         NaN
Bosnia-Herzegovina              NaN
Botswana                        NaN
Brazil                          NaN
Brunei                          NaN
Bulgaria                        NaN
Burkina Faso                    NaN
Burundi                     

#### Combining DataFrame with Seiries, with index alignment

In [108]:
pd.concat([drinks, people], axis = 1).head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent,population
Afghanistan,0.0,0.0,0,0.0,Asia,
Albania,89.0,132.0,54,4.9,Europe,3000000.0
Algeria,25.0,0.0,14,0.7,Africa,
Andorra,245.0,138.0,312,12.4,Europe,85000.0
Angola,217.0,57.0,45,5.9,Africa,


### Slicing, selecting multiple rows and columns

In [109]:
ufo = pd.read_csv('ufo.csv')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [110]:
ufo.loc[0, :]

City                       Ithaca
Colors Reported               NaN
Shape Reported           TRIANGLE
State                          NY
Time               6/1/1930 22:00
Name: 0, dtype: object

In [111]:
ufo.loc[0:2, :]

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00


In [112]:
ufo.loc[:, 'City']

0                      Ithaca
1                 Willingboro
2                     Holyoke
3                     Abilene
4        New York Worlds Fair
5                 Valley City
6                 Crater Lake
7                        Alma
8                     Eklutna
9                     Hubbard
10                    Fontana
11                   Waterloo
12                     Belton
13                     Keokuk
14                  Ludington
15                Forest Home
16                Los Angeles
17                  Hapeville
18                     Oneida
19                 Bering Sea
20                   Nebraska
21                        NaN
22                        NaN
23                  Owensboro
24                 Wilderness
25                  San Diego
26                 Wilderness
27                     Clovis
28                 Los Alamos
29               Ft. Duschene
                 ...         
18211                 Holyoke
18212                  Carson
18213     

In [113]:
ufo[['City', 'State']]

Unnamed: 0,City,State
0,Ithaca,NY
1,Willingboro,NJ
2,Holyoke,CO
3,Abilene,KS
4,New York Worlds Fair,NY
5,Valley City,ND
6,Crater Lake,CA
7,Alma,MI
8,Eklutna,AK
9,Hubbard,OR


In [114]:
ufo.loc[:, ['City', 'State']]

Unnamed: 0,City,State
0,Ithaca,NY
1,Willingboro,NJ
2,Holyoke,CO
3,Abilene,KS
4,New York Worlds Fair,NY
5,Valley City,ND
6,Crater Lake,CA
7,Alma,MI
8,Eklutna,AK
9,Hubbard,OR


In [115]:
ufo.loc[:, 'City' : 'State']

Unnamed: 0,City,Colors Reported,Shape Reported,State
0,Ithaca,,TRIANGLE,NY
1,Willingboro,,OTHER,NJ
2,Holyoke,,OVAL,CO
3,Abilene,,DISK,KS
4,New York Worlds Fair,,LIGHT,NY
5,Valley City,,DISK,ND
6,Crater Lake,,CIRCLE,CA
7,Alma,,DISK,MI
8,Eklutna,,CIGAR,AK
9,Hubbard,,CYLINDER,OR


In [116]:
ufo.drop('Time', axis =1)

Unnamed: 0,City,Colors Reported,Shape Reported,State
0,Ithaca,,TRIANGLE,NY
1,Willingboro,,OTHER,NJ
2,Holyoke,,OVAL,CO
3,Abilene,,DISK,KS
4,New York Worlds Fair,,LIGHT,NY
5,Valley City,,DISK,ND
6,Crater Lake,,CIRCLE,CA
7,Alma,,DISK,MI
8,Eklutna,,CIGAR,AK
9,Hubbard,,CYLINDER,OR


#### Filtering the data table by value in a column

In [117]:
ufo[ufo.City=='Oakland']

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
1694,Oakland,,CIGAR,CA,7/21/1968 14:00
2144,Oakland,,DISK,CA,8/19/1971 0:00
4686,Oakland,,LIGHT,MD,6/1/1982 0:00
7293,Oakland,,LIGHT,CA,3/28/1994 17:00
8488,Oakland,,,CA,8/10/1995 21:45
8768,Oakland,,,CA,10/10/1995 22:40
10816,Oakland,,LIGHT,OR,10/1/1997 21:30
10948,Oakland,,DISK,CA,11/14/1997 19:55
11045,Oakland,,TRIANGLE,CA,12/10/1997 1:30
12322,Oakland,,FIREBALL,CA,10/9/1998 19:40


In [118]:
ufo.loc[ufo.City=='Oakland', :]

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
1694,Oakland,,CIGAR,CA,7/21/1968 14:00
2144,Oakland,,DISK,CA,8/19/1971 0:00
4686,Oakland,,LIGHT,MD,6/1/1982 0:00
7293,Oakland,,LIGHT,CA,3/28/1994 17:00
8488,Oakland,,,CA,8/10/1995 21:45
8768,Oakland,,,CA,10/10/1995 22:40
10816,Oakland,,LIGHT,OR,10/1/1997 21:30
10948,Oakland,,DISK,CA,11/14/1997 19:55
11045,Oakland,,TRIANGLE,CA,12/10/1997 1:30
12322,Oakland,,FIREBALL,CA,10/9/1998 19:40


### Pandas Dataframe optimization

In [119]:
drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 193 entries, Afghanistan to Zimbabwe
Data columns (total 5 columns):
beer_servings                   193 non-null float64
spirit_servings                 193 non-null float64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       193 non-null object
dtypes: float64(3), int64(1), object(1)
memory usage: 35.4 KB


In [120]:
drinks.memory_usage(deep=True)

Index                           17708
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                       12332
dtype: int64

In [121]:
sorted(drinks.continent.unique())

['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

#### Instead of storing the strings, we can store integers that represent these strings

In [122]:
drinks['continent']=drinks.continent.astype('category')

In [123]:
drinks.continent.head()

country
Afghanistan      Asia
Albania        Europe
Algeria        Africa
Andorra        Europe
Angola         Africa
Name: continent, dtype: category
Categories (6, object): [Africa, Asia, Europe, North America, Oceania, South America]

#### the column can also be imported as a category while reading in the file

In [124]:
drinks = pd.read_csv('drinks.csv', dtype={'continent':'category'})
drinks.dtypes

country                           object
beer_servings                      int64
spirit_servings                    int64
wine_servings                      int64
total_litres_of_pure_alcohol     float64
continent                       category
dtype: object

#### Pandas is now storing the strings as codes, and we can check them below:

In [125]:
drinks.continent.cat.codes.head()

0    1
1    2
2    0
3    2
4    0
dtype: int8

In [126]:
drinks.memory_usage(deep=True)

Index                              80
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

#### Logical ordering of strings

In [127]:
df = pd.DataFrame({'ID': [100,101,102,103], 'Quality':['good', 'very good', 'good', 'excellent']})

In [128]:
from pandas.api.types import CategoricalDtype
quality_cat = CategoricalDtype(['good', 'very good', 'excellent'], ordered = True)
df['Quality'] = df.Quality.astype(quality_cat)
df.sort_values(by='Quality')

Unnamed: 0,ID,Quality
0,100,good
2,102,good
1,101,very good
3,103,excellent


#### And as a result, now we can use this for filtering too

In [129]:
df.loc[df.Quality>'good', :]

Unnamed: 0,ID,Quality
1,101,very good
3,103,excellent


### Creating dummy variables in Pandas

In [213]:
train = pd.read_csv('titanic_train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [215]:
train['Sex_male'] = train['Sex'].map({'female':0, 'male':1})
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_male
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


#### Alternatively: 

In [132]:
pd.get_dummies(train.Sex)

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
5,0,1
6,0,1
7,0,1
8,1,0
9,1,0


In [133]:
pd.get_dummies(train.Sex, prefix='Sex').iloc[:, 1:]

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1
5,1
6,1
7,1
8,0
9,0


In [134]:
embarked_dummies = pd.get_dummies(train.Embarked, prefix="Embarked").iloc[:, 1:]

In [135]:
pd.concat([train, embarked_dummies], axis =1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,1,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,1,0,1
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,1,1,0
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1,0,1
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,1,0,1
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,0,0,1
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,0,0,0


#### Finally, to do this for multiple columns at once in a DataFrame

In [136]:
pd.get_dummies(train, columns=['Sex', 'Embarked'], drop_first=True)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Sex_male.1,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,,1,1,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,C123,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,,1,1,0,1
5,6,0,3,"Moran, Mr. James",,0,0,330877,8.4583,,1,1,1,0
6,7,0,1,"McCarthy, Mr. Timothy J",54.0,0,0,17463,51.8625,E46,1,1,0,1
7,8,0,3,"Palsson, Master. Gosta Leonard",2.0,3,1,349909,21.0750,,1,1,0,1
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",27.0,0,2,347742,11.1333,,0,0,0,1
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",14.0,1,0,237736,30.0708,,0,0,0,0


### Working with dates and times

In [137]:
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [138]:
ufo['Time']=pd.to_datetime(ufo.Time)

In [139]:
ufo.dtypes

City                       object
Colors Reported            object
Shape Reported             object
State                      object
Time               datetime64[ns]
dtype: object

#### Now the Time can be used for comparison:

In [140]:
ts = pd.to_datetime('1/1/1999')
ufo.loc[ufo.Time >= ts, :]

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
12832,Loma Rica,,LIGHT,CA,1999-01-01 02:30:00
12833,Bauxite,,,AR,1999-01-01 03:00:00
12834,Florence,,CYLINDER,SC,1999-01-01 14:00:00
12835,Lake Henshaw,,CIGAR,CA,1999-01-01 15:00:00
12836,Wilmington Island,,LIGHT,GA,1999-01-01 17:15:00
12837,DeWitt,,LIGHT,AR,1999-01-01 18:00:00
12838,Bainbridge Island,,,WA,1999-01-01 19:12:00
12839,Camano Island,,FIREBALL,WA,1999-01-01 19:30:00
12840,Cheaha Mountain,,TRIANGLE,AL,1999-01-01 21:00:00
12841,Coyoty Canyon,,DISK,NM,1999-01-01 21:00:00


In [141]:
(ufo.Time.max() - ufo.Time.min()).days

25781

#### Creating date time from columns with specific names that Pandas recognize as date time

In [142]:
df = pd.DataFrame([[12,25,2017,10], [1,15,2018,11]], columns=['month', 'day', 'year','hour'])
df

Unnamed: 0,month,day,year,hour
0,12,25,2017,10
1,1,15,2018,11


In [143]:
pd.to_datetime(df)

0   2017-12-25 10:00:00
1   2018-01-15 11:00:00
dtype: datetime64[ns]

In [144]:
df.index = pd.to_datetime(df[['month', 'day', 'year']])
df

Unnamed: 0,month,day,year,hour
2017-12-25,12,25,2017,10
2018-01-15,1,15,2018,11


### Removing duplicate rows

In [233]:
orders = pd.read_table('orders.tsv')
orders.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [217]:
orders.shape

(4622, 5)

In [147]:
orders.item_name.duplicated() #duplicated values in a column

0       False
1       False
2       False
3       False
4       False
5        True
6       False
7       False
8       False
9        True
10      False
11      False
12      False
13       True
14       True
15       True
16      False
17       True
18      False
19       True
20       True
21      False
22       True
23       True
24       True
25       True
26       True
27      False
28       True
29       True
        ...  
4592     True
4593     True
4594     True
4595     True
4596     True
4597     True
4598     True
4599     True
4600     True
4601     True
4602     True
4603     True
4604     True
4605     True
4606     True
4607     True
4608     True
4609     True
4610     True
4611     True
4612     True
4613     True
4614     True
4615     True
4616     True
4617     True
4618     True
4619     True
4620     True
4621     True
Name: item_name, Length: 4622, dtype: bool

In [148]:
orders.duplicated() #duplicated rows in the dataframe

0       False
1       False
2       False
3       False
4       False
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15      False
16      False
17      False
18      False
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
4592    False
4593    False
4594    False
4595    False
4596    False
4597    False
4598    False
4599    False
4600    False
4601    False
4602    False
4603     True
4604    False
4605    False
4606    False
4607    False
4608    False
4609    False
4610    False
4611    False
4612    False
4613    False
4614    False
4615    False
4616    False
4617    False
4618    False
4619    False
4620    False
4621    False
Length: 4622, dtype: bool

#### To see duplicate rows:

In [236]:
orders.duplicated(keep='first').shape #the 'first' duplicate is marked as false in the dataframe

(4622,)

#### Removing duplicates:

In [237]:
orders.drop_duplicates(keep='last') #keeping only the last occurance of duplicates

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


In [151]:
orders.drop_duplicates(subset=['quantity', 'item_name']) #Looking at a subset of columns to identify duplicates

(103, 5)

### Overriding value with np.nan

In [239]:
movies = pd.read_csv('imdb_1000.csv')
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [240]:
movies.loc[movies['content_rating']=='NOT RATED', 'content_rating']

5      NOT RATED
6      NOT RATED
41     NOT RATED
63     NOT RATED
66     NOT RATED
72     NOT RATED
83     NOT RATED
87     NOT RATED
88     NOT RATED
89     NOT RATED
93     NOT RATED
100    NOT RATED
104    NOT RATED
105    NOT RATED
108    NOT RATED
109    NOT RATED
111    NOT RATED
116    NOT RATED
122    NOT RATED
128    NOT RATED
132    NOT RATED
133    NOT RATED
134    NOT RATED
140    NOT RATED
149    NOT RATED
165    NOT RATED
167    NOT RATED
169    NOT RATED
174    NOT RATED
178    NOT RATED
         ...    
215    NOT RATED
231    NOT RATED
234    NOT RATED
246    NOT RATED
252    NOT RATED
254    NOT RATED
255    NOT RATED
263    NOT RATED
265    NOT RATED
315    NOT RATED
328    NOT RATED
343    NOT RATED
405    NOT RATED
419    NOT RATED
427    NOT RATED
453    NOT RATED
478    NOT RATED
481    NOT RATED
491    NOT RATED
528    NOT RATED
531    NOT RATED
546    NOT RATED
573    NOT RATED
592    NOT RATED
647    NOT RATED
665    NOT RATED
673    NOT RATED
763    NOT RAT

In [242]:
movies.loc[movies['content_rating']=='NOT RATED', 'content_rating'] = np.nan
movies

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."
5,8.9,12 Angry Men,,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
6,8.9,"The Good, the Bad and the Ugly",,Western,161,"[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ..."
7,8.9,The Lord of the Rings: The Return of the King,PG-13,Adventure,201,"[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK..."
8,8.9,Schindler's List,R,Biography,195,"[u'Liam Neeson', u'Ralph Fiennes', u'Ben Kings..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."


### Explicitly creating copy (instead of view) of a DataFrame

In [155]:
top_movies = movies.loc[movies.star_rating >= 9, :].copy

### Common display options for Pandas

In [156]:
pd.set_option('display.max_rows', None)
pd.reset_option('display.max_rows')

In [157]:
pd.set_option('display.max_colwidth', 1000)
pd.reset_option('display.max_colwidth')

In [158]:
pd.set_option('display.precision', 2)
pd.reset_option('display.precision')

In [159]:
pd.describe_option('rows') #search in docs for methods containing 'row'

display.max_info_rows : int or None
    df.info() will usually show null-counts for each column.
    For large frames this can be quite slow. max_info_rows and max_info_cols
    limit this null check only to frames with smaller dimensions than
    specified.
    [default: 1690785] [currently: 1690785]

display.max_rows : int
    If max_rows is exceeded, switch to truncate view. Depending on
    `large_repr`, objects are either centrally truncated or printed as
    a summary view. 'None' value means unlimited.

    In case python/IPython is running in a terminal and `large_repr`
    equals 'truncate' this can be set to 0 and pandas will auto-detect
    the height of the terminal and print a truncated object which fits
    the screen height. The IPython notebook, IPython qtconsole, or
    IDLE do not run in a terminal and hence it is not possible to do
    correct auto-detection.
    [default: 60] [currently: 60]




In [160]:
pd.reset_option('all') #reset all options to default

html.border has been deprecated, use display.html.border instead
(currently both are identical)


: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



(currently both are identical)

: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



### Creating DataFrame

In [161]:
df = pd.DataFrame({'id': [100,101,102], 'color':['red','blue','red']}, columns=['id', 'color'], index = ['a','b','c'])
df

Unnamed: 0,id,color
a,100,red
b,101,blue
c,102,red


#### Alternatively format: (inner list representing a row item)

In [162]:
pd.DataFrame([[100,'red'],[101, 'blue'], [102, 'red']], columns=['id', 'color'], index=['a','b','c'])

Unnamed: 0,id,color
a,100,red
b,101,blue
c,102,red


### Creating DataFrame from np array

In [163]:
import numpy as np
arr = np.random.rand(4,2)
arr

array([[0.88249877, 0.04540946],
       [0.02934836, 0.80923425],
       [0.51482079, 0.67257187],
       [0.62068364, 0.19800552]])

In [164]:
pd.DataFrame(arr, columns = ['one', 'two'], index = ['a','b','c', 'd'])

Unnamed: 0,one,two
a,0.882499,0.045409
b,0.029348,0.809234
c,0.514821,0.672572
d,0.620684,0.198006


In [165]:
pd.DataFrame({'student': np.arange(100, 110, 1), 'test':np.random.randint(60,101, 10)})

Unnamed: 0,student,test
0,100,100
1,101,87
2,102,74
3,103,93
4,104,93
5,105,63
6,106,99
7,107,65
8,108,98
9,109,63


### Attach a series to a DataFrame with index matching

In [166]:
df

Unnamed: 0,id,color
a,100,red
b,101,blue
c,102,red


In [167]:
s = pd.Series(['round', 'square'], index=['c', 'b'], name='shape' )
s

c     round
b    square
Name: shape, dtype: object

In [168]:
pd.concat([df, s], axis = 1, sort=False)

Unnamed: 0,id,color,shape
a,100,red,
b,101,blue,square
c,102,red,round


### Apply a function to Pandas Series or DataFrame

In [169]:
train = pd.read_csv('titanic_train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Mapping a column based on value to a new column

In [170]:
train['Sex_num'] = train.Sex.map({'female':0, 'male':1})
train.loc[0:4, ['Sex', 'Sex_num']]

Unnamed: 0,Sex,Sex_num
0,male,1
1,female,0
2,female,0
3,female,0
4,male,1


#### Apply length function to a column and store result in a new column

In [171]:
train['name_length']=train.Name.apply(len)
train.loc[0:4, ['Name', 'name_length']]

Unnamed: 0,Name,name_length
0,"Braund, Mr. Owen Harris",23
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51
2,"Heikkinen, Miss. Laina",22
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",44
4,"Allen, Mr. William Henry",24


#### Apply ceiling function to values in a column

In [172]:
train['Fare_ceil'] = train.Fare.apply(np.ceil)
train.loc[0:4, ['Fare', 'Fare_ceil']]

Unnamed: 0,Fare,Fare_ceil
0,7.25,8.0
1,71.2833,72.0
2,7.925,8.0
3,53.1,54.0
4,8.05,9.0


#### Get last name from a column

In [173]:
train.Name.str.split(',').head()

0                           [Braund,  Mr. Owen Harris]
1    [Cumings,  Mrs. John Bradley (Florence Briggs ...
2                            [Heikkinen,  Miss. Laina]
3      [Futrelle,  Mrs. Jacques Heath (Lily May Peel)]
4                          [Allen,  Mr. William Henry]
Name: Name, dtype: object

In [245]:
train.Name.str.split(',').apply(lambda x: x[0]).head()

0       Braund
1      Cumings
2    Heikkinen
3     Futrelle
4        Allen
Name: Name, dtype: object

#### Get max value in a column or row:

In [175]:
drinks = pd.read_csv('drinks.csv')
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [176]:
drinks.loc[:, 'beer_servings' : 'wine_servings'].apply(max, axis=0)

beer_servings      376
spirit_servings    438
wine_servings      370
dtype: int64

In [247]:
drinks.loc[:, 'beer_servings' : 'wine_servings'].apply(np.argmax, axis=0)

will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  return getattr(obj, method)(*args, **kwds)


beer_servings      Namibia
spirit_servings    Grenada
wine_servings       France
dtype: object

#### Applying function to each element in the dataframe

In [178]:
drinks.loc[:, 'beer_servings' : 'wine_servings'].applymap(float)

Unnamed: 0,beer_servings,spirit_servings,wine_servings
0,0.0,0.0,0.0
1,89.0,132.0,54.0
2,25.0,0.0,14.0
3,245.0,138.0,312.0
4,217.0,57.0,45.0
5,102.0,128.0,45.0
6,193.0,25.0,221.0
7,21.0,179.0,11.0
8,261.0,72.0,212.0
9,279.0,75.0,191.0


### Multi-index

In [179]:
stocks = pd.read_csv('stocks.csv')
stocks

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT
3,2016-10-04,113.0,29736800,AAPL
4,2016-10-04,57.24,20085900,MSFT
5,2016-10-04,31.35,18460400,CSCO
6,2016-10-05,57.64,16726400,MSFT
7,2016-10-05,31.59,11808600,CSCO
8,2016-10-05,113.05,21453100,AAPL


In [180]:
stocks.index

RangeIndex(start=0, stop=9, step=1)

In [181]:
stocks.groupby('Symbol').Close.mean()

Symbol
AAPL    112.856667
CSCO     31.480000
MSFT     57.433333
Name: Close, dtype: float64

In [182]:
ser = stocks.groupby(['Symbol','Date']).Close.mean()
ser.index

MultiIndex(levels=[['AAPL', 'CSCO', 'MSFT'], ['2016-10-03', '2016-10-04', '2016-10-05']],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
           names=['Symbol', 'Date'])

#### Unstack this series would give us a DataFrame with the second indx being the columns

In [183]:
df = ser.unstack()
df

Date,2016-10-03,2016-10-04,2016-10-05
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,112.52,113.0,113.05
CSCO,31.5,31.35,31.59
MSFT,57.42,57.24,57.64


#### Alternatively, we can also get this DataFrame using pivot_table

In [184]:
stocks.pivot_table(values='Close', index='Symbol', columns='Date')

Date,2016-10-03,2016-10-04,2016-10-05
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,112.52,113.0,113.05
CSCO,31.5,31.35,31.59
MSFT,57.42,57.24,57.64


### Series with multi-index behave like a 2-dimensional DataFrame

In [185]:
ser

Symbol  Date      
AAPL    2016-10-03    112.52
        2016-10-04    113.00
        2016-10-05    113.05
CSCO    2016-10-03     31.50
        2016-10-04     31.35
        2016-10-05     31.59
MSFT    2016-10-03     57.42
        2016-10-04     57.24
        2016-10-05     57.64
Name: Close, dtype: float64

In [186]:
ser.loc['AAPL']

Date
2016-10-03    112.52
2016-10-04    113.00
2016-10-05    113.05
Name: Close, dtype: float64

In [187]:
ser.loc['AAPL', '2016-10-03']

112.52

In [188]:
ser.loc[:, '2016-10-03']

Symbol
AAPL    112.52
CSCO     31.50
MSFT     57.42
Name: Close, dtype: float64

### DataFrame with multi-index

In [189]:
stocks.head()

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT
3,2016-10-04,113.0,29736800,AAPL
4,2016-10-04,57.24,20085900,MSFT


In [190]:
stocks.set_index(['Symbol','Date'], inplace=True)
stocks

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
CSCO,2016-10-03,31.5,14070500
AAPL,2016-10-03,112.52,21701800
MSFT,2016-10-03,57.42,19189500
AAPL,2016-10-04,113.0,29736800
MSFT,2016-10-04,57.24,20085900
CSCO,2016-10-04,31.35,18460400
MSFT,2016-10-05,57.64,16726400
CSCO,2016-10-05,31.59,11808600
AAPL,2016-10-05,113.05,21453100


In [191]:
stocks.loc['AAPL']

Unnamed: 0_level_0,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-10-03,112.52,21701800
2016-10-04,113.0,29736800
2016-10-05,113.05,21453100


#### Pass the indexes as a tuple using loc

In [192]:
stocks.loc[('AAPL', '2016-10-03'), :]

Close          112.52
Volume    21701800.00
Name: (AAPL, 2016-10-03), dtype: float64

In [193]:
stocks.loc[('AAPL', '2016-10-03'), 'Close']

112.52

In [194]:
stocks.loc[(['AAPL', 'MSFT'], '2016-10-03'), 'Close']

Symbol  Date      
AAPL    2016-10-03    112.52
MSFT    2016-10-03     57.42
Name: Close, dtype: float64

In [195]:
stocks.loc[(slice(None), '2016-10-03'), 'Volume']

Symbol  Date      
CSCO    2016-10-03    14070500
AAPL    2016-10-03    21701800
MSFT    2016-10-03    19189500
Name: Volume, dtype: int64

### Merging DataFrames

In [196]:
close = pd.read_csv('stocks.csv', usecols=[0,1,3], index_col=['Symbol', 'Date'])
volume = pd.read_csv('stocks.csv', usecols=[0,2,3], index_col=['Symbol', 'Date'])
display(close, volume)

Unnamed: 0_level_0,Unnamed: 1_level_0,Close
Symbol,Date,Unnamed: 2_level_1
CSCO,2016-10-03,31.5
AAPL,2016-10-03,112.52
MSFT,2016-10-03,57.42
AAPL,2016-10-04,113.0
MSFT,2016-10-04,57.24
CSCO,2016-10-04,31.35
MSFT,2016-10-05,57.64
CSCO,2016-10-05,31.59
AAPL,2016-10-05,113.05


Unnamed: 0_level_0,Unnamed: 1_level_0,Volume
Symbol,Date,Unnamed: 2_level_1
CSCO,2016-10-03,14070500
AAPL,2016-10-03,21701800
MSFT,2016-10-03,19189500
AAPL,2016-10-04,29736800
MSFT,2016-10-04,20085900
CSCO,2016-10-04,18460400
MSFT,2016-10-05,16726400
CSCO,2016-10-05,11808600
AAPL,2016-10-05,21453100


In [197]:
both = pd.merge(close, volume, left_index=True, right_index=True)
both

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
CSCO,2016-10-03,31.5,14070500
AAPL,2016-10-03,112.52,21701800
MSFT,2016-10-03,57.42,19189500
AAPL,2016-10-04,113.0,29736800
MSFT,2016-10-04,57.24,20085900
CSCO,2016-10-04,31.35,18460400
MSFT,2016-10-05,57.64,16726400
CSCO,2016-10-05,31.59,11808600
AAPL,2016-10-05,113.05,21453100


In [198]:
both.reset_index()

Unnamed: 0,Symbol,Date,Close,Volume
0,CSCO,2016-10-03,31.5,14070500
1,AAPL,2016-10-03,112.52,21701800
2,MSFT,2016-10-03,57.42,19189500
3,AAPL,2016-10-04,113.0,29736800
4,MSFT,2016-10-04,57.24,20085900
5,CSCO,2016-10-04,31.35,18460400
6,MSFT,2016-10-05,57.64,16726400
7,CSCO,2016-10-05,31.59,11808600
8,AAPL,2016-10-05,113.05,21453100
