Pandas allows us to treat data in a table-like way (think excel) and to utilize a structure called a Data Frame. These methods make it much easier to manipulate and slice data. No more excessive loops and list comprehensions. 

In [2]:
import pandas as pd

In [5]:
#users = pd.read_table('https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user',  sep='|', index_col='user_id')

In [5]:
users = pd.read_table("../data/u.user", sep = "|", index_col='user_id')

In [6]:
type(users) # Date Frame - pandas is all about this type of object

pandas.core.frame.DataFrame

In [7]:
users.head(5) 

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [8]:
users.columns

Index([u'age', u'gender', u'occupation', u'zip_code'], dtype='object')

In [9]:
users.index # The "index" or the row labels

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            934, 935, 936, 937, 938, 939, 940, 941, 942, 943],
           dtype='int64', name=u'user_id', length=943)

In [15]:
users.shape

(943, 4)

In [18]:
users.values # so there's an underlying "array" data type, and the Data Frame object encapsulates it, and adds more functionality

array([[24, 'M', 'technician', '85711'],
       [53, 'F', 'other', '94043'],
       [23, 'M', 'writer', '32067'],
       ..., 
       [20, 'M', 'student', '97229'],
       [48, 'F', 'librarian', '78209'],
       [22, 'M', 'student', '77841']], dtype=object)

In [24]:
print type(users.values), type(users.gender)

<type 'numpy.ndarray'> <class 'pandas.core.series.Series'>


With a Data Frame, we have columns with names, so we can select on those columns.

In [14]:
# select a column
users['gender']         # select one column
users.zip_code           # or with dot notation

user_id
1      85711
2      94043
3      32067
4      43537
5      15213
6      98101
7      91344
8      05201
9      01002
10     90703
11     30329
12     06405
13     29206
14     55106
15     97301
16     10309
17     06355
18     37212
19     02138
20     95660
21     30068
22     40206
23     48197
24     94533
25     55107
26     21044
27     30030
28     55369
29     94043
30     55436
       ...  
914    08105
915    60614
916    N2L5N
917    20006
918    70116
919    14216
920    90008
921    98801
922    21114
923    E2E3R
924    11753
925    49036
926    01701
927    55428
928    55408
929    53711
930    07310
931    33556
932    06437
933    48105
934    22902
935    66221
936    32789
937    98072
938    55038
939    33319
940    02215
941    97229
942    78209
943    77841
Name: zip_code, dtype: object

In [22]:
type(users.gender) # a Series is a basic data type in pandas

pandas.core.series.Series

Data Frames have many built-in functions for things we might commonly want. For example, if we want a statistical summary of the numeric columns.

In [15]:
# summarize (describe) the DataFrame
users.describe()                    # describe all numeric columns


Unnamed: 0,age
count,943.0
mean,34.051962
std,12.19274
min,7.0
25%,25.0
50%,31.0
75%,43.0
max,73.0


In [24]:
# or a statistical summary of all columns
users.describe(include='all') 

Unnamed: 0,age,gender,occupation,zip_code
count,943.0,943,943,943.0
unique,,2,21,795.0
top,,M,student,55414.0
freq,,670,196,9.0
mean,34.051962,,,
std,12.19274,,,
min,7.0,,,
25%,25.0,,,
50%,31.0,,,
75%,43.0,,,


For any column, we can access useful statistical summaries.

In [16]:
users.age.mean()

34.05196182396607

In [25]:
users.gender.value_counts()

M    670
F    273
Name: gender, dtype: int64

** Exercise** 

1. Read drinks.csv into a DataFrame called 'drinks'
2. Print the head and the tail 
3. Examine the default index, data types, and shape
4. Print the 'beer_servings' Series
5. Calculate the mean 'beer_servings' for the entire dataset
6. Count the number of occurrences of each 'continent' value
7. **Bonus** Display only the number of rows of the 'users' DataFrame
8. **Bonus** Display the 3 most frequent occupations in 'users'

Pandas provides easy functionality for a variety of filtering and sorting tasks we inevitably encounter. Again, this kind of work could be done with loops and conditionals, but the Data Frame methods make things much easier.

For example, we can filter on a boolean expression:

In [28]:
drinks = pd.read_table("../data/drinks.csv", sep = ",", index_col='country')
drinks.head(2)
drinks.columns

Index([u'beer_servings', u'spirit_servings', u'wine_servings',
       u'total_litres_of_pure_alcohol', u'continent'],
      dtype='object')

In [27]:
drinks.index

Index([u'Afghanistan', u'Albania', u'Algeria', u'Andorra', u'Angola',
       u'Antigua & Barbuda', u'Argentina', u'Armenia', u'Australia',
       u'Austria',
       ...
       u'Tanzania', u'USA', u'Uruguay', u'Uzbekistan', u'Vanuatu',
       u'Venezuela', u'Vietnam', u'Yemen', u'Zambia', u'Zimbabwe'],
      dtype='object', name=u'country', length=193)

In [29]:
drinks.shape

(193, 5)

In [52]:
drinks.beer_servings.describe()
#drinks.beer_servings.mean()

count    193.000000
mean     106.160622
std      101.143103
min        0.000000
25%       20.000000
50%       76.000000
75%      188.000000
max      376.000000
Name: beer_servings, dtype: float64

In [34]:
drinks.continent.value_counts()
# Note that NA does not show up because it is being interpreted as N/A (!)

AF    53
EU    45
AS    44
OC    16
SA    12
Name: continent, dtype: int64

In [44]:
users.shape()
users.describe()

Unnamed: 0,age
count,943.0
mean,34.051962
std,12.19274
min,7.0
25%,25.0
50%,31.0
75%,43.0
max,73.0


In [48]:
users.occupation.value_counts().head(3)
# Equivalent
users.occupation.value_counts()[:3]

student     196
other       105
educator     95
Name: occupation, dtype: int64

In [26]:
#  boolean filtering: only show users with age < 20
young_bool = users.age < 20         # create a Series of booleans...
users[young_bool]                   # ...and use that Series to filter rows .. only returns rows that are TRUE
users[users.age < 20]               # or, combine into a single step
users[users.age < 20].occupation    # select one column from the filtered results
users[users.age < 20].occupation.value_counts()     # value_counts of resulting Series

student          64
other             4
none              3
writer            2
entertainment     2
salesman          1
artist            1
Name: occupation, dtype: int64

And we can have filtering with multiple conditions:

In [None]:
# boolean filtering with multiple conditions
users[(users.age < 20) & (users.gender=='M')]       # ampersand for AND condition
users[(users.age < 20) | (users.age > 60)]          # pipe for OR condition

And we can sort by various columns:

In [None]:
# sorting
users.age.order()                   # sort a column
users.sort('age')                   # sort a DataFrame by a single column
users.sort('age', ascending=False)  # use descending order instead

In [59]:
#users.age.order()
#users.sort_values("age", ascending = False)

** Exercise** 

1. Filter 'drinks' to only include European countries
2. Filter 'drinks' to only include European countries with wine_servings > 300
3. Calculate the mean 'beer_servings' for all of Europe
4. Determine which 10 countries have the highest total_litres_of_pure_alcohol
5. **Bonus** Sort 'users' by 'occupation' and then by 'age' (in a single command)

In [65]:
drinks = pd.read_table('../data/drinks.csv',sep=',')

In [61]:
drinks[drinks.continent == 'EU']

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
3,Andorra,245,138,312,12.4,EU
61,France,127,151,370,11.8,EU
136,Portugal,194,67,339,11.0,EU


In [62]:
drinks[(drinks.continent == 'EU') & (drinks.wine_servings > 300)]

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
3,Andorra,245,138,312,12.4,EU
61,France,127,151,370,11.8,EU
136,Portugal,194,67,339,11.0,EU


In [66]:
print drinks[(drinks.continent == 'EU')].beer_servings.mean()
print drinks[(drinks.continent == 'EU')].beer_servings.describe()[1]

193.777777778
193.777777778


In [74]:
#drinks.sort('total_litres_of_pure_alcohol', ascending = False)
drinks.sort_values('total_litres_of_pure_alcohol', ascending = False)[:9]

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
15,Belarus,142,373,42,14.4,EU
98,Lithuania,343,244,56,12.9,EU
3,Andorra,245,138,312,12.4,EU
68,Grenada,199,438,28,11.9,
45,Czech Republic,361,170,134,11.8,EU
61,France,127,151,370,11.8,EU
141,Russian Federation,247,326,73,11.5,AS
81,Ireland,313,118,165,11.4,EU
155,Slovakia,196,293,116,11.4,EU


In [77]:
sorted_users = users.sort_values(['occupation','age'],ascending=True)

We can easily rename columns if needed. And it is easy to remove existing columns or to add new ones.

In [78]:
# Answers in class
drinks[drinks.continent == 'EU']
drinks[(drinks.continent == 'EU') & (drinks.wine_servings > 300)]
drinks[drinks.continent == 'EU'].beer_servings.mean()
drinks.total_litres_of_pure_alcohol.head(10)
a = users.sort_values(['occupation','age'])

In [77]:
drinks = pd.read_csv("../data/drinks.csv")
# rename one or more columns
drinks.rename(columns={'beer_servings':'beer', 'wine_servings':'wine'})
drinks.rename(columns={'beer_servings':'beer', 'wine_servings':'wine'}, inplace=True)

"Inplace" means that the original data frame is being changed versus returning another DF.

In [79]:
# replace all column names
drink_cols = ['country', 'beer', 'spirit', 'wine', 'liters', 'continent']
drinks.columns = drink_cols

In [84]:
# replace all column names when reading the file

drinks = pd.read_csv('../data/drinks.csv', header=0, names=drink_cols)
drinks[:3]


Unnamed: 0,country,beer,spirit,wine,liters,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF


In [301]:
# add a new column as a function of existing columns
drinks['servings'] = drinks.beer + drinks.spirit + drinks.wine
drinks['mL'] = drinks.liters * 1000
drinks[:2]

Unnamed: 0,country,beer,spirit,wine,liters,continent,servings,mL
0,Afghanistan,0,0,0,0.0,AS,0,0
1,Albania,89,132,54,4.9,EU,275,4900


In [None]:
# removing columns
drinks.drop('mL', axis=1)                               # axis=0 for rows, 1 for columns
drinks.drop(['mL', 'servings'], axis=1, inplace=True)   # drop multiple columns

In data analysis, we often encounter missing data and need to think about how to handle this. Many methods will simply ignore missing data, and we just need to make sure we're handling things sensibly. 

In [30]:
# missing values are usually excluded by default
drinks.continent.value_counts()              # excludes missing values
drinks.continent.value_counts(dropna=False)  # includes missing values

AF     53
EU     45
AS     44
NaN    23
OC     16
SA     12
Name: continent, dtype: int64

In [90]:
# find missing values in a Series
drinks.continent.isnull()           # True if missing
drinks.continent.notnull()          # True if not missing 
# Question for instructors:
# 1. difference between "", None, False.  They are not the same. When to use which type?
# 2. a=''; a.split(','); a --> [''] which has len of 1. 
# 3. drinks[23] doesn't work, but drinks[22:23] does, why?

True

In [93]:
# use a boolean Series to filter DataFrame rows
drinks[drinks.continent.isnull()]   # only show rows where continent is missing
drinks[drinks.continent.notnull()]  # only show rows where continent is not missing

Unnamed: 0,country,beer,spirit,wine,liters,continent,servings,mL
0,Afghanistan,0,0,0,0,AS,0,0


In [101]:
# side note: understanding axes
drinks.sum()            # sums "down" the 0 axis (rows)
drinks.sum(axis=0)      # equivalent (since axis=0 is the default)
print drinks.sum(axis=1)[0:3]      # sums "across" the 1 axis (columns)
print drinks[0:3]

0       0.0
1    5454.9
2     778.7
dtype: float64
       country  beer  spirit  wine  liters continent  servings    mL
0  Afghanistan     0       0     0     0.0        AS         0     0
1      Albania    89     132    54     4.9        EU       275  4900
2      Algeria    25       0    14     0.7        AF        39   700


In [102]:
# side note: adding booleans
pd.Series([True, False, True])          # create a boolean Series
pd.Series([True, False, True]).sum()    # converts False to 0 and True to 1

2

In [107]:
# find missing values in a DataFrame
drinks.isnull()             # DataFrame of booleans
drinks.isnull().sum()       # count the missing values in each column

country       0
beer          0
spirit        0
wine          0
liters        0
continent    23
servings      0
mL            0
dtype: int64

In [None]:
# drop missing values
drinks.dropna()             # drop a row if ANY values are missing
drinks.dropna(how='all')    # drop a row only if ALL values are missing

In [None]:
# fill in missing values
drinks.continent.fillna(value='NA', inplace=True)   # fill in missing values with 'NA'

In [None]:
# turn off the missing value filter
drinks = pd.read_csv('drinks.csv', header=0, names=drink_cols, na_filter=False)

** Exercise** 

1. Read ufo.csv into a DataFrame called 'ufo'
2. Check the shape of the DataFrame
3. Calculate the most frequent value for each of the columns (in a single command)
4. What are the four most frequent colors reported?
5. For reports in VA, what's the most frequent city?
6. Show only the UFO reports from Arlington, VA
7. Count the number of missing values in each column
8. Show only the UFO reports in which the City is missing
9. How many rows remain if you drop all rows with any missing values?
10. **Bonus** Create a new column called 'Location' that includes both City and State
11. **Bonus** Replace any spaces in the column names with an underscore

In [218]:
# 1. Read ufo.csv into a DataFrame called 'ufo'
ufo = pd.read_table("../data/ufo.csv",sep=',')

In [219]:
# 2. Check shape of dataframe
ufo.shape

(80543, 5)

In [220]:
# 3. Calculate the most frequent value for each of the columns (in a single command)
# ufo.columns
# ufo.describe()
ufo.describe()[2:3]

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
top,Seattle,ORANGE,LIGHT,CA,7/4/2014 22:00


In [221]:
# 4. What are the four most frequent colors reported?
print ufo['Colors Reported'].value_counts()[0:4]
print ufo['Colors Reported'].value_counts()[0:4].index

ORANGE    5216
RED       4809
GREEN     1897
BLUE      1855
Name: Colors Reported, dtype: int64
Index([u'ORANGE', u'RED', u'GREEN', u'BLUE'], dtype='object')


In [222]:
#5. For reports in VA, what's the most frequent city?
ufo[ufo.State == 'VA'].City.value_counts()[0:1]

Virginia Beach    110
Name: City, dtype: int64

In [223]:
#6. Show only the UFO reports from Arlington, VA
ufo[(ufo.City == 'Arlington') & (ufo.State == 'VA')]

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
202,Arlington,GREEN,OVAL,VA,7/13/1952 21:00
6300,Arlington,,CHEVRON,VA,5/5/1990 21:40
10278,Arlington,,DISK,VA,5/27/1997 15:30
14527,Arlington,,OTHER,VA,9/10/1999 21:41
17984,Arlington,RED,DISK,VA,11/19/2000 22:00
21201,Arlington,GREEN,FIREBALL,VA,1/7/2002 17:45
22633,Arlington,,LIGHT,VA,7/26/2002 1:15
22780,Arlington,,LIGHT,VA,8/7/2002 21:00
25066,Arlington,,CIGAR,VA,6/1/2003 22:34
27398,Arlington,,VARIOUS,VA,12/13/2003 2:00


In [224]:
#7. Count the number of missing values in each column
ufo = pd.read_table("../data/ufo.csv",sep=',')
#ufo.fillna(value='NA',inplace=True)
bool_na = ufo.isnull()
print bool_na.sum()
print bool_na.sum().sum()

City                  47
Colors Reported    63509
Shape Reported      8402
State                  0
Time                   0
dtype: int64
71958


In [225]:
# 8. Show only the UFO reports in which the City is missing
ufo[ufo.City.isnull()]
ufo[ufo.City.isnull()][0:1]

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
21,,,,LA,8/15/1943 0:00


In [226]:
# 9. How many rows remain if you drop all rows with any missing values?
dropped_rows = ufo.dropna()
print "rows remaining: ", dropped_rows.shape[0], " from original: ", ufo.shape[0]

rows remaining:  15510  from original:  80543


In [227]:
# 10. Bonus Create a new column called 'Location' that includes both City and State
ufo["Location"] = ufo["City"] + ", " + ufo["State"]
ufo[0:2]

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time,Location
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,"Ithaca, NY"
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,"Willingboro, NJ"


In [231]:
# 11. Bonus Replace any spaces in the column names with an underscore
column_names = ["City", "Colors_Reported", "Shape_Reported", "State", "Time", "Location"]
ufo = pd.read_table("../data/ufo.csv",sep=',',names=column_names)
ufo[0:2]

Unnamed: 0,City,Colors_Reported,Shape_Reported,State,Time,Location
0,City,Colors Reported,Shape Reported,State,Time,
1,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,


We can perform operations on Data Frame subsets and then record only the result for each subset. Think about this as a [split-apply-combine](http://i.imgur.com/yjNkiwL.png) operation. Or perhaps you're familiar with the notion of "Group By".

In [232]:
# for each continent, calculate the mean beer servings
drinks.groupby('continent').beer.mean()

continent
AF     61.471698
AS     37.045455
EU    193.777778
OC     89.687500
SA    175.083333
Name: beer, dtype: float64

In [233]:
# for each continent, count the number of occurrences
drinks.continent.value_counts()

AF    53
EU    45
AS    44
OC    16
SA    12
Name: continent, dtype: int64

In [234]:
# for each continent, describe beer servings
drinks.groupby('continent').beer.describe()

continent       
AF         count     53.000000
           mean      61.471698
           std       80.557816
           min        0.000000
           25%       15.000000
           50%       32.000000
           75%       76.000000
           max      376.000000
AS         count     44.000000
           mean      37.045455
           std       49.469725
           min        0.000000
           25%        4.250000
           50%       17.500000
           75%       60.500000
           max      247.000000
EU         count     45.000000
           mean     193.777778
           std       99.631569
           min        0.000000
           25%      127.000000
           50%      219.000000
           75%      270.000000
           max      361.000000
OC         count     16.000000
           mean      89.687500
           std       96.641412
           min        0.000000
           25%       21.000000
           50%       52.500000
           75%      125.750000
           max      30

In [235]:
# similar, but outputs a DataFrame and can be customized
drinks.groupby('continent').beer.agg(['count', 'mean', 'min', 'max'])
drinks.groupby('continent').beer.agg(['count', 'mean', 'min', 'max']).sort('mean')

  app.launch_new_instance()


Unnamed: 0_level_0,count,mean,min,max
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AS,44,37.045455,0,247
AF,53,61.471698,0,376
OC,16,89.6875,0,306
SA,12,175.083333,93,333
EU,45,193.777778,0,361


In [238]:
# if you don't specify a column to which the aggregation function should be applied,
# it will be applied to all numeric columns
drinks.groupby('continent').mean()
drinks.groupby('continent').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,beer,liters,mL,servings,spirit,wine
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AF,count,53.0,53.0,53.0,53.0,53.0,53.0
AF,mean,61.471698,3.007547,3007.54717,94.075472,16.339623,16.264151
AF,std,80.557816,2.647557,2647.55686,114.040622,28.102794,38.846419
AF,min,0.0,0.0,0.0,0.0,0.0,0.0
AF,25%,15.0,0.7,700.0,20.0,1.0,1.0
AF,50%,32.0,2.3,2300.0,49.0,3.0,2.0
AF,75%,76.0,4.7,4700.0,94.0,19.0,13.0
AF,max,376.0,9.1,9100.0,504.0,152.0,233.0
AS,count,44.0,44.0,44.0,44.0,44.0,44.0
AS,mean,37.045455,2.170455,2170.454545,106.954545,60.840909,9.068182


** Exercise **

1. For each occupation in 'users', count the number of occurrences
2. For each occupation, calculate the mean age
3. **Bonus** For each occupation, calculate the minimum and maximum ages
4. **Bonus** For each combination of occupation and gender, calculate the mean age
5. Think about how obnoxious it would be to do all this stuff with a for loop. 

In [260]:
# 1. For each occupation in 'users', count the number of occurrences
users.groupby('occupation').count()["user_id"].order()

  from ipykernel import kernelapp as app


occupation
doctor             7
homemaker          7
none               9
salesman          12
lawyer            12
retired           14
healthcare        16
entertainment     18
marketing         26
technician        27
artist            28
scientist         31
executive         32
writer            45
librarian         51
programmer        66
engineer          67
administrator     79
educator          95
other            105
student          196
Name: user_id, dtype: int64

In [269]:
# 2. For each occupation, calculate the mean age
users.groupby("occupation").mean().age.order()

  from ipykernel import kernelapp as app


occupation
student          22.081633
none             26.555556
entertainment    29.222222
artist           31.392857
homemaker        32.571429
programmer       33.121212
technician       33.148148
other            34.523810
scientist        35.548387
salesman         35.666667
writer           36.311111
engineer         36.388060
lawyer           36.750000
marketing        37.615385
executive        38.718750
administrator    38.746835
librarian        40.000000
healthcare       41.562500
educator         42.010526
doctor           43.571429
retired          63.071429
Name: age, dtype: float64

In [296]:
# 3. Bonus For each occupation, calculate the minimum and maximum ages
users.groupby("occupation").age.min(), "\n"
users.groupby("occupation").age.max()
# How can I get min and max to be displayed side-by-side?

Unnamed: 0,user_id,age
count,943.0,943.0
mean,472.0,34.051962
std,272.364951,12.19274
min,1.0,7.0
25%,236.5,25.0
50%,472.0,31.0
75%,707.5,43.0
max,943.0,73.0


In [300]:
# 4. Bonus For each combination of occupation and gender, calculate the mean age
users.groupby(["occupation","gender"]).mean().age

occupation     gender
administrator  F         40.638889
               M         37.162791
artist         F         30.307692
               M         32.333333
doctor         M         43.571429
educator       F         39.115385
               M         43.101449
engineer       F         29.500000
               M         36.600000
entertainment  F         31.000000
               M         29.000000
executive      F         44.000000
               M         38.172414
healthcare     F         39.818182
               M         45.400000
homemaker      F         34.166667
               M         23.000000
lawyer         F         39.500000
               M         36.200000
librarian      F         40.000000
               M         40.000000
marketing      F         37.200000
               M         37.875000
none           F         36.500000
               M         18.600000
other          F         35.472222
               M         34.028986
programmer     F         32.16666

Also, note that we can select multiple columns or rows when we're going this kind of subsetting. 

In [None]:
# select multiple columns
my_cols = ['City', 'State']     # create a list of column names...
ufo[my_cols]                    # ...and use that list to select columns
ufo[['City', 'State']]          # or, combine into a single step

In [None]:
# use loc to select columns by name
ufo.loc[:, 'City']              # colon means "all rows", then select one column
ufo.loc[:, ['City', 'State']]   # select two columns
ufo.loc[:, 'City':'State']      # select a range of columns

In [None]:
# loc can also filter rows by "name" (the index)
ufo.loc[0, :]                   # row 0, all columns
ufo.loc[0:2, :]                 # rows 0/1/2, all columns
ufo.loc[0:2, 'City':'State']    # rows 0/1/2, range of columns

In [None]:
# use iloc to filter rows and select columns by integer position
ufo.iloc[:, [0, 3]]             # all rows, columns in position 0/3
ufo.iloc[:, 0:4]                # all rows, columns in position 0/1/2/3
ufo.iloc[0:3, :]                # rows in position 0/1/2, all columns

Here's a grab-bag of other common Data Frame features that you'll commonly use. 

In [None]:
# map existing values to a different set of values
users['is_male'] = users.gender.map({'F':0, 'M':1})

In [None]:
# encode strings as integer values (automatically starts at 0)
users['occupation_num'] = users.occupation.factorize()[0]

In [None]:
# determine unique values in a column
users.occupation.nunique()      # count the number of unique values
users.occupation.unique()       # return the unique values

In [None]:
# replace all instances of a value in a column (must match entire value)
ufo.State.replace('Fl', 'FL', inplace=True)

In [None]:
# string methods are accessed via 'str'
ufo.State.str.upper()                               # converts to uppercase
ufo.Colors_Reported.str.contains('RED', na='False') # checks for a substring

In [None]:
# convert a string to the datetime format
ufo['Time'] = pd.to_datetime(ufo.Time)
ufo.Time.dt.hour                        # datetime format exposes convenient attributes
(ufo.Time.max() - ufo.Time.min()).days  # also allows you to do datetime "math"
ufo[ufo.Time > pd.datetime(2014, 1, 1)] # boolean filtering with datetime format


In [None]:
# setting and then removing an index
ufo.set_index('Time', inplace=True)
ufo.reset_index(inplace=True)

In [None]:
# sort a column by its index
ufo.State.value_counts().sort_index()

In [None]:
# change the data type of a column
drinks['beer'] = drinks.beer.astype('float')

In [None]:
# change the data type of a column when reading in a file
pd.read_csv('drinks.csv', dtype={'beer_servings':float})

In [None]:
# create dummy variables for 'continent' and exclude first dummy column
continent_dummies = pd.get_dummies(drinks.continent, prefix='cont').iloc[:, 1:]

In [None]:
# concatenate two DataFrames (axis=0 for rows, axis=1 for columns)
drinks = pd.concat([drinks, continent_dummies], axis=1)

Here's some less-frequently-used stuff that's good to know about.

In [None]:
# create a DataFrame from a dictionary
pd.DataFrame({'capital':['Montgomery', 'Juneau', 'Phoenix'], 'state':['AL', 'AK', 'AZ']})

In [None]:
# create a DataFrame from a list of lists
pd.DataFrame([['Montgomery', 'AL'], ['Juneau', 'AK'], ['Phoenix', 'AZ']], columns=['capital', 'state'])

In [None]:
# detecting duplicate rows
users.duplicated()          # True if a row is identical to a previous row
users.duplicated().sum()    # count of duplicates
users[users.duplicated()]   # only show duplicates
users.drop_duplicates()     # drop duplicate rows
users.age.duplicated()      # check a single column for duplicates
users.duplicated(['age', 'gender', 'zip_code']).sum()   # specify columns for finding duplicates

In [None]:
# display a cross-tabulation of two Series
pd.crosstab(users.occupation, users.gender)

In [None]:
# alternative syntax for boolean filtering (noted as "experimental" in the documentation)
users.query('age < 20')                 # users[users.age < 20]
users.query("age < 20 and gender=='M'") # users[(users.age < 20) & (users.gender=='M')]
users.query('age < 20 or age > 60')     # users[(users.age < 20) | (users.age > 60)]

In [None]:
# display the memory usage of a DataFrame
ufo.info()          # total usage
ufo.memory_usage()  # usage by column

In [None]:
# change a Series to the 'category' data type (reduces memory usage and increases performance)
ufo['State'] = ufo.State.astype('category')

In [None]:
# temporarily define a new column as a function of existing columns
drinks.assign(servings = drinks.beer + drinks.spirit + drinks.wine)


In [None]:
# limit which rows are read when reading in a file
pd.read_csv('drinks.csv', nrows=10)           # only read first 10 rows
pd.read_csv('drinks.csv', skiprows=[1, 2])    # skip the first two rows of data

In [None]:
# write a DataFrame out to a CSV
drinks.to_csv('drinks_updated.csv')                 # index is used as first column
drinks.to_csv('drinks_updated.csv', index=False)    # ignore index

In [None]:
# save a DataFrame to disk (aka 'pickle') and read it from disk (aka 'unpickle')
drinks.to_pickle('drinks_pickle')
pd.read_pickle('drinks_pickle')

In [None]:
# randomly sample a DataFrame
train = drinks.sample(frac=0.75, random_state=1)    # will contain 75% of the rows
test = drinks[~drinks.index.isin(train.index)]      # will contain the other 25%

In [None]:
# change the maximum number of rows and columns printed ('None' means unlimited)
pd.set_option('max_rows', None)     # default is 60 rows
pd.set_option('max_columns', None)  # default is 20 columns
print drinks

In [None]:
# reset options to defaults
pd.reset_option('max_rows')
pd.reset_option('max_columns')

That's a lot of litte odds-and-ends, but I hope you realize that the Data Frame idea is superbly useful will save us a lot of work.