In [2]:
import pandas as pd

In [3]:
# Read the survey CSV into a DataFrame.
# The path is relative, so it is resolved against the notebook's current
# working directory.
surveys_df = pd.read_csv("Downloads/surveys.csv")

In [4]:
# TIP: use the .head() method we saw earlier to make output shorter
# Method 1: select a 'subset' of the data using the column name.
# Indexing with a single column name returns that one column
# (displayed with its name, length and dtype).
surveys_df['species_id']

0         NL
1         NL
2         DM
3         DM
4         DM
        ... 
35544     AH
35545     AH
35546     RM
35547     DO
35548    NaN
Name: species_id, Length: 35549, dtype: object

In [5]:
# Method 2: use the column name as an 'attribute'; gives the same output
# as surveys_df['species_id'].
surveys_df.species_id

0         NL
1         NL
2         DM
3         DM
4         DM
        ... 
35544     AH
35545     AH
35546     RM
35547     DO
35548    NaN
Name: species_id, Length: 35549, dtype: object

In [6]:
# Create a new variable, surveys_species, holding only the `species_id` column.
# Nothing is displayed because the result is assigned rather than left as
# the last expression of the cell.
surveys_species = surveys_df['species_id']

In [8]:
# Select the species and plot columns from the DataFrame.
# Note the double brackets: the inner pair is a Python list of column names.
surveys_df[['species_id', 'plot_id']]

Unnamed: 0,species_id,plot_id
0,NL,2
1,NL,3
2,DM,2
3,DM,7
4,DM,3
...,...,...
35544,AH,15
35545,AH,15
35546,RM,10
35547,DO,7


In [9]:
# What happens when you flip the order?
# The columns come back in the order they are listed: plot_id first,
# then species_id.
surveys_df[['plot_id', 'species_id']]

Unnamed: 0,plot_id,species_id
0,2,NL
1,3,NL
2,2,DM
3,7,DM
4,3,DM
...,...,...
35544,15,AH
35545,15,AH
35546,10,RM
35547,7,DO


In [11]:
# What happens if you ask for a column that doesn't exist?
# 'speciess' is deliberately misspelled; the lookup fails with a KeyError.
surveys_df['speciess']

KeyError: 'speciess'

In [12]:
# Create a list of five numbers; its valid indices are 0 through 4.
a = [1, 2, 3, 4, 5]

In [13]:
# Python indexing starts at 0, so index 0 is the first element.
a[0]

1

In [14]:
# Intentional error: the list has five elements (indices 0-4), so index 5
# is out of range and raises IndexError.
a[5]

IndexError: list index out of range

In [15]:
# Intentional error: len(a) is one past the last valid index (len(a) - 1),
# so this also raises IndexError.
a[len(a)]

IndexError: list index out of range

In [16]:
# Select rows 0, 1, 2 (the slice end is exclusive, so row 3 is not selected)
surveys_df[0:3]

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,
1,2,7,16,1977,3,NL,M,33.0,
2,3,7,16,1977,2,DM,F,37.0,


In [17]:
# Select the first 5 rows (rows 0, 1, 2, 3, 4).
# Omitting the slice start means "from the beginning".
surveys_df[:5]

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,
1,2,7,16,1977,3,NL,M,33.0,
2,3,7,16,1977,2,DM,F,37.0,
3,4,7,16,1977,7,DM,M,36.0,
4,5,7,16,1977,3,DM,M,35.0,


In [18]:
# Select the last row of the DataFrame
# (the slice starts at the last row, and runs to the end of the DataFrame)
surveys_df[-1:]

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
35548,35549,12,31,2002,5,,,,


In [19]:
# Using the 'copy()' method: true_copy_surveys_df is a new DataFrame object,
# separate from surveys_df.
true_copy_surveys_df = surveys_df.copy()

In [20]:
# Using the '=' operator: no data is copied. ref_surveys_df is just a second
# name for the same DataFrame object that surveys_df refers to.
ref_surveys_df = surveys_df

In [21]:
# Assign the value `0` to the first three rows of data in the DataFrame.
# This modifies the object in place, and ref_surveys_df is the same object
# as surveys_df, so the change is visible through both names.
ref_surveys_df[0:3] = 0

In [22]:
# ref_surveys_df was created using the '=' operator.
# Rows 0-2 now contain zeros in every column.
ref_surveys_df.head()

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,0,0,0,0,0,0,0,0.0,0.0
1,0,0,0,0,0,0,0,0.0,0.0
2,0,0,0,0,0,0,0,0.0,0.0
3,4,7,16,1977,7,DM,M,36.0,
4,5,7,16,1977,3,DM,M,35.0,


In [23]:
# surveys_df is the original dataframe.
# It shows the same zeroed rows, because ref_surveys_df and surveys_df
# refer to one and the same object.
surveys_df.head()

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,0,0,0,0,0,0,0,0.0,0.0
1,0,0,0,0,0,0,0,0.0,0.0
2,0,0,0,0,0,0,0,0.0,0.0
3,4,7,16,1977,7,DM,M,36.0,
4,5,7,16,1977,3,DM,M,35.0,


In [24]:
# Take a fresh copy of surveys_df.
# NOTE: surveys_df still has its first three rows zeroed at this point
# (the CSV is only re-read in a later cell), so this copy captures the
# modified data, not the original file contents.
true_copy_surveys_df = surveys_df.copy()

In [25]:
# Bind ref_surveys_df to whatever object surveys_df currently names
# (still the modified DataFrame at this point).
ref_surveys_df = surveys_df

In [26]:
# Re-read the CSV so that surveys_df holds the original, unmodified data
# again. This rebinds the name surveys_df to a new DataFrame; the zeroed rows
# written earlier are not carried over into it.
surveys_df = pd.read_csv("Downloads/surveys.csv")

In [27]:
# iloc[row slicing, column slicing] -- both slices are integer positions
# with an exclusive end: rows 0-2, and the columns at positions 1-3
# (month, day, year).
surveys_df.iloc[0:3, 1:4]

Unnamed: 0,month,day,year
0,7,16,1977
1,7,16,1977
2,7,16,1977


In [28]:
# Select all columns for rows of index values 0 and 10.
# The list picks rows by index label; the bare ':' keeps every column.
surveys_df.loc[[0, 10], :]

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,
10,11,7,16,1977,5,DS,F,53.0,


In [29]:
# What does this do?
# It returns the species_id, plot_id and weight values of the row labelled 0.
surveys_df.loc[0, ['species_id', 'plot_id', 'weight']]

species_id     NL
plot_id         2
weight        NaN
Name: 0, dtype: object

In [30]:
# What happens when you type the code below?
# Label 35549 does not exist (the index runs from 0 to 35548). With the
# pandas version used for this run, a warning is emitted and an all-NaN row
# is returned for the missing label; the warning states that this will
# raise KeyError in the future.
surveys_df.loc[[0, 10, 35549], :]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1.0,7.0,16.0,1977.0,2.0,NL,M,32.0,
10,11.0,7.0,16.0,1977.0,5.0,DS,F,53.0,
35549,,,,,,,,,


In [33]:
# Syntax for iloc indexing to find a specific data element.
# NOTE: this is a template, not runnable code -- `dat`, `row` and `column`
# are placeholders, so executing the cell as written raises NameError.
dat.iloc[row, column]

NameError: name 'dat' is not defined

In [34]:
# Single element by integer position: row 2, column 6 (the `sex` column).
surveys_df.iloc[2, 6]

'F'

In [35]:
# Slice containing only row 0 (the end of the slice, 1, is excluded).
surveys_df[0:1]

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,


In [36]:
# Rows from the start up to, but not including, position 4 (rows 0-3).
surveys_df[:4]

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,
1,2,7,16,1977,3,NL,M,33.0,
2,3,7,16,1977,2,DM,F,37.0,
3,4,7,16,1977,7,DM,M,36.0,


In [37]:
# A negative slice end counts from the end: every row except the last one.
surveys_df[:-1]

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,
1,2,7,16,1977,3,NL,M,33.0,
2,3,7,16,1977,2,DM,F,37.0,
3,4,7,16,1977,7,DM,M,36.0,
4,5,7,16,1977,3,DM,M,35.0,
...,...,...,...,...,...,...,...,...,...
35543,35544,12,31,2002,15,US,,,
35544,35545,12,31,2002,15,AH,,,
35545,35546,12,31,2002,15,AH,,,
35546,35547,12,31,2002,10,RM,F,15.0,14.0


In [38]:
# Position-based selection: rows 0-3 and the columns at positions 1-3
# (month, day, year). Both slice ends are exclusive.
surveys_df.iloc[0:4, 1:4]

Unnamed: 0,month,day,year
0,7,16,1977
1,7,16,1977
2,7,16,1977
3,7,16,1977


In [39]:
# Intentional error: the same slices that worked with .iloc fail with .loc.
# The integer column slice 1:4 is rejected with a TypeError because the
# column index cannot be sliced with integers here.
surveys_df.loc[0:4, 1:4]

TypeError: cannot do slice indexing on <class 'pandas.core.indexes.base.Index'> with these indexers [1] of <class 'int'>

In [40]:
# Boolean filtering: keep only the rows whose year equals 2002.
surveys_df[surveys_df.year == 2002]

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
33320,33321,1,12,2002,1,DM,M,38.0,44.0
33321,33322,1,12,2002,1,DO,M,37.0,58.0
33322,33323,1,12,2002,1,PB,M,28.0,45.0
33323,33324,1,12,2002,1,AB,,,
33324,33325,1,12,2002,1,DO,M,35.0,29.0
...,...,...,...,...,...,...,...,...,...
35544,35545,12,31,2002,15,AH,,,
35545,35546,12,31,2002,15,AH,,,
35546,35547,12,31,2002,10,RM,F,15.0,14.0
35547,35548,12,31,2002,7,DO,M,36.0,51.0


In [41]:
# The inverse filter: keep every row whose year is not 2002.
surveys_df[surveys_df.year != 2002]

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,
1,2,7,16,1977,3,NL,M,33.0,
2,3,7,16,1977,2,DM,F,37.0,
3,4,7,16,1977,7,DM,M,36.0,
4,5,7,16,1977,3,DM,M,35.0,
...,...,...,...,...,...,...,...,...,...
33315,33316,12,16,2001,11,,,,
33316,33317,12,16,2001,13,,,,
33317,33318,12,16,2001,14,,,,
33318,33319,12,16,2001,15,,,,


In [42]:
# Combine two conditions with '&' (element-wise AND): years 1980 through
# 1985, inclusive. Each condition needs its own parentheses because '&'
# binds more tightly than the comparison operators.
surveys_df[(surveys_df.year >= 1980) & (surveys_df.year <= 1985)]

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
2270,2271,1,15,1980,8,DO,M,35.0,53.0
2271,2272,1,15,1980,11,PF,F,16.0,10.0
2272,2273,1,15,1980,18,DM,F,34.0,33.0
2273,2274,1,15,1980,11,DM,M,38.0,37.0
2274,2275,1,15,1980,8,DO,F,33.0,29.0
...,...,...,...,...,...,...,...,...,...
11222,11223,12,8,1985,4,DM,M,36.0,40.0
11223,11224,12,8,1985,11,DM,M,37.0,49.0
11224,11225,12,8,1985,7,PE,M,20.0,18.0
11225,11226,12,8,1985,1,DM,M,38.0,47.0


In [43]:
# Template for filtering rows by membership in a list of species ids.
# NOTE: `listGoesHere` is a placeholder to be replaced with real values;
# executing the cell as written raises NameError.
surveys_df[surveys_df['species_id'].isin([listGoesHere])]

NameError: name 'listGoesHere' is not defined

In [45]:
# Set x to 5 (used by the comparison examples in the following cells)
x = 5

In [46]:
# What does the code below return?
# False: 5 is not strictly greater than 5.
x > 5

False

In [47]:
# How about this?
# True: '==' tests equality (a single '=' would be an assignment).
x == 5

True

In [48]:
# Build a boolean mask with the same rows and columns as surveys_df:
# True wherever a value is missing, False otherwise.
pd.isnull(surveys_df)

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,False,False,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False,True
2,False,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...
35544,False,False,False,False,False,False,True,True,True
35545,False,False,False,False,False,False,True,True,True
35546,False,False,False,False,False,False,False,False,False
35547,False,False,False,False,False,False,False,False,False


In [49]:
# To select just the rows with NaN values, we can use the 'any()' method.
# any(axis=1) reduces the mask across the columns of each row, giving True
# for every row that has at least one missing value.
surveys_df[pd.isnull(surveys_df).any(axis=1)]

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,
1,2,7,16,1977,3,NL,M,33.0,
2,3,7,16,1977,2,DM,F,37.0,
3,4,7,16,1977,7,DM,M,36.0,
4,5,7,16,1977,3,DM,M,35.0,
...,...,...,...,...,...,...,...,...,...
35530,35531,12,31,2002,13,PB,F,27.0,
35543,35544,12,31,2002,15,US,,,
35544,35545,12,31,2002,15,AH,,,
35545,35546,12,31,2002,15,AH,,,


In [50]:
# What does this do?
# It collects the `weight` entries of every row whose weight is missing,
# so every value in the result is NaN.
# Rows (boolean mask) and column are selected in one .loc call instead of
# chaining two [] lookups; the resulting Series is the same.
empty_weights = surveys_df.loc[pd.isnull(surveys_df['weight']), 'weight']
print(empty_weights)

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
35530   NaN
35543   NaN
35544   NaN
35545   NaN
35548   NaN
Name: weight, Length: 3266, dtype: float64
