# DataFrame
2D data with rows and columns.
When viewed in Jupyter, appears similar to a spreadsheet.

Both rows and columns have an index, so every row and every column can be addressed either by integer position or by label.

Can handle 3 or more dimensions via more columns and/or a MultiIndex.
Don't use `Panel`: it was deprecated and has since been removed from pandas.

In [88]:
import pandas as pd

# create dataframe from dictionary

In [89]:
# Column-oriented data: each key is a column name, each list holds that column's values.
my_dict = {
    'person_name': ['Alma', 'Bob', 'Charlie', 'Donna'],
    'pet_species': ['dog', 'cat', 'fish', 'dog'],
}

In [90]:
# Each dictionary key becomes a column; rows receive the default integer index 0..3.
df = pd.DataFrame(my_dict)
df

Unnamed: 0,person_name,pet_species
0,Alma,dog
1,Bob,cat
2,Charlie,fish
3,Donna,dog


# iloc - select dataframe row by index integer position

In [91]:
# Position 2 is the third row (positions start at 0); the row comes back as a Series.
df.iloc[2]

person_name    Charlie
pet_species       fish
Name: 2, dtype: object

# set_index() - convert column to index

In [92]:
# Promote 'person_name' from a regular column to the row index so that rows
# can be selected by name with .loc.
# Reassigning the result (instead of inplace=True) avoids mutating the frame
# object in place, and the guard lets this cell be re-run safely: once the
# column has moved into the index, a second set_index('person_name') call
# would raise KeyError.
if 'person_name' in df.columns:
    df = df.set_index('person_name')
df

Unnamed: 0_level_0,pet_species
person_name,Unnamed: 1_level_1
Alma,dog
Bob,cat
Charlie,fish
Donna,dog


In [93]:
# The row index now holds the person names, and the index itself is named 'person_name'.
df.index

Index(['Alma', 'Bob', 'Charlie', 'Donna'], dtype='object', name='person_name')

# add column pet_name

In [94]:
# Assigning a list to a new column label adds that column; values are placed
# in row order (Alma -> 'Spot', ..., Donna -> 'ScoobyDoo').
df['pet_name'] = ['Spot', 'Bill', 'Nemo', 'ScoobyDoo']
df

Unnamed: 0_level_0,pet_species,pet_name
person_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alma,dog,Spot
Bob,cat,Bill
Charlie,fish,Nemo
Donna,dog,ScoobyDoo


# loc - select dataframe column by index label

In [95]:
# ':' selects every row; a single column label returns that column as a Series.
df.loc[:, 'pet_species']

person_name
Alma        dog
Bob         cat
Charlie    fish
Donna       dog
Name: pet_species, dtype: object

# shorthand [] select column

In [96]:
# Produces the same Series as df.loc[:, 'pet_species'].
df['pet_species']

person_name
Alma        dog
Bob         cat
Charlie    fish
Donna       dog
Name: pet_species, dtype: object

# loc - select dataframe row by index label

In [97]:
# A single row label returns that row as a Series whose index is the column names.
df.loc['Charlie']

pet_species    fish
pet_name       Nemo
Name: Charlie, dtype: object

# loc - select dataframe value by index label

In [98]:
# Row label plus column label returns a single scalar value.
df.loc['Charlie', 'pet_name']

'Nemo'

# iloc - select dataframe value by index integer position

In [99]:
# Row position 2 ('Charlie') and column position 1 ('pet_name'): the same cell
# that the .loc lookup by label returned.
df.iloc[2, 1]

'Nemo'

# add columns

In [100]:
# Two new integer columns, one value per person in row order.
df['num_apples'] = [1, 4, 3, 7]
df['num_oranges'] = [3, 1, 3, 5]
df

Unnamed: 0_level_0,pet_species,pet_name,num_apples,num_oranges
person_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alma,dog,Spot,1,3
Bob,cat,Bill,4,1
Charlie,fish,Nemo,3,3
Donna,dog,ScoobyDoo,7,5


# iloc slice acts like a Python slice: half-open range, includes start, excludes stop

In [101]:
# Positions 0 up to (but not including) 1, so exactly one row; a slice returns
# a DataFrame rather than a Series.
df.iloc[0:1]

Unnamed: 0_level_0,pet_species,pet_name,num_apples,num_oranges
person_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alma,dog,Spot,1,3


# loc slice doesn't act like a Python slice: closed range, includes both start and stop

In [102]:
# Label slice over rows: both endpoints, 'Bob' and 'Charlie', appear in the result.
df.loc['Bob':'Charlie']

Unnamed: 0_level_0,pet_species,pet_name,num_apples,num_oranges
person_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bob,cat,Bill,4,1
Charlie,fish,Nemo,3,3


In [103]:
# Label slices work on columns too: all rows, columns 'num_apples' through
# 'num_oranges' with both endpoints included.
df.loc[:, 'num_apples':'num_oranges']

Unnamed: 0_level_0,num_apples,num_oranges
person_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alma,1,3
Bob,4,1
Charlie,3,3
Donna,7,5


# vector operation on columns - very fast

In [104]:
# '+' adds the two columns element-wise, row by row, without an explicit Python loop.
df['num_fruit'] = df['num_apples'] + df['num_oranges']
df

Unnamed: 0_level_0,pet_species,pet_name,num_apples,num_oranges,num_fruit
person_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alma,dog,Spot,1,3,4
Bob,cat,Bill,4,1,5
Charlie,fish,Nemo,3,3,6
Donna,dog,ScoobyDoo,7,5,12


# compare apples to oranges

In [105]:
# Element-wise comparison of the two columns: the result is a boolean Series
# indexed by person_name, True where that person has strictly more apples
# than oranges.
apples_greater_than_oranges = df['num_apples'] > df['num_oranges']
apples_greater_than_oranges

person_name
Alma       False
Bob         True
Charlie    False
Donna       True
dtype: bool

# select rows via boolean mask

In [106]:
# Keeps only the rows where the mask is True (here Bob and Donna).
df[apples_greater_than_oranges]

Unnamed: 0_level_0,pet_species,pet_name,num_apples,num_oranges,num_fruit
person_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bob,cat,Bill,4,1,5
Donna,dog,ScoobyDoo,7,5,12


# apply - may be slower than vectorized operation
pandas `apply()` calls the function `fruit_string` once per element of the Series, passing each value as the argument implicitly.

In [107]:
def fruit_string(fruit_qty):
    """Describe a fruit count in words.

    Args:
        fruit_qty: Quantity of fruit; must support comparison with the integer 6.

    Returns:
        'meh' when fruit_qty is below 6, otherwise 'I love fruit!'.
    """
    return 'meh' if fruit_qty < 6 else 'I love fruit!'

In [108]:
# apply() runs fruit_string on each value of the num_fruit column; the returned
# strings are stored as the new fruit_attitude column.
df['fruit_attitude'] = df['num_fruit'].apply(fruit_string)
df

Unnamed: 0_level_0,pet_species,pet_name,num_apples,num_oranges,num_fruit,fruit_attitude
person_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alma,dog,Spot,1,3,4,meh
Bob,cat,Bill,4,1,5,meh
Charlie,fish,Nemo,3,3,6,I love fruit!
Donna,dog,ScoobyDoo,7,5,12,I love fruit!
