A DataFrame represents a rectangular table of data and contains an ordered collection
of columns, each of which can be a different value type (numeric, string,
boolean, etc.). The DataFrame has both a row index and a column index; it can be
thought of as a dict of Series all sharing the same index.

# Summary  
## DataFrame
(1) Create a dataframe from a dict, a nested dict, a dict of series   
(2) Display the first rows   
(3) Specify column and index upon creation   
(4) Get the column labels and the index   
(5) Get a column as a series   
(6) Get a row   
(7) Assign value to an entry   
(8) Add, delete a column      
(9) Specify column and index names   
(10) Transpose a dataframe   
(11) Get values as a 2D array   

In [29]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

In [30]:
# Build a DataFrame from a dict of equal-length lists.
# Every key becomes a column label; the rows receive a default integer index.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
frame = pd.DataFrame(data=data)

In [31]:
# Evaluate the frame as the cell's last expression so the notebook renders it as a table
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [32]:
# For a large dataframe, call the `head` method to display only the leading rows
# (called without an argument it returns the first five rows, as in the output below)
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [33]:
# Passing `columns` at construction time fixes the order of the columns in the result
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [34]:
# A requested column with no matching key in `data` ('debt' here) is still created;
# every value in that column is NaN
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop','debt'])
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [35]:
# Row labels can be supplied at construction time too, alongside the column order
column_order = ['year', 'state', 'pop', 'debt']
row_labels = ['one', 'two', 'three', 'four', 'five', 'six']
frame2 = pd.DataFrame(data, columns=column_order, index=row_labels)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [36]:
# Get the row labels: the `index` attribute holds them as an Index object
frame2.index

Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')

In [37]:
# Get the column labels: the `columns` attribute is an Index object as well
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [38]:
# Get a column as a Series using dict-style lookup by column label
# (the Series keeps the frame's index and takes the column label as its name)
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [39]:
# Alternative: attribute-style access returns the same Series.
# But this method works only when the column name is a valid Python variable name
# and does not clash with an existing DataFrame attribute or method
# (e.g. `frame2.pop` refers to the DataFrame.pop method, not the 'pop' column).
frame2.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [40]:
# Get a row by its index label with `.loc`; the result is a Series whose index
# is the frame's column labels and whose name is the row label
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [41]:
# Assign values to entries.
# frame2['debt'] refers to a whole column, so assigning a scalar sets the same
# value in every entry of that column.
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [42]:
# Use an iterable to assign values: one value per row, in row order
# (range(6) supplies exactly six values for the six rows of frame2)
frame2['debt'] = range(6)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4
six,2003,Nevada,3.2,5


In [43]:
# Use a NumPy array to assign values; the column takes on the array's element type
frame2['debt'] = np.arange(6.0) # If np.arange(6) is used, the entries will be integers
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [45]:
# Assign with a Series: values are matched to rows by index label, not by position.
# Rows whose label is absent from the Series ('one', 'three', 'six') are set to NaN.
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [49]:
# Add a new column by assigning to a label that does not exist yet.
# The comparison yields a boolean Series; it is printed first, then stored as 'eastern'.
is_ohio = frame2['state'] == 'Ohio'
print(is_ohio)
frame2['eastern'] = is_ohio
frame2

one       True
two       True
three     True
four     False
five     False
six      False
Name: state, dtype: bool


Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [50]:
# Delete a column in place with the `del` statement (frame2 itself is modified)
del frame2['eastern']
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [53]:
# Indexing a column may hand back a view onto the frame's data rather than a copy.
# In the run recorded below, writing through `col` changed frame2 itself, but pandas
# also emitted SettingWithCopyWarning: this write-through behaviour is not guaranteed.
# NOTE(review): with Copy-on-Write enabled (the default from pandas 3.0), `col` behaves
# as a copy and frame2 is left untouched -- prefer frame2.loc['one', 'pop'] = ... when
# the intent is to modify the frame.
col = frame2['pop']
col['one'] = 100
print(frame2)
col['one'] = 1.5 # Reset to the original value

       year   state    pop  debt
one    2000    Ohio  100.0   NaN
two    2001    Ohio    1.7  -1.2
three  2002    Ohio    3.6   NaN
four   2001  Nevada    2.4  -1.5
five   2002  Nevada    2.9  -1.7
six    2003  Nevada    3.2   NaN


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  col['one'] = 100
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  col['one'] = 1.5 # Reset to the original value


In [58]:
# Build a DataFrame from a dict of dicts:
# the outer keys become the columns and the inner keys become the row labels.
pop = dict(
    Nevada={2001: 2.4, 2002: 2.9},
    Ohio={2000: 1.5, 2001: 1.7, 2002: 3.6},
)
# Manually specify the index, otherwise 2000 would be after 2001 and 2002.
year_index = [2000, 2001, 2002]
frame3 = pd.DataFrame(pop, index=year_index)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [59]:
# Transpose a dataframe: `.T` swaps rows and columns (states become the row labels)
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [60]:
# Column 2000 of the transposed frame is row 2000 of frame3,
# returned as a Series indexed by state
frame3.T[2000]

Nevada    NaN
Ohio      1.5
Name: 2000, dtype: float64

In [65]:
# Create a dataframe from a dict of Series.
# Each dict key becomes a column and the Series are aligned on their index labels.
# `.iloc` makes it explicit that the slices are positional: the index holds integer
# year labels, so a bare `[:2]` is easy to misread as a label-based slice.
ohio_head = frame3['Ohio'].iloc[:-1]     # Column 'Ohio' except the last entry
nevada_head = frame3['Nevada'].iloc[:2]  # The first two elements in column 'Nevada'
print(ohio_head)
print(nevada_head)
# Use a fresh name instead of rebinding `data`: earlier cells build their frames
# from `data`, and re-running them must keep producing the same result.
series_by_state = {'Ohio': ohio_head, 'Nevada': nevada_head}
pd.DataFrame(series_by_state)

2000    1.5
2001    1.7
Name: Ohio, dtype: float64
2000    NaN
2001    2.4
Name: Nevada, dtype: float64


Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


In [66]:
# Specify names for the index and the columns by setting their `name` attributes.
# The names are shown as headers when the frame is displayed.
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [67]:
# Getting the values of a dataframe as a two-dimensional NumPy array
# (both columns of frame3 hold floats, so the result is a plain float array)
frame3.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

In [68]:
# frame2 mixes integer, string and float columns, so the resulting array
# has dtype=object, the one element type that can hold every column's values
frame2.values
# Note the difference in the type of the entries.

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)