# Pandas Documentation on DataFrame

In this notebook, you will work through the Pandas documentation on DataFrames.

## Imports

In [1]:
import numpy as np
import pandas as pd

## DataFrame

In this notebook, you are going to learn how to use `pandas.DataFrame` by typing the code from the Pandas documentation into this notebook.

* Go to the Pandas [DataFrame Documentation](http://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe).
* Type all of the code from that section of the documentation into this notebook and get it working.
* **To learn this API well, you must type the code rather than copy and pasting it**.
* Create a new cell in this section for each `In[]` prompt in the documentation.
* Ignore the cells in the **Grading** section below.
* No Markdown comments are needed.
* Skip the following sub-sections:
  - From structured or record array
  - Alternate Constructors
  - Assigning New Columns in Method Chains
  - Console display
  - DataFrame column attribute access and IPython completion

## Grading

In [2]:
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
     'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}

In [5]:
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [7]:
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4
b,2.0,2
a,1.0,1


In [8]:
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])

Unnamed: 0,two,three
d,4,
b,2,
a,1,


In [9]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [10]:
df.columns

Index(['one', 'two'], dtype='object')

In [11]:
d = {'one' : [1., 2., 3., 4.],
     'two' : [4., 3., 2., 1.]}

pd.DataFrame(d)

Unnamed: 0,one,two
0,1,4
1,2,3
2,3,2
3,4,1


In [12]:
pd.DataFrame(d, index=['a', 'b', 'c', 'd'])

Unnamed: 0,one,two
a,1,4
b,2,3
c,3,2
d,4,1


In [21]:
data2 = [{'a' : 1, 'b' : 2}, {'a': 5, 'b' : 10, 'c' : 20}]
pd.DataFrame(data2)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [22]:
pd.DataFrame(data2, index=['first', 'second'])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [23]:
pd.DataFrame(data2, columns=['a', 'b'])

Unnamed: 0,a,b
0,1,2
1,5,10


In [24]:
pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2}, ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4}, ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6}, ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8}, ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})


Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,a,b,c,a,b
A,B,4.0,1.0,5.0,8.0,10.0
A,C,3.0,2.0,6.0,7.0,
A,D,,,,,9.0


In [25]:
df['one']

a     1
b     2
c     3
d   NaN
Name: one, dtype: float64

In [26]:
df['three'] = df['one'] * df['two']
df['flag'] = df['one'] > 2
df

Unnamed: 0,one,two,three,flag
a,1.0,1,1.0,False
b,2.0,2,4.0,False
c,3.0,3,9.0,True
d,,4,,False


In [27]:
del df['two']

In [28]:
three = df.pop('three')
df

Unnamed: 0,one,flag
a,1.0,False
b,2.0,False
c,3.0,True
d,,False


In [29]:
df['foo'] = 'bar'
df

Unnamed: 0,one,flag,foo
a,1.0,False,bar
b,2.0,False,bar
c,3.0,True,bar
d,,False,bar


In [30]:
df['one_trunc'] = df['one'][:2]
df

Unnamed: 0,one,flag,foo,one_trunc
a,1.0,False,bar,1.0
b,2.0,False,bar,2.0
c,3.0,True,bar,
d,,False,bar,


In [31]:
df.insert(1, 'bar', df['one'])
df

Unnamed: 0,one,bar,flag,foo,one_trunc
a,1.0,1.0,False,bar,1.0
b,2.0,2.0,False,bar,2.0
c,3.0,3.0,True,bar,
d,,,False,bar,


In [32]:
df.loc['b']

one              2
bar              2
flag         False
foo            bar
one_trunc        2
Name: b, dtype: object

In [33]:
df.iloc[2]

one             3
bar             3
flag         True
foo           bar
one_trunc     NaN
Name: c, dtype: object

In [34]:
df = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])

In [35]:
df + df2

Unnamed: 0,A,B,C,D
0,0.612696,1.582193,2.442391,
1,-1.080286,-1.297347,-4.575589,
2,0.194104,0.593363,1.922629,
3,-0.685746,0.811607,-1.689754,
4,1.508904,0.507244,-0.596899,
5,-1.457685,0.719474,-0.068303,
6,-1.290063,-2.762058,-1.434951,
7,,,,
8,,,,
9,,,,


In [36]:
df - df.iloc[0]

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,-1.609178,-1.488553,-2.121538,1.193012
2,-0.687104,-1.410872,0.275175,-0.474293
3,-0.655963,-0.97019,-2.662272,1.482432
4,0.926488,-0.96915,-2.443906,1.04175
5,-0.363819,-1.384792,-1.254007,-2.048695
6,-1.666833,-2.366185,-2.615724,-0.217778
7,-3.130698,-2.234334,-0.521929,0.774031
8,-0.091256,-2.263163,-0.096729,1.174963
9,0.192331,0.698358,-0.419969,0.793314


In [38]:
index = pd.date_range('1/1/2000', periods=8)
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=list('ABC'))

In [39]:
df

Unnamed: 0,A,B,C
2000-01-01,-0.468784,0.505409,-0.360057
2000-01-02,0.970974,0.334218,0.409123
2000-01-03,-0.21377,-0.193701,-0.5925
2000-01-04,1.586683,0.425724,2.487441
2000-01-05,0.004959,-0.154486,-1.188976
2000-01-06,1.047423,-1.032974,-0.123911
2000-01-07,0.285291,-0.707373,1.066949
2000-01-08,0.775757,-0.255798,-2.864177


In [40]:
type(df['A'])

pandas.core.series.Series

In [41]:
df - df['A']

Unnamed: 0,2000-01-01 00:00:00,2000-01-02 00:00:00,2000-01-03 00:00:00,2000-01-04 00:00:00,2000-01-05 00:00:00,2000-01-06 00:00:00,2000-01-07 00:00:00,2000-01-08 00:00:00,A,B,C
2000-01-01,,,,,,,,,,,
2000-01-02,,,,,,,,,,,
2000-01-03,,,,,,,,,,,
2000-01-04,,,,,,,,,,,
2000-01-05,,,,,,,,,,,
2000-01-06,,,,,,,,,,,
2000-01-07,,,,,,,,,,,
2000-01-08,,,,,,,,,,,


In [42]:
df * 5 + 2

Unnamed: 0,A,B,C
2000-01-01,-0.343919,4.527046,0.199717
2000-01-02,6.854868,3.671088,4.045614
2000-01-03,0.93115,1.031496,-0.962502
2000-01-04,9.933413,4.128622,14.437204
2000-01-05,2.024797,1.22757,-3.944882
2000-01-06,7.237115,-3.16487,1.380444
2000-01-07,3.426455,-1.536864,7.334747
2000-01-08,5.878783,0.721011,-12.320887


In [43]:
1 / df

Unnamed: 0,A,B,C
2000-01-01,-2.13318,1.978595,-2.777341
2000-01-02,1.029894,2.992063,2.444254
2000-01-03,-4.677925,-5.1626,-1.687762
2000-01-04,0.630246,2.348938,0.40202
2000-01-05,201.639925,-6.473081,-0.84106
2000-01-06,0.954724,-0.968079,-8.070297
2000-01-07,3.505193,-1.413682,0.937252
2000-01-08,1.289064,-3.909337,-0.34914


In [44]:
df ** 4

Unnamed: 0,A,B,C
2000-01-01,0.04829365,0.065249,0.016807
2000-01-02,0.8888522,0.012477,0.028017
2000-01-03,0.002088272,0.001408,0.123241
2000-01-04,6.338117,0.032848,38.283445
2000-01-05,6.049144e-10,0.00057,1.998448
2000-01-06,1.203617,1.138564,0.000236
2000-01-07,0.006624486,0.250376,1.295911
2000-01-08,0.3621612,0.004281,67.297615


In [46]:
df1 = pd.DataFrame({'a' : [1, 0, 1], 'b' : [0, 1, 1] }, dtype=bool)
df2 = pd.DataFrame({'a' : [0, 1, 1], 'b' : [1, 1, 0] }, dtype=bool)

In [47]:
df1 & df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [48]:
df1 | df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [49]:
df1 ^ df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [50]:
-df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


In [51]:
df[:5].T

Unnamed: 0,2000-01-01 00:00:00,2000-01-02 00:00:00,2000-01-03 00:00:00,2000-01-04 00:00:00,2000-01-05 00:00:00
A,-0.468784,0.970974,-0.21377,1.586683,0.004959
B,0.505409,0.334218,-0.193701,0.425724,-0.154486
C,-0.360057,0.409123,-0.5925,2.487441,-1.188976


In [52]:
np.exp(df)

Unnamed: 0,A,B,C
2000-01-01,0.625763,1.657664,0.697637
2000-01-02,2.640514,1.396847,1.505497
2000-01-03,0.807534,0.823904,0.552943
2000-01-04,4.887508,1.530699,12.030448
2000-01-05,1.004972,0.856856,0.304533
2000-01-06,2.850296,0.355947,0.883458
2000-01-07,1.330149,0.492938,2.906499
2000-01-08,2.172235,0.774298,0.05703


In [53]:
np.asarray(df)

array([[-0.46878371,  0.50540918, -0.36005656],
       [ 0.97097351,  0.33421758,  0.40912284],
       [-0.21377   , -0.19370086, -0.5925005 ],
       [ 1.58668262,  0.42572439,  2.4874408 ],
       [ 0.00495934, -0.15448594, -1.18897638],
       [ 1.04742293, -1.03297395, -0.12391117],
       [ 0.28529098, -0.70737284,  1.06694937],
       [ 0.77575662, -0.25579785, -2.86417743]])

In [54]:
df.T.dot(df)

Unnamed: 0,A,B,C
A,5.506116,-0.678485,2.586277
B,-0.678485,2.242611,1.418089
C,2.586277,1.418089,17.606353


In [55]:
s1 = pd.Series(np.arange(5, 10))
s1.dot(s1)

255