# 10 Minutes to pandas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Object Creation

<p>Creating a Series by passing a list of values, letting pandas create a default integer index:</p>

In [2]:
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

<p>Creating a DataFrame by passing a numpy array, with a datetime index and labeled columns:</p>

In [3]:
dates = pd.date_range('20161112', periods=6)
dates

DatetimeIndex(['2016-11-12', '2016-11-13', '2016-11-14', '2016-11-15',
               '2016-11-16', '2016-11-17'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2016-11-12,0.321019,0.253118,0.963255,0.758781
2016-11-13,-0.416058,-0.935586,-0.5332,-1.673287
2016-11-14,1.558996,0.462994,-0.736424,-0.300827
2016-11-15,-0.119552,2.011811,0.306702,-1.657155
2016-11-16,-0.267299,0.949943,-0.500195,-1.038988
2016-11-17,0.90686,-0.838793,-1.831986,-1.49302


<p>Creating a DataFrame by passing a dict of objects that can be converted to series-like:</p>

In [5]:
df2 = pd.DataFrame({ 'A' : 1.,
                     'B' : pd.Timestamp(20161112),
                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                     'D' : np.array([3] * 4,dtype='int32'),
                     'E' : pd.Categorical(["test","train","test","train"]),
                     'F' : 'foo'})
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,1970-01-01 00:00:00.020161112,1.0,3,test,foo
1,1.0,1970-01-01 00:00:00.020161112,1.0,3,train,foo
2,1.0,1970-01-01 00:00:00.020161112,1.0,3,test,foo
3,1.0,1970-01-01 00:00:00.020161112,1.0,3,train,foo


<p>Having specific dtypes:</p>

In [6]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing Data

<p>See the top and bottom rows of the frame:</p>

In [7]:
df.head()

Unnamed: 0,A,B,C,D
2016-11-12,0.321019,0.253118,0.963255,0.758781
2016-11-13,-0.416058,-0.935586,-0.5332,-1.673287
2016-11-14,1.558996,0.462994,-0.736424,-0.300827
2016-11-15,-0.119552,2.011811,0.306702,-1.657155
2016-11-16,-0.267299,0.949943,-0.500195,-1.038988


In [8]:
df.tail(3)

Unnamed: 0,A,B,C,D
2016-11-15,-0.119552,2.011811,0.306702,-1.657155
2016-11-16,-0.267299,0.949943,-0.500195,-1.038988
2016-11-17,0.90686,-0.838793,-1.831986,-1.49302


<p>Display the index, columns, and the underlying numpy data:</p>

In [9]:
df.index

DatetimeIndex(['2016-11-12', '2016-11-13', '2016-11-14', '2016-11-15',
               '2016-11-16', '2016-11-17'],
              dtype='datetime64[ns]', freq='D')

In [10]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [11]:
df.values

array([[ 0.32101871,  0.25311796,  0.96325493,  0.75878097],
       [-0.4160578 , -0.93558565, -0.53320032, -1.67328681],
       [ 1.55899552,  0.4629943 , -0.73642401, -0.30082668],
       [-0.1195524 ,  2.01181137,  0.30670201, -1.65715516],
       [-0.2672992 ,  0.94994279, -0.50019474, -1.03898793],
       [ 0.90685951, -0.8387927 , -1.83198562, -1.49302018]])

<p>Describe shows a quick statistic summary of your data:</p>

In [12]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.330661,0.317248,-0.388641,-0.900749
std,0.769252,1.113842,0.953799,0.964656
min,-0.416058,-0.935586,-1.831986,-1.673287
25%,-0.230362,-0.565815,-0.685618,-1.616121
50%,0.100733,0.358056,-0.516698,-1.266004
75%,0.760399,0.828206,0.104978,-0.485367
max,1.558996,2.011811,0.963255,0.758781


<p>Transposing your data:</p>

In [13]:
df.T

Unnamed: 0,2016-11-12 00:00:00,2016-11-13 00:00:00,2016-11-14 00:00:00,2016-11-15 00:00:00,2016-11-16 00:00:00,2016-11-17 00:00:00
A,0.321019,-0.416058,1.558996,-0.119552,-0.267299,0.90686
B,0.253118,-0.935586,0.462994,2.011811,0.949943,-0.838793
C,0.963255,-0.5332,-0.736424,0.306702,-0.500195,-1.831986
D,0.758781,-1.673287,-0.300827,-1.657155,-1.038988,-1.49302


<p>Sorting by an axis:</p>

In [14]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2016-11-12,0.758781,0.963255,0.253118,0.321019
2016-11-13,-1.673287,-0.5332,-0.935586,-0.416058
2016-11-14,-0.300827,-0.736424,0.462994,1.558996
2016-11-15,-1.657155,0.306702,2.011811,-0.119552
2016-11-16,-1.038988,-0.500195,0.949943,-0.267299
2016-11-17,-1.49302,-1.831986,-0.838793,0.90686


<p>Sorting by values:</p>

In [15]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2016-11-13,-0.416058,-0.935586,-0.5332,-1.673287
2016-11-17,0.90686,-0.838793,-1.831986,-1.49302
2016-11-12,0.321019,0.253118,0.963255,0.758781
2016-11-14,1.558996,0.462994,-0.736424,-0.300827
2016-11-16,-0.267299,0.949943,-0.500195,-1.038988
2016-11-15,-0.119552,2.011811,0.306702,-1.657155


## Selection

<p>See the indexing documentation Indexing and Selecting Data and MultiIndex/Advanced Indexing.</p>

### Getting

Selecting a single column, which yields a Series, equivalent to df.A:

In [16]:
df['A']

2016-11-12    0.321019
2016-11-13   -0.416058
2016-11-14    1.558996
2016-11-15   -0.119552
2016-11-16   -0.267299
2016-11-17    0.906860
Freq: D, Name: A, dtype: float64

<p>Selecting via [ ], which slices the rows.</p>

In [17]:
df[0:3]

Unnamed: 0,A,B,C,D
2016-11-12,0.321019,0.253118,0.963255,0.758781
2016-11-13,-0.416058,-0.935586,-0.5332,-1.673287
2016-11-14,1.558996,0.462994,-0.736424,-0.300827


In [18]:
df['20161112':'20161114']

Unnamed: 0,A,B,C,D
2016-11-12,0.321019,0.253118,0.963255,0.758781
2016-11-13,-0.416058,-0.935586,-0.5332,-1.673287
2016-11-14,1.558996,0.462994,-0.736424,-0.300827


### Selection by Label

For getting a cross section using a label:

In [19]:
df.loc[dates[0]]

A    0.321019
B    0.253118
C    0.963255
D    0.758781
Name: 2016-11-12 00:00:00, dtype: float64

Selecting on a multi-axis by label:

In [20]:
df.loc[:,['A','B']]

Unnamed: 0,A,B
2016-11-12,0.321019,0.253118
2016-11-13,-0.416058,-0.935586
2016-11-14,1.558996,0.462994
2016-11-15,-0.119552,2.011811
2016-11-16,-0.267299,0.949943
2016-11-17,0.90686,-0.838793


Showing label slicing, both endpoints are included:

In [21]:
df.loc['20161112':'20161114',['A','B']]

Unnamed: 0,A,B
2016-11-12,0.321019,0.253118
2016-11-13,-0.416058,-0.935586
2016-11-14,1.558996,0.462994


Reduction in the dimensions of the returned object:

In [22]:
df.loc['20161112',['A','B']]

A    0.321019
B    0.253118
Name: 2016-11-12 00:00:00, dtype: float64

For getting a scalar value:

In [23]:
df.loc[dates[0],'A']

0.32101870585660514

For getting fast access to a scalar value (equivalent to the prior method):

In [24]:
df.at[dates[0],'A']

0.32101870585660514

### Selection by Position

Select via the position of the passed integers:

In [25]:
df.iloc[3]

A   -0.119552
B    2.011811
C    0.306702
D   -1.657155
Name: 2016-11-15 00:00:00, dtype: float64

By integer slices, acting similar to the numpy/python style:

In [26]:
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2016-11-15,-0.119552,2.011811
2016-11-16,-0.267299,0.949943


By lists of integer position locations, similar to the numpy/python style:

In [27]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2016-11-13,-0.416058,-0.5332
2016-11-14,1.558996,-0.736424
2016-11-16,-0.267299,-0.500195


For slicing rows explicitly:

In [28]:
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2016-11-13,-0.416058,-0.935586,-0.5332,-1.673287
2016-11-14,1.558996,0.462994,-0.736424,-0.300827


For slicing columns explicitly:

In [29]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2016-11-12,0.253118,0.963255
2016-11-13,-0.935586,-0.5332
2016-11-14,0.462994,-0.736424
2016-11-15,2.011811,0.306702
2016-11-16,0.949943,-0.500195
2016-11-17,-0.838793,-1.831986


For getting a value explicitly:

In [30]:
df.iloc[1,1]

-0.93558565365256752

For getting fast access to a scalar (equivalent to the prior method):

In [31]:
df.iat[1,1]

-0.93558565365256752

### Boolean Indexing

Using a single column's values to select data:

In [32]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2016-11-12,0.321019,0.253118,0.963255,0.758781
2016-11-14,1.558996,0.462994,-0.736424,-0.300827
2016-11-17,0.90686,-0.838793,-1.831986,-1.49302


A where operation for getting:

In [33]:
df[df > 0]

Unnamed: 0,A,B,C,D
2016-11-12,0.321019,0.253118,0.963255,0.758781
2016-11-13,,,,
2016-11-14,1.558996,0.462994,,
2016-11-15,,2.011811,0.306702,
2016-11-16,,0.949943,,
2016-11-17,0.90686,,,


Using the isin() method for filtering:

In [34]:
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2016-11-12,0.321019,0.253118,0.963255,0.758781,one
2016-11-13,-0.416058,-0.935586,-0.5332,-1.673287,one
2016-11-14,1.558996,0.462994,-0.736424,-0.300827,two
2016-11-15,-0.119552,2.011811,0.306702,-1.657155,three
2016-11-16,-0.267299,0.949943,-0.500195,-1.038988,four
2016-11-17,0.90686,-0.838793,-1.831986,-1.49302,three


In [35]:
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2016-11-14,1.558996,0.462994,-0.736424,-0.300827,two
2016-11-16,-0.267299,0.949943,-0.500195,-1.038988,four


### Setting

Setting a new column automatically aligns the data by the indexes:

In [36]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20161112', periods=6))
s1

2016-11-12    1
2016-11-13    2
2016-11-14    3
2016-11-15    4
2016-11-16    5
2016-11-17    6
Freq: D, dtype: int64

In [37]:
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.321019,0.253118,0.963255,0.758781,1
2016-11-13,-0.416058,-0.935586,-0.5332,-1.673287,2
2016-11-14,1.558996,0.462994,-0.736424,-0.300827,3
2016-11-15,-0.119552,2.011811,0.306702,-1.657155,4
2016-11-16,-0.267299,0.949943,-0.500195,-1.038988,5
2016-11-17,0.90686,-0.838793,-1.831986,-1.49302,6


Setting values by label:

In [38]:
df.at[dates[0],'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.253118,0.963255,0.758781,1
2016-11-13,-0.416058,-0.935586,-0.5332,-1.673287,2
2016-11-14,1.558996,0.462994,-0.736424,-0.300827,3
2016-11-15,-0.119552,2.011811,0.306702,-1.657155,4
2016-11-16,-0.267299,0.949943,-0.500195,-1.038988,5
2016-11-17,0.90686,-0.838793,-1.831986,-1.49302,6


Setting values by position:

In [39]:
df.iat[0,1] = 0
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,0.963255,0.758781,1
2016-11-13,-0.416058,-0.935586,-0.5332,-1.673287,2
2016-11-14,1.558996,0.462994,-0.736424,-0.300827,3
2016-11-15,-0.119552,2.011811,0.306702,-1.657155,4
2016-11-16,-0.267299,0.949943,-0.500195,-1.038988,5
2016-11-17,0.90686,-0.838793,-1.831986,-1.49302,6


Setting by assigning with a numpy array:

In [40]:
df.loc[:,'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,0.963255,5,1
2016-11-13,-0.416058,-0.935586,-0.5332,5,2
2016-11-14,1.558996,0.462994,-0.736424,5,3
2016-11-15,-0.119552,2.011811,0.306702,5,4
2016-11-16,-0.267299,0.949943,-0.500195,5,5
2016-11-17,0.90686,-0.838793,-1.831986,5,6


In [41]:
# Make DataFrame df match tutorial df
df.loc[:,'F'] = [np.nan, 1,2,3,4,5]
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,0.963255,5,
2016-11-13,-0.416058,-0.935586,-0.5332,5,1.0
2016-11-14,1.558996,0.462994,-0.736424,5,2.0
2016-11-15,-0.119552,2.011811,0.306702,5,3.0
2016-11-16,-0.267299,0.949943,-0.500195,5,4.0
2016-11-17,0.90686,-0.838793,-1.831986,5,5.0


A where operation with setting:

In [42]:
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,-0.963255,-5,
2016-11-13,-0.416058,-0.935586,-0.5332,-5,-1.0
2016-11-14,-1.558996,-0.462994,-0.736424,-5,-2.0
2016-11-15,-0.119552,-2.011811,-0.306702,-5,-3.0
2016-11-16,-0.267299,-0.949943,-0.500195,-5,-4.0
2016-11-17,-0.90686,-0.838793,-1.831986,-5,-5.0


## Missing Data

pandas primarily (?!) uses the value np.nan to represent missing data.  
It is by default not included in computations.  


Reindexing allows you to change/add/delete the index on a specified axis.  
This returns a copy of the data.  

In [43]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1

Unnamed: 0,A,B,C,D,F,E
2016-11-12,0.0,0.0,0.963255,5,,
2016-11-13,-0.416058,-0.935586,-0.5332,5,1.0,
2016-11-14,1.558996,0.462994,-0.736424,5,2.0,
2016-11-15,-0.119552,2.011811,0.306702,5,3.0,


In [44]:
df1.loc[dates[0]:dates[1],'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2016-11-12,0.0,0.0,0.963255,5,,1.0
2016-11-13,-0.416058,-0.935586,-0.5332,5,1.0,1.0
2016-11-14,1.558996,0.462994,-0.736424,5,2.0,
2016-11-15,-0.119552,2.011811,0.306702,5,3.0,


To drop any rows that have missing data:

In [45]:
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2016-11-13,-0.416058,-0.935586,-0.5332,5,1.0,1.0


In [46]:
df1

Unnamed: 0,A,B,C,D,F,E
2016-11-12,0.0,0.0,0.963255,5,,1.0
2016-11-13,-0.416058,-0.935586,-0.5332,5,1.0,1.0
2016-11-14,1.558996,0.462994,-0.736424,5,2.0,
2016-11-15,-0.119552,2.011811,0.306702,5,3.0,


Filling missing data:

In [47]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2016-11-12,0.0,0.0,0.963255,5,5.0,1.0
2016-11-13,-0.416058,-0.935586,-0.5332,5,1.0,1.0
2016-11-14,1.558996,0.462994,-0.736424,5,2.0,5.0
2016-11-15,-0.119552,2.011811,0.306702,5,3.0,5.0


To get the boolean mask where values are nan:

In [48]:
pd.isnull(df1)

Unnamed: 0,A,B,C,D,F,E
2016-11-12,False,False,False,False,True,False
2016-11-13,False,False,False,False,False,False
2016-11-14,False,False,False,False,False,True
2016-11-15,False,False,False,False,False,True


## Operations 

See the Basic section on Binary Ops.

### Stats

Operations in general <em>exclude</em> missing data.

Performing a descriptive statistic:

In [49]:
df.mean()

A    0.277158
B    0.275062
C   -0.388641
D    5.000000
F    3.000000
dtype: float64

Same operation on the other axis:

In [50]:
df.mean(1)

2016-11-12    1.490814
2016-11-13    0.823031
2016-11-14    1.657113
2016-11-15    2.039792
2016-11-16    1.836490
2016-11-17    1.647216
Freq: D, dtype: float64

Operating with objects that have different dimensionality and need alignment.  
In addition, pandas broadcasts along the specified dimension.

In [51]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
s

2016-11-12    NaN
2016-11-13    NaN
2016-11-14    1.0
2016-11-15    3.0
2016-11-16    5.0
2016-11-17    NaN
Freq: D, dtype: float64

In [52]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D,F
2016-11-12,,,,,
2016-11-13,,,,,
2016-11-14,0.558996,-0.537006,-1.736424,4.0,1.0
2016-11-15,-3.119552,-0.988189,-2.693298,2.0,0.0
2016-11-16,-5.267299,-4.050057,-5.500195,0.0,-1.0
2016-11-17,,,,,


### Apply

Applying functions to the data:

In [53]:
df

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,0.963255,5,
2016-11-13,-0.416058,-0.935586,-0.5332,5,1.0
2016-11-14,1.558996,0.462994,-0.736424,5,2.0
2016-11-15,-0.119552,2.011811,0.306702,5,3.0
2016-11-16,-0.267299,0.949943,-0.500195,5,4.0
2016-11-17,0.90686,-0.838793,-1.831986,5,5.0


In [54]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2016-11-12,0.0,0.0,0.963255,5,
2016-11-13,-0.416058,-0.935586,0.430055,10,1.0
2016-11-14,1.142938,-0.472591,-0.306369,15,3.0
2016-11-15,1.023385,1.53922,0.000333,20,6.0
2016-11-16,0.756086,2.489163,-0.499862,25,10.0
2016-11-17,1.662946,1.65037,-2.331848,30,15.0


In [55]:
df.apply(lambda x: x.max() - x.min())

A    1.975053
B    2.947397
C    2.795241
D    0.000000
F    4.000000
dtype: float64

### Histogramming

See more at Histogramming and Discretization.

In [56]:
s = pd.Series(np.random.randint(0, 7, size=10))
s

0    0
1    2
2    4
3    6
4    2
5    0
6    6
7    0
8    2
9    6
dtype: int64

In [57]:
s.value_counts()

6    3
2    3
0    3
4    1
dtype: int64

### String Methods

Series is equipped with a set of string processing methods in the str attribute that make it easy to operate on each element of the array, as in the code snippet below.  
Note that the pattern-matching in str generally uses regular expressions by default, and in some cases always uses regexes.  
See more at Vectorized String Methods. 

In [58]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [59]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

## Merge

### Concat

See the Merging section.  
Concatenating pandas objects together with concat():

In [60]:
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,0.892157,0.223927,-0.656617,0.652973
1,1.265082,-0.554515,2.692443,-1.077673
2,-0.200476,0.441707,-0.555561,-0.521179
3,-1.50677,-0.547829,0.538228,-0.56315
4,-1.250383,0.649648,-0.173917,-0.161883
5,-0.678727,-0.501232,-0.758787,-0.072932
6,1.04035,0.435076,0.833642,0.847521
7,1.296809,2.69142,1.545096,0.220965
8,-1.204751,0.625082,-1.858298,-0.967368
9,2.233012,-0.219081,-1.605697,-0.098209


In [61]:
# break it into pieces
pieces = [df[:3], df[3:7], df[7:]]
pieces

[          0         1         2         3
 0  0.892157  0.223927 -0.656617  0.652973
 1  1.265082 -0.554515  2.692443 -1.077673
 2 -0.200476  0.441707 -0.555561 -0.521179,
           0         1         2         3
 3 -1.506770 -0.547829  0.538228 -0.563150
 4 -1.250383  0.649648 -0.173917 -0.161883
 5 -0.678727 -0.501232 -0.758787 -0.072932
 6  1.040350  0.435076  0.833642  0.847521,
           0         1         2         3
 7  1.296809  2.691420  1.545096  0.220965
 8 -1.204751  0.625082 -1.858298 -0.967368
 9  2.233012 -0.219081 -1.605697 -0.098209]

In [62]:
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,0.892157,0.223927,-0.656617,0.652973
1,1.265082,-0.554515,2.692443,-1.077673
2,-0.200476,0.441707,-0.555561,-0.521179
3,-1.50677,-0.547829,0.538228,-0.56315
4,-1.250383,0.649648,-0.173917,-0.161883
5,-0.678727,-0.501232,-0.758787,-0.072932
6,1.04035,0.435076,0.833642,0.847521
7,1.296809,2.69142,1.545096,0.220965
8,-1.204751,0.625082,-1.858298,-0.967368
9,2.233012,-0.219081,-1.605697,-0.098209


### Join

SQL style merges.  
See the Database style joining.

In [63]:
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [64]:
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [65]:
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


Another example that can be given is:

In [68]:
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [69]:
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
right

Unnamed: 0,key,rval
0,foo,4
1,bar,5


In [70]:
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


### Append

Append rows to a dataframe.  
See the Appending.

In [74]:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,0.940416,2.754753,0.597961,-0.103247
1,-0.77496,0.681974,-0.841075,-1.090128
2,-0.01576,-0.040093,-1.445794,-1.580715
3,1.429497,1.729224,1.394232,-1.636049
4,0.858742,0.353892,-1.403011,-1.621984
5,0.955424,0.244535,1.072341,-1.2122
6,0.700364,0.817684,0.287086,-0.115659
7,-1.052464,0.887217,-0.604326,-0.612835


In [78]:
s = df.iloc[3]
s

A    1.429497
B    1.729224
C    1.394232
D   -1.636049
Name: 3, dtype: float64

In [82]:
df.append(s, ignore_index=True)  
# leaving out ignore_index means the appended row will have an index of 3 instead of 8

Unnamed: 0,A,B,C,D
0,0.940416,2.754753,0.597961,-0.103247
1,-0.77496,0.681974,-0.841075,-1.090128
2,-0.01576,-0.040093,-1.445794,-1.580715
3,1.429497,1.729224,1.394232,-1.636049
4,0.858742,0.353892,-1.403011,-1.621984
5,0.955424,0.244535,1.072341,-1.2122
6,0.700364,0.817684,0.287086,-0.115659
7,-1.052464,0.887217,-0.604326,-0.612835
8,1.429497,1.729224,1.394232,-1.636049


## Grouping 