# 1. Getting Started with pandas

pandas is often used in tandem with numerical computing tools like NumPy and SciPy, analytical libraries like statsmodels and scikit-learn, and data visualization libraries like matplotlib.

pandas adopts significant parts of NumPy’s idiomatic style of array-based computing, especially array-based
functions and vectorization. The biggest difference is that pandas is designed for working with tabular or heterogeneous data. NumPy, by contrast, is best suited for working with homogeneous numerical array data.


In [1]:
import pandas as pd
import numpy as np
from pandas import Series
from pandas import DataFrame as DF

In [2]:
obj = Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [3]:
obj.values

array([ 4,  7, -5,  3])

In [4]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
obj2 = Series([4,7,-5,3], index=['d', 'b', 'a', 'c'])

In [6]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [7]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

### Note
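In a pandas Series, selecting several labels with a list (fancy indexing) returns a new Series, and selecting a single label returns a plain scalar value. In both cases, changing the result leaves the original Series untouched. To demonstrate this: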

In [8]:
obj2[['b', 'c']]

b    7
c    3
dtype: int64

In [9]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [10]:
a = obj2['a']
a = 0
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [11]:
bc = obj2[['b','c']]
bc

b    7
c    3
dtype: int64

In [12]:
# bc is labelled 'b' and 'c', so address its elements by position with .iloc.
# Plain bc[0] relies on the deprecated fallback that treats integer keys as
# positions when the index is not integer-typed.
bc.iloc[0] = bc.iloc[1] = 0

In [13]:
bc

b    0
c    0
dtype: int64

In [14]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [15]:
obj2>0

d     True
b     True
a    False
c     True
dtype: bool

In [16]:
obj2[obj2>0]

d    4
b    7
c    3
dtype: int64

In [17]:
print(obj2*2)
print(np.exp(obj2))

d     8
b    14
a   -10
c     6
dtype: int64
d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64


### Turning a dict into a pandas Series, and indexing

Should you have data contained in a Python dict, you can create a Series from it by passing the dict:

In [18]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [19]:
obj3 = Series(sdata)
obj33 = obj3.copy()
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [20]:
obj3.sort_index()

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [21]:
##changing index
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj3.index=states

In [22]:
obj3

California    35000
Ohio          71000
Oregon        16000
Texas          5000
dtype: int64

In [23]:
obj4 = Series(sdata,index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [24]:
## detect missing data
print(pd.isnull(obj4))
print(pd.notnull(obj4))

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool


A useful Series feature for many applications is that it automatically aligns by index
label in arithmetic operations:

In [25]:
obj33

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [26]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [27]:
obj33 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

Both the Series object itself and its index have a name attribute:

In [28]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

## DataFrame
A DataFrame can be thought of as a dict of Series all sharing the same index.

In [29]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
'number' : np.array([0.5,1,-0.3,0.99,-8,28])       
       }
frame = DF(data)
frame

Unnamed: 0,state,year,pop,number
0,Ohio,2000,1.5,0.5
1,Ohio,2001,1.7,1.0
2,Ohio,2002,3.6,-0.3
3,Nevada,2001,2.4,0.99
4,Nevada,2002,2.9,-8.0
5,Nevada,2003,3.2,28.0


In [30]:
frame2 = DF(data, columns=['year','state','number','wtf'], index=['one','two','three','four','five','six'])
frame2

Unnamed: 0,year,state,number,wtf
one,2000,Ohio,0.5,
two,2001,Ohio,1.0,
three,2002,Ohio,-0.3,
four,2001,Nevada,0.99,
five,2002,Nevada,-8.0,
six,2003,Nevada,28.0,


In [31]:
frame2.columns

Index(['year', 'state', 'number', 'wtf'], dtype='object')

In [32]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [33]:
frame2.number

one       0.50
two       1.00
three    -0.30
four      0.99
five     -8.00
six      28.00
Name: number, dtype: float64

In [34]:
frame2.wtf

one      NaN
two      NaN
three    NaN
four     NaN
five     NaN
six      NaN
Name: wtf, dtype: object

In [35]:
#frame2[column] works for any column name, but frame2.column
#only works when the column name is a valid Python variable name.
frame2

Unnamed: 0,year,state,number,wtf
one,2000,Ohio,0.5,
two,2001,Ohio,1.0,
three,2002,Ohio,-0.3,
four,2001,Nevada,0.99,
five,2002,Nevada,-8.0,
six,2003,Nevada,28.0,


In [36]:
frame2.loc['three']

year      2002
state     Ohio
number    -0.3
wtf        NaN
Name: three, dtype: object

In [37]:
frame2.iloc[1,0]

2001

#### Inserting various objects into a DataFrame

In [38]:
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,number,wtf,debt
one,2000,Ohio,0.5,,16.5
two,2001,Ohio,1.0,,16.5
three,2002,Ohio,-0.3,,16.5
four,2001,Nevada,0.99,,16.5
five,2002,Nevada,-8.0,,16.5
six,2003,Nevada,28.0,,16.5


In [39]:
frame2['debt'] = np.arange(6.)
frame2


Unnamed: 0,year,state,number,wtf,debt
one,2000,Ohio,0.5,,0.0
two,2001,Ohio,1.0,,1.0
three,2002,Ohio,-0.3,,2.0
four,2001,Nevada,0.99,,3.0
five,2002,Nevada,-8.0,,4.0
six,2003,Nevada,28.0,,5.0


In [40]:
frame2['debt'] = np.linspace(1,2,6)
frame2

Unnamed: 0,year,state,number,wtf,debt
one,2000,Ohio,0.5,,1.0
two,2001,Ohio,1.0,,1.2
three,2002,Ohio,-0.3,,1.4
four,2001,Nevada,0.99,,1.6
five,2002,Nevada,-8.0,,1.8
six,2003,Nevada,28.0,,2.0


In [41]:
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,number,wtf,debt
one,2000,Ohio,0.5,,
two,2001,Ohio,1.0,,-1.2
three,2002,Ohio,-0.3,,
four,2001,Nevada,0.99,,-1.5
five,2002,Nevada,-8.0,,-1.7
six,2003,Nevada,28.0,,


In [42]:
dc = {'dt':[1,2,3,4,5,6], 'ds':['a','b','c','d','e','f']}

In [43]:
frame2[list(dc.keys())[0]] = dc['dt']
frame2

Unnamed: 0,year,state,number,wtf,debt,dt
one,2000,Ohio,0.5,,,1
two,2001,Ohio,1.0,,-1.2,2
three,2002,Ohio,-0.3,,,3
four,2001,Nevada,0.99,,-1.5,4
five,2002,Nevada,-8.0,,-1.7,5
six,2003,Nevada,28.0,,,6


In [44]:
dcs = Series(dc)
dcs

dt    [1, 2, 3, 4, 5, 6]
ds    [a, b, c, d, e, f]
dtype: object

In [45]:
list(dcs.index)[1]

'ds'

In [46]:
frame2[list(dcs.index)[1]] = dcs['ds']

In [47]:
frame2

Unnamed: 0,year,state,number,wtf,debt,dt,ds
one,2000,Ohio,0.5,,,1,a
two,2001,Ohio,1.0,,-1.2,2,b
three,2002,Ohio,-0.3,,,3,c
four,2001,Nevada,0.99,,-1.5,4,d
five,2002,Nevada,-8.0,,-1.7,5,e
six,2003,Nevada,28.0,,,6,f


In [48]:
del frame2['debt'],frame2['dt'],frame2['ds']

In [49]:
frame2

Unnamed: 0,year,state,number,wtf
one,2000,Ohio,0.5,
two,2001,Ohio,1.0,
three,2002,Ohio,-0.3,
four,2001,Nevada,0.99,
five,2002,Nevada,-8.0,
six,2003,Nevada,28.0,


In [50]:
frame2['eastern'] = frame2['state'] == 'Ohio'
frame2

Unnamed: 0,year,state,number,wtf,eastern
one,2000,Ohio,0.5,,True
two,2001,Ohio,1.0,,True
three,2002,Ohio,-0.3,,True
four,2001,Nevada,0.99,,False
five,2002,Nevada,-8.0,,False
six,2003,Nevada,28.0,,False


### Note
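The column returned from indexing a DataFrame is a view on the underlying data, not a copy. Thus, any in-place modifications to the Series will be reflected in the DataFrame. (When pandas' Copy-on-Write mode is enabled, the returned column behaves like a copy instead, and such modifications no longer propagate.) Use the Series's `copy` method when you explicitly need an independent copy.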

In [51]:
##Inserting a nested dict of dicts
pop = {
        'Nevada': {2001: 2.4, 2002: 2.9},
        'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}
}

frame3 = DF(pop)

In [52]:
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [53]:
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [54]:
DF(pop,index=[2001,2002,2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [55]:
##insert a list (collection) of dictionaries
dictcol = [({'a':1,'b':2,'c':3}),({'a':4,'b':5,'c':6}),({'a':7,'b':8,'c':9})]

In [56]:
DF(dictcol)

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [57]:
frame2

Unnamed: 0,year,state,number,wtf,eastern
one,2000,Ohio,0.5,,True
two,2001,Ohio,1.0,,True
three,2002,Ohio,-0.3,,True
four,2001,Nevada,0.99,,False
five,2002,Nevada,-8.0,,False
six,2003,Nevada,28.0,,False


In [58]:
frame2.values

array([[2000, 'Ohio', 0.5, nan, True],
       [2001, 'Ohio', 1.0, nan, True],
       [2002, 'Ohio', -0.3, nan, True],
       [2001, 'Nevada', 0.99, nan, False],
       [2002, 'Nevada', -8.0, nan, False],
       [2003, 'Nevada', 28.0, nan, False]], dtype=object)

In [59]:
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [60]:
frame3.index.name = 'rows'
frame3.columns.name = 'columns'
print(frame3.index.name)

rows


In [61]:
frame3

columns,Nevada,Ohio
rows,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [62]:
obj = Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

Index objects are immutable.
Unlike Python sets, a pandas Index can contain duplicate labels.
Immutability makes it safer to share Index objects among data structures:

In [63]:
labels = pd.Index(np.arange(3))
obj2 = Series([1.5, -2.5, 0], index=labels)   
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [64]:
labels

Int64Index([0, 1, 2], dtype='int64')

In [65]:
labels2 = pd.Index(np.arange(29,16,-1))
labels2

Int64Index([29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17], dtype='int64')

In [66]:
labels.union(labels2)

Int64Index([0, 1, 2, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29], dtype='int64')

In [67]:
labels.isin([1,4,3,5,0])

array([ True,  True, False])

In [68]:
labels.insert(0,1)

Int64Index([1, 0, 1, 2], dtype='int64')

### Reindexing

In [69]:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

For ordered data like time series, it may be desirable to do some interpolation or filling of values when reindexing.
The `method` option of `reindex` does this; for example, `method='ffill'` forward-fills the values:

In [70]:
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [71]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [72]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
        index=['a', 'c', 'd'],
        columns=['Ohio', 'Texas', 'California'])

In [73]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [74]:
states = ['Texas', 'Utah', 'California','cn']
frame2 = frame.reindex(['a','b','c','rw'],columns=states)

In [75]:
frame2

Unnamed: 0,Texas,Utah,California,cn
a,1.0,,2.0,
b,,,,
c,4.0,,5.0,
rw,,,,


In [76]:
frame.loc[['a', 'b', 'c', 'd'], states]

Unnamed: 0,Texas,Utah,California,cn
a,1.0,,2.0,
b,,,,
c,4.0,,5.0,
d,7.0,,8.0,


In [77]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [78]:
frame.iloc[0:2,0:2]

Unnamed: 0,Ohio,Texas
a,0,1
c,3,4


In [79]:
frame.loc[['a', 'b', 'c', 'd'], states]

Unnamed: 0,Texas,Utah,California,cn
a,1.0,,2.0,
b,,,,
c,4.0,,5.0,
d,7.0,,8.0,


In [80]:
frame2

Unnamed: 0,Texas,Utah,California,cn
a,1.0,,2.0,
b,,,,
c,4.0,,5.0,
rw,,,,


In [81]:
frame2.iloc[[0,3]]

Unnamed: 0,Texas,Utah,California,cn
a,1.0,,2.0,
rw,,,,


In [82]:
frame2.iloc[[0,2],[2,3]]

Unnamed: 0,California,cn
a,2.0,
c,5.0,


### Dropping

In [83]:
obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [84]:
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [85]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
        index=['Ohio', 'Colorado', 'Utah', 'New York'],
        columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [86]:
data.drop(['Ohio','Colorado'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [87]:
data.drop(['two','four'],axis=1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [88]:
data.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [89]:
data.drop('Ohio', axis='rows')

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [90]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [91]:
obj.drop('c')

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [92]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

inplace: manipulate an object in place without returning a new object. <br>
Be careful with `inplace`, as it destroys any data that is dropped.

In [93]:
obj.drop('c',inplace=True)

In [94]:
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

### Indexing, Selection, and Filtering

In [95]:
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [96]:
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [97]:
# obj is labelled 'a'..'d'; select the 2nd and 4th elements by position with
# .iloc instead of relying on the deprecated integer-as-position fallback of [].
obj.iloc[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [98]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

Slicing with labels behaves differently than normal Python slicing in that the endpoint
is inclusive:

In [99]:
obj['b':'c'] = 5
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [100]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [101]:
data['three']>5

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool

In [102]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [103]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [104]:
data.loc['Utah',['two','three']]

two       9
three    10
Name: Utah, dtype: int64

In [105]:
data.loc[['Utah','New York'],['two','three']]

Unnamed: 0,two,three
Utah,9,10
New York,13,14


In [106]:
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,4,5
Utah,11,8,9


In [107]:
data.loc[:'Utah', 'two']

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int64

In [108]:
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


In [109]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [110]:
data.iat[1,3]

7

In [111]:
data.at['New York','three']

14

In [112]:
data.loc['Ohio','three']

2

### Integer Indexes

In [113]:
ser0 = Series(np.arange(3.),index=range(3))
ser0

0    0.0
1    1.0
2    2.0
dtype: float64

In [114]:
ser = Series(np.arange(3.))
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [115]:
ser[-1] #This raises error

KeyError: -1

In [None]:
ser.iloc[-1]

In [None]:
ser[:1] 

In [None]:
ser.loc[:1] 

In [None]:
ser.iloc[:1]

### Arithmetic and Data Alignment

When you are adding together objects, if any index pairs are not the same, the respective index in the result will be the union of the index pairs.

In [None]:
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1],
                index=['a', 'c', 'e', 'f', 'g'])

In [None]:
s1

In [None]:
s2

In [None]:
s1+s2

In [None]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
        index=['Ohio', 'Texas', 'Colorado'])

df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
        index=['Utah', 'Ohio', 'Texas', 'Oregon'])

print(df1)
print(df2)

In [None]:
dff = df1+df2
dff

In [None]:
df1.add(df2)

### Fill Missing Values

In [None]:
df1.add(df2, fill_value=0)

In [None]:
df2.add(df1, fill_value=0)

In [None]:
dff.fillna(0)

### Operations between DataFrame and Series

In [None]:
## Broadcasting Equivalence
arr = np.arange(12.).reshape((3, 4))
print(arr,'\n')
print(arr[0],'\n')
print(arr - arr[0])

In [116]:
#Operations between a DataFrame and a Series are similar:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [117]:
series=frame.iloc[0]
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [118]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


If an index value is not found in either the DataFrame’s columns or the Series’s index, the objects will be reindexed to form the union:

In [119]:
series2 = Series(range(3), index=['b', 'e', 'f'])
series2

b    0
e    1
f    2
dtype: int64

In [120]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [121]:
series3 = frame['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [122]:
frame.sub(series3, axis='rows')
#'rows' or 'index' or 0, no difference

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


### Function Application and Mapping

NumPy ufuncs (element-wise array methods) also work with pandas objects:

In [123]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
        index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [124]:
frame

Unnamed: 0,b,d,e
Utah,-0.343786,1.027151,0.755504
Ohio,-1.436706,-0.589644,0.044513
Texas,-0.011726,1.276152,-0.993051
Oregon,-1.229064,-1.653963,-0.988156


In [125]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.343786,1.027151,0.755504
Ohio,1.436706,0.589644,0.044513
Texas,0.011726,1.276152,0.993051
Oregon,1.229064,1.653963,0.988156


Another frequent operation is applying a function on one-dimensional arrays to each column or row. DataFrame’s apply method does exactly this:

In [126]:
f = lambda x: x.max() - x.min()

frame.apply(f)

b    1.424981
d    2.930115
e    1.748555
dtype: float64

In [127]:
f = lambda x: x.T.max() - x.T.min()

frame.apply(f)

b    1.424981
d    2.930115
e    1.748555
dtype: float64

In [128]:
def f(x):
    """Return the minimum and maximum of ``x`` as a Series labelled 'min' and 'max'.

    The extremes are taken from ``x.T``.
    """
    transposed = x.T
    lowest = transposed.min()
    highest = transposed.max()
    return Series([lowest, highest], index=['min', 'max'])

frame.apply(f)

Unnamed: 0,b,d,e
min,-1.436706,-1.653963,-0.993051
max,-0.011726,1.276152,0.755504


In [129]:
frame.apply(f,axis=1)

Unnamed: 0,min,max
Utah,-0.343786,1.027151
Ohio,-1.436706,0.044513
Texas,-0.993051,1.276152
Oregon,-1.653963,-0.988156


Element-wise Python functions can be used, too. Suppose you wanted to compute a formatted string from each floating-point value in `frame`. You can do this with `applymap`:

In [130]:
format = lambda x: '%.2f' % x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-0.34,1.03,0.76
Ohio,-1.44,-0.59,0.04
Texas,-0.01,1.28,-0.99
Oregon,-1.23,-1.65,-0.99


The reason for the name applymap is that Series has a map method for applying an element-wise function:

In [131]:
type(frame['e'])

pandas.core.series.Series

In [132]:
frame['e'].map(format)

Utah       0.76
Ohio       0.04
Texas     -0.99
Oregon    -0.99
Name: e, dtype: object

### Sorting and Ranking

#### Sorting

In [133]:
obj = Series(range(4), index=list('dabc'))
obj

d    0
a    1
b    2
c    3
dtype: int64

In [134]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [135]:
dc = DF.to_dict(frame)
dc

{'b': {'Utah': -0.3437862119033086,
  'Ohio': -1.436706292259764,
  'Texas': -0.011725588927961675,
  'Oregon': -1.2290635884235306},
 'd': {'Utah': 1.0271512967167302,
  'Ohio': -0.5896438465236417,
  'Texas': 1.276151775407355,
  'Oregon': -1.6539631410963833},
 'e': {'Utah': 0.7555038155589177,
  'Ohio': 0.04451292037638918,
  'Texas': -0.9930510767155852,
  'Oregon': -0.9881562586165241}}

In [136]:
dc.keys()

dict_keys(['b', 'd', 'e'])

In [137]:
dc.values()

dict_values([{'Utah': -0.3437862119033086, 'Ohio': -1.436706292259764, 'Texas': -0.011725588927961675, 'Oregon': -1.2290635884235306}, {'Utah': 1.0271512967167302, 'Ohio': -0.5896438465236417, 'Texas': 1.276151775407355, 'Oregon': -1.6539631410963833}, {'Utah': 0.7555038155589177, 'Ohio': 0.04451292037638918, 'Texas': -0.9930510767155852, 'Oregon': -0.9881562586165241}])

In [138]:
dcs = sorted(dc.items(), key=lambda item: item[0])
dcs


[('b',
  {'Utah': -0.3437862119033086,
   'Ohio': -1.436706292259764,
   'Texas': -0.011725588927961675,
   'Oregon': -1.2290635884235306}),
 ('d',
  {'Utah': 1.0271512967167302,
   'Ohio': -0.5896438465236417,
   'Texas': 1.276151775407355,
   'Oregon': -1.6539631410963833}),
 ('e',
  {'Utah': 0.7555038155589177,
   'Ohio': 0.04451292037638918,
   'Texas': -0.9930510767155852,
   'Oregon': -0.9881562586165241})]

In [139]:
dicf = {k:v for k,v in dcs}
DF(dicf)

Unnamed: 0,b,d,e
Utah,-0.343786,1.027151,0.755504
Ohio,-1.436706,-0.589644,0.044513
Texas,-0.011726,1.276152,-0.993051
Oregon,-1.229064,-1.653963,-0.988156


In [140]:
frame = pd.DataFrame(np.random.rand(8).reshape((2, 4)),
        index=['three', 'one'],
        columns=['d', 'a', 'b', 'c'])
frame

Unnamed: 0,d,a,b,c
three,0.025634,0.939185,0.850857,0.545415
one,0.673996,0.393249,0.22508,0.051566


In [141]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,0.673996,0.393249,0.22508,0.051566
three,0.025634,0.939185,0.850857,0.545415


In [142]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,0.939185,0.850857,0.545415,0.025634
one,0.393249,0.22508,0.051566,0.673996


In [143]:
obj = Series([4, np.nan, 7, np.nan, -3, 2])
obj

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64

In [144]:
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

When sorting a DataFrame, you can use the data in one or more columns as the sort
keys. To do so, pass one or more column names to the by option of sort_values :

In [145]:
frame.sort_values(by=['b'])

Unnamed: 0,d,a,b,c
one,0.673996,0.393249,0.22508,0.051566
three,0.025634,0.939185,0.850857,0.545415


In [146]:
frame

Unnamed: 0,d,a,b,c
three,0.025634,0.939185,0.850857,0.545415
one,0.673996,0.393249,0.22508,0.051566


In [147]:
frame.sort_values(by=['one'],axis=1)

Unnamed: 0,c,b,a,d
three,0.545415,0.850857,0.939185,0.025634
one,0.051566,0.22508,0.393249,0.673996


In [148]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 2, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,2
3,2,1


In [149]:
frame.sort_values(by=['b','a'])

Unnamed: 0,b,a
2,-3,2
3,2,1
0,4,0
1,7,1


In [150]:
frame.sort_values(by=['b'])

Unnamed: 0,b,a
2,-3,2
3,2,1
0,4,0
1,7,1


#### Ranking

Ranking assigns ranks from one through the number of valid data points in an array. The rank methods for Series and DataFrame are the place to look; by default rank breaks ties by assigning each group the mean rank:

In [151]:
# importing pandas as pd 
import pandas as pd 
  
# Creating the Series 
sr = Series([10, 25, 3, 11, 24, 6]) 
  
# Create the Index 
index_ = ['Coca Cola', 'Sprite', 'Coke', 'Fanta', 'Dew', 'ThumbsUp'] 
  
# set the index 
sr.index = index_ 
  
# Print the series 
print(sr) 

Coca Cola    10
Sprite       25
Coke          3
Fanta        11
Dew          24
ThumbsUp      6
dtype: int64


In [152]:
# assign rank 
result = sr.rank() 
  
# Print the result 
print(result) 

Coca Cola    3.0
Sprite       6.0
Coke         1.0
Fanta        4.0
Dew          5.0
ThumbsUp     2.0
dtype: float64


In [153]:
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

Ranks can also be assigned according to the order in which they’re observed in the
data:

In [154]:
obj.rank(method = 'first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

Here, instead of using the average rank 6.5 for the entries 0 and 2, they instead have been set to 6 and 7 because label 0 precedes label 2 in the data.

You can rank in descending order, too:

In [155]:
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

### Axis Indexes with Duplicate Labels

While many pandas functions (like reindex ) require that the labels be
unique, it’s not mandatory.

In [156]:
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [157]:
obj.index.is_unique

False

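Data selection is one of the main things that behaves differently with duplicates. Indexing a label with multiple entries returns a Series, while single entries return a scalar value: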

In [158]:
obj['a']

a    0
a    1
dtype: int64

In [159]:
obj['c']

4

This can make your code more complicated, as the output type from indexing can
vary based on whether a label is repeated or not. Same goes for DataFrame:

In [160]:
df = DF(np.random.randn(5, 4), index=['a', 'a', 'b', 'b','c'],columns=[0,1,1,2])

In [161]:
df

Unnamed: 0,0,1,1.1,2
a,-1.081841,1.811593,-0.447942,0.410725
a,0.324588,0.570015,-0.591241,2.580508
b,0.24537,0.660358,-1.429239,-0.995751
b,0.087476,0.316758,-0.065745,-0.79816
c,-0.145225,-1.356693,0.324686,-1.35058


In [162]:
df.loc['b']

Unnamed: 0,0,1,1.1,2
b,0.24537,0.660358,-1.429239,-0.995751
b,0.087476,0.316758,-0.065745,-0.79816


In [163]:
df.iloc[:,[1,2]]

Unnamed: 0,1,1.1
a,1.811593,-0.447942
a,0.570015,-0.591241
b,0.660358,-1.429239
b,0.316758,-0.065745
c,-1.356693,0.324686


In [164]:
# The columns were created as the integers [0, 1, 1, 2], so select the
# duplicated column with the integer label 1; the string '1' does not match.
df.loc['b', 1]

Unnamed: 0,1,1.1
b,0.660358,-1.429239
b,0.316758,-0.065745


## Summarizing and Computing Descriptive Statistics
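Most of the methods below are reductions or summary statistics: they extract a single value (such as the sum or mean) from a Series, or a Series of values from the rows or columns of a DataFrame.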

In [165]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
    [np.nan, np.nan], [0.75, -1.3]],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [166]:
np.nansum(df.values)

3.45

In [167]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [168]:
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [169]:
df.mean(axis='columns')

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [170]:
df.mean(axis='columns',skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [171]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


Some methods, like idxmin and idxmax , return indirect statistics like the index value
where the minimum or maximum values are attained:

In [172]:
df.idxmax()

one    b
two    d
dtype: object

In [173]:
df.idxmax(axis=1)

a    one
b    one
c    NaN
d    one
dtype: object

Other methods are accumulations:

In [174]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [175]:
df = DF(np.random.rand(7,8),index=list('abcdefg'))

In [176]:
df

Unnamed: 0,0,1,2,3,4,5,6,7
a,0.779998,0.054951,0.005583,0.112303,0.113479,0.794088,0.756692,0.139028
b,0.532249,0.157049,0.633843,0.952071,0.624281,0.579702,0.930629,0.205672
c,0.739442,0.557506,0.058746,0.508246,0.51963,0.502399,0.810007,0.614468
d,0.06471,0.268887,0.495587,0.800836,0.650217,0.184965,0.927845,0.802452
e,0.496431,0.58393,0.92689,0.077171,0.90533,0.873562,0.900456,0.140986
f,0.33195,0.251033,0.186749,0.998874,0.636643,0.99711,0.245041,0.80066
g,0.13467,0.760422,0.772923,0.919297,0.80859,0.018345,0.452825,0.609097


Another type of method is neither a reduction nor an accumulation. describe is one
such example, producing multiple summary statistics in one shot:

In [177]:
df.T.describe()

Unnamed: 0,a,b,c,d,e,f,g
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mean,0.344515,0.576937,0.538805,0.524437,0.613094,0.556008,0.559521
std,0.360535,0.289693,0.224552,0.32182,0.350276,0.34503,0.330495
min,0.005583,0.157049,0.058746,0.06471,0.077171,0.186749,0.018345
25%,0.097965,0.450604,0.506784,0.247907,0.407569,0.249535,0.373286
50%,0.126253,0.601992,0.538568,0.572902,0.728746,0.484297,0.684759
75%,0.762519,0.708039,0.645712,0.80124,0.901674,0.849772,0.78184
max,0.794088,0.952071,0.810007,0.927845,0.92689,0.998874,0.919297


In [178]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [179]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [180]:
obj.value_counts()

a    8
c    4
b    4
dtype: int64

### Correlation and Covariance

In [181]:
#STOCKS

import datetime
def parser(x):
    """Convert a ``YYYY-MM-DD`` date string into a ``datetime.datetime``.

    Raises ``ValueError`` (from ``strptime``) when ``x`` does not match the format.
    """
    date_format = '%Y-%m-%d'
    return datetime.datetime.strptime(x, date_format)


# Load three correlated stocks: GS, JPM and MS.
# Raw CSV location of the price history for each ticker.
url1 = 'https://raw.githubusercontent.com/berserkhmdvhb/stockdata-public/master/GS.csv'
url2 = 'https://raw.githubusercontent.com/berserkhmdvhb/stockdata-public/master/JPM.csv'
url3 = 'https://raw.githubusercontent.com/berserkhmdvhb/stockdata-public/master/MS.csv'

### GS
# Column 0 is parsed as dates with the parser function.
# NOTE(review): pandas deprecated read_csv's date_parser argument in 2.0 in
# favour of date_format; confirm against the installed version.
gs = pd.read_csv(url1,engine='python', header=0, parse_dates=[0], date_parser=parser)
# df_gs is another name for gs (not a copy), so the in-place drop below also alters gs.
df_gs = gs
# Name-based alternative to the positional drop below:
#df_gs = df_gs.drop(["Open", "High", "Low", "Close", "Volume"], axis=1)
# Drop columns 1-4 and 6, keeping columns 0 and 5
# (presumably the date and an adjusted close price; TODO confirm against the CSV header).
df_gs.drop(df_gs.columns[[1,2,3,4,6]], axis=1,inplace=True)
df_gs.columns = ['Date','GS']


### JPM
# Same steps as for GS: read, alias, keep columns 0 and 5, rename.
jpm = pd.read_csv(url2,engine='python', header=0, parse_dates=[0], date_parser=parser)
df_jpm = jpm
#df_jpm = df_jpm.drop(["Open", "High", "Low", "Close", "Volume"], axis=1)
df_jpm.drop(df_jpm.columns[[1,2,3,4,6]], axis=1,inplace=True)
df_jpm.columns = ['Date','JPM']


### MS
# Same steps as for GS: read, alias, keep columns 0 and 5, rename.
ms = pd.read_csv(url3,engine='python', header=0, parse_dates=[0], date_parser=parser)
df_ms = ms
#df_ms = df_ms.drop(["Open", "High", "Low", "Close", "Volume"], axis=1)
df_ms.drop(df_ms.columns[[1,2,3,4,6]], axis=1,inplace=True)
df_ms.columns = ['Date','MS']

##Fill Missing Values
def fill_miss(dataframe, ind):
    """Return a copy of ``dataframe`` on a gap-free daily calendar, interpolated.

    The frame is reindexed to every calendar day between the smallest and
    largest value of its ``Date`` column, so days that were absent appear as
    new rows. The column(s) at position ``ind`` are then filled by
    ``interpolate()`` (linear by default).

    Parameters
    ----------
    dataframe : DataFrame
        Must contain a ``Date`` column of datetimes. Dates must be unique,
        because reindexing fails on duplicate labels. The input is not modified.
    ind : int or list of int
        Position(s) of the column(s) to interpolate, counted in the returned
        frame, where ``Date`` is column 0.

    Returns
    -------
    DataFrame
        One row per calendar day, ``Date`` as the first column, with a fresh
        integer index. Columns not listed in ``ind`` keep NaN on inserted days.
        With the default ``interpolate`` settings, values before the first
        valid observation of a column stay NaN.
    """
    dataset = dataframe.copy()
    full_range = pd.date_range(start=dataset.Date.min(), end=dataset.Date.max())
    # Insert the missing days as NaN rows, then move Date back into a column.
    dataset = (
        dataset.set_index('Date')
        .reindex(full_range)
        .rename_axis('Date')
        .reset_index()
    )
    # Interpolate only after the reindex. Values computed on the pre-reindex
    # frame are keyed by row number and cannot align with the date index, so
    # filling from them has no effect.
    dataset.iloc[:, ind] = dataset.iloc[:, ind].interpolate()
    return dataset

df_gs = fill_miss(df_gs,1)
df_jpm = fill_miss(df_jpm,1)
df_ms = fill_miss(df_ms,1)

In [182]:
df_gs.Date = pd.to_datetime(df_gs.Date)
df_jpm.Date = pd.to_datetime(df_jpm.Date)
df_ms.Date = pd.to_datetime(df_ms.Date)

#result = pd.merge([df_gs, df_jpm, df_ms])
df = pd.merge(df_gs,df_jpm, how='outer', on='Date')
df = pd.merge(df,df_ms,how='outer', on='Date')

df = fill_miss(df,[1,2,3])
# Copy the second JPM value into the first row. A single .loc assignment
# writes to df itself; the chained form df.JPM[0] = ... may write to a
# temporary object and has no effect under Copy-on-Write.
df.loc[0, 'JPM'] = df.loc[1, 'JPM']

In [183]:
df.head()

Unnamed: 0,Date,GS,JPM,MS
0,1999-05-11,55.668247,29.36916,28.974047
1,1999-05-12,57.934387,29.36916,30.698467
2,1999-05-13,57.688076,29.971283,31.039766
3,1999-05-14,55.323395,28.209566,28.776459
4,1999-05-15,54.912866,28.15753,28.770469


In [184]:
df.set_index('Date', inplace=True)
print(df.head())
returns = df.pct_change()


                   GS        JPM         MS
Date                                       
1999-05-11  55.668247  29.369160  28.974047
1999-05-12  57.934387  29.369160  30.698467
1999-05-13  57.688076  29.971283  31.039766
1999-05-14  55.323395  28.209566  28.776459
1999-05-15  54.912866  28.157530  28.770469


In [185]:
returns.head()

Unnamed: 0_level_0,GS,JPM,MS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1999-05-11,,,
1999-05-12,0.040708,0.0,0.059516
1999-05-13,-0.004252,0.020502,0.011118
1999-05-14,-0.040991,-0.05878,-0.072916
1999-05-15,-0.007421,-0.001845,-0.000208


In [186]:
# Overwrite the first row of returns (NaN after pct_change) with the second row.
returns.iloc[0] = returns.iloc[1]

In [187]:
returns.head()

Unnamed: 0_level_0,GS,JPM,MS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1999-05-11,0.040708,0.0,0.059516
1999-05-12,0.040708,0.0,0.059516
1999-05-13,-0.004252,0.020502,0.011118
1999-05-14,-0.040991,-0.05878,-0.072916
1999-05-15,-0.007421,-0.001845,-0.000208


In [188]:
returns.tail()

Unnamed: 0_level_0,GS,JPM,MS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-12-27,0.015221,0.011255,0.00916
2018-12-28,-0.014389,-0.002164,-0.007312
2018-12-29,0.008219,0.00272,0.002371
2018-12-30,0.008152,0.002712,0.002365
2018-12-31,0.008086,0.002705,0.002359


In [189]:
returns.describe()

Unnamed: 0,GS,JPM,MS
count,7175.0,7175.0,7175.0
mean,0.000311,0.000334,0.000309
std,0.017769,0.018707,0.023057
min,-0.139162,-0.181077,-0.258929
25%,-0.006479,-0.005696,-0.007142
50%,0.00025,0.0,0.0
75%,0.005886,0.005524,0.007103
max,0.201852,0.250967,0.289945


In [190]:
returns['GS'].corr(returns['JPM'])

0.6876789645205169

In [191]:
returns['GS'].cov(returns['MS'])

0.0003301406934558871

In [192]:
returns['JPM'].cov(returns['MS'])

0.00029751034897684733

In [193]:
returns.corr()

Unnamed: 0,GS,JPM,MS
GS,1.0,0.687679,0.805804
JPM,0.687679,1.0,0.689774
MS,0.805804,0.689774,1.0


In [194]:
returns.cov()

Unnamed: 0,GS,JPM,MS
GS,0.000316,0.000229,0.00033
JPM,0.000229,0.00035,0.000298
MS,0.00033,0.000298,0.000532


In [195]:
returns.corrwith(returns.GS)

GS     1.000000
JPM    0.687679
MS     0.805804
dtype: float64

### Unique Values, Value Counts, and Membership

In [196]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [201]:
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [202]:
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [203]:
pd.value_counts(obj.values, sort=False)

d    1
c    3
b    2
a    3
dtype: int64

In [197]:
obj.isin(['b', 'c'])

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

Related to isin is the Index.get_indexer method, which gives you an index array
from an array of possibly non-distinct values into another array of distinct values:

In [198]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2])

In [199]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
    'Qu2': [2, 3, 1, 2, 3],
    'Qu3': [1, 5, 2, 4, 4]})

In [200]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [206]:
## Count non-NA cells
data.count()

Qu1    5
Qu2    5
Qu3    5
dtype: int64

In [207]:
result = data.apply(pd.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


The row labels in the result are the distinct values occurring in all of the columns.
The values are the respective counts of these values in each column.