In [2]:
import numpy as np
import pandas as pd
import datetime

# Cap how many rows/columns pandas renders so displayed frames stay compact.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, 10)

In [3]:
# 5x3 frame holding 0..14, with row labels a-e and column labels c1-c3.
df = pd.DataFrame(
    data=np.arange(15).reshape(5, 3),
    index=list('abcde'),
    columns=['c1', 'c2', 'c3'],
)
df

Unnamed: 0,c1,c2,c3
a,0,1,2
b,3,4,5
c,6,7,8
d,9,10,11
e,12,13,14


In [4]:
# Introduce missing data: an all-NaN column c4, a fully populated new row 'f',
# an all-NaN new row 'g', and a second all-NaN column c5.
df['c4'] = np.nan
df.loc['f'] = np.arange(15, 19)
df.loc['g'] = np.nan
df['c5'] = np.nan
# Set the single cell with one .loc[row, column] call. The chained form
# df['c4']['a'] = 20 may assign into a temporary copy (SettingWithCopyWarning)
# and has no effect at all when Copy-on-Write is enabled.
df.loc['a', 'c4'] = 20
df



Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [5]:
# Boolean frame: True wherever a cell is missing.
pd.isnull(df)

Unnamed: 0,c1,c2,c3,c4,c5
a,False,False,False,False,True
b,False,False,False,True,True
c,False,False,False,True,True
d,False,False,False,True,True
e,False,False,False,True,True
f,False,False,False,False,True
g,True,True,True,True,True


In [6]:
# Number of missing values in each column (True counts as 1).
pd.isnull(df).sum(axis=0)

c1    1
c2    1
c3    1
c4    5
c5    7
dtype: int64

In [7]:
# Total number of missing cells: per-column counts, summed once more.
pd.isnull(df).sum(axis=0).sum()

15

In [8]:
# Number of non-missing values in each column.
df.count(axis=0)

c1    6
c2    6
c3    6
c4    2
c5    0
dtype: int64

In [9]:
# Same total as isnull().sum().sum(): row count minus non-missing count, per column.
(df.shape[0] - df.count()).sum()

15

In [11]:
# Boolean frame: True wherever a cell holds a value (inverse of isnull).
pd.notnull(df)

Unnamed: 0,c1,c2,c3,c4,c5
a,True,True,True,True,False
b,True,True,True,False,False
c,True,True,True,False,False
d,True,True,True,False,False
e,True,True,True,False,False
f,True,True,True,True,False
g,False,False,False,False,False


In [14]:
# Keep only the non-missing entries of column c4 via a boolean mask.
df.loc[df['c4'].notnull(), 'c4']

a    20
f    18
Name: c4, dtype: float64

In [15]:
# dropna() yields the same non-missing entries; it returns a new Series.
df['c4'].dropna()

a    20
f    18
Name: c4, dtype: float64

In [16]:
# The source column is unchanged: dropna() did not modify df.
df['c4']

a    20
b   NaN
c   NaN
d   NaN
e   NaN
f    18
g   NaN
Name: c4, dtype: float64

In [17]:
# Default behaviour spelled out: drop every row containing at least one NaN.
# Every row has a NaN in c5, so nothing survives.
df.dropna(axis=0, how='any')

Unnamed: 0,c1,c2,c3,c4,c5


In [19]:
# Drop a row only if ALL of its values are NaN (removes row 'g').
df.dropna(axis=0, how='all')

Unnamed: 0,c1,c2,c3,c4,c5
a,0,1,2,20.0,
b,3,4,5,,
c,6,7,8,,
d,9,10,11,,
e,12,13,14,,
f,15,16,17,18.0,


In [20]:
# Drop a column only if ALL of its values are NaN (removes column c5).
df.dropna(axis='columns', how='all')

Unnamed: 0,c1,c2,c3,c4
a,0.0,1.0,2.0,20.0
b,3.0,4.0,5.0,
c,6.0,7.0,8.0,
d,9.0,10.0,11.0,
e,12.0,13.0,14.0,
f,15.0,16.0,17.0,18.0
g,,,,


In [21]:
# Work on a copy so df keeps its all-NaN row 'g'.
df2 = df.copy()
# Fill two cells of row 'g' with one label-based assignment. The original
# df2.ix['g'].c1 = 0 used the .ix indexer (removed in pandas 1.0) and assigned
# through an intermediate row object, which is not guaranteed to reach df2.
df2.loc['g', ['c1', 'c3']] = 0
df2

Unnamed: 0,c1,c2,c3,c4,c5
a,0,1.0,2,20.0,
b,3,4.0,5,,
c,6,7.0,8,,
d,9,10.0,11,,
e,12,13.0,14,,
f,15,16.0,17,18.0,
g,0,,0,,


In [22]:
# Drop every column that contains at least one NaN; only c1 and c3 are complete.
df2.dropna(axis='columns', how='any')

Unnamed: 0,c1,c3
a,0,2
b,3,5
c,6,8
d,9,11
e,12,14
f,15,17
g,0,0


In [24]:
df.dropna(thresh=5, axis=1)  # keep only columns with at least 5 non-NaN values

Unnamed: 0,c1,c2,c3
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,9.0,10.0,11.0
e,12.0,13.0,14.0
f,15.0,16.0,17.0
g,,,


In [27]:
# Same threshold-based column drop, but applied in place to a copy so that
# df itself is left untouched.
df3 = df.copy()
df3.dropna(axis=1, thresh=5, inplace=True)  # mutates df3 directly
df3

Unnamed: 0,c1,c2,c3
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,9.0,10.0,11.0
e,12.0,13.0,14.0
f,15.0,16.0,17.0
g,,,


Difference between NaN handling in pandas and NumPy

In [29]:
# NumPy propagates NaN through mean(); pandas skips missing values by default.
a = np.array([1, 2, np.nan, 3])
s = pd.Series(a)
(np.mean(a), s.mean(skipna=True))

(nan, 2.0)

In [32]:
# Reductions on a Series ignore NaN: only the two real values are summed.
s = df['c4']
s.sum()

38.0

In [33]:
# Mean over the non-missing values only (skipna is the default).
s.mean(skipna=True)

19.0

In [34]:
# Running total: NaN positions stay NaN but do not reset the accumulation.
s.cumsum(skipna=True)

a    20
b   NaN
c   NaN
d   NaN
e   NaN
f    38
g   NaN
Name: c4, dtype: float64

In [35]:
# Element-wise arithmetic keeps NaN as NaN.
df['c4'].add(1)

a    21
b   NaN
c   NaN
d   NaN
e   NaN
f    19
g   NaN
Name: c4, dtype: float64

In [36]:
# The arithmetic above returned a new Series; the column itself is unchanged.
df['c4']

a    20
b   NaN
c   NaN
d   NaN
e   NaN
f    18
g   NaN
Name: c4, dtype: float64

### Filling in missing data

In [38]:
# Replace every NaN with 0 in a new frame; df is not modified.
filled = df.fillna(value=0)
filled

Unnamed: 0,c1,c2,c3,c4,c5
a,0,1,2,20,0
b,3,4,5,0,0
c,6,7,8,0,0
d,9,10,11,0,0
e,12,13,14,0,0
f,15,16,17,18,0
g,0,0,0,0,0


In [39]:
# Column means computed over the non-missing values only.
df.mean(axis=0)

c1     7.5
c2     8.5
c3     9.5
c4    19.0
c5     NaN
dtype: float64

In [40]:
# After filling with 0 the zeros count as data, so the column means shift.
filled.mean(axis=0)

c1    6.428571
c2    7.285714
c3    8.142857
c4    5.428571
c5    0.000000
dtype: float64

In [41]:
# Fill at most the first 2 NaN values in each column with 0.
df.fillna(value=0, limit=2)

Unnamed: 0,c1,c2,c3,c4,c5
a,0,1,2,20.0,0.0
b,3,4,5,0.0,0.0
c,6,7,8,0.0,
d,9,10,11,,
e,12,13,14,,
f,15,16,17,18.0,
g,0,0,0,,


In [43]:
# Forward fill: carry the last seen value down into following NaNs.
# Uses Series.ffill() because fillna(method=...) is deprecated in recent pandas.
df['c4'].ffill()

a    20
b    20
c    20
d    20
e    20
f    18
g    18
Name: c4, dtype: float64

In [44]:
# Backward fill: pull the next valid value up into preceding NaNs; the trailing
# NaN has nothing after it and stays missing.
# Uses Series.bfill() because fillna(method=...) is deprecated in recent pandas.
df['c4'].bfill()

a    20
b    18
c    18
d    18
e    18
f    18
g   NaN
Name: c4, dtype: float64

In [47]:
# Backward fill each column of the whole frame (down the rows).
df.bfill(axis=0)

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,18.0,
c,6.0,7.0,8.0,18.0,
d,9.0,10.0,11.0,18.0,
e,12.0,13.0,14.0,18.0,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [49]:
# Replacement values keyed by row label; used to fill NaNs by index alignment.
fill_values = pd.Series({'a': 100, 'e': 101, 'g': 102})
fill_values

a    100
e    101
g    102
dtype: int64

In [51]:
# Values are matched by index label and only fill NaN cells:
# 'a' already holds 20, so it is left alone.
df['c4'].fillna(value=fill_values)

a     20
b    NaN
c    NaN
d    NaN
e    101
f     18
g    102
Name: c4, dtype: float64

In [52]:
# Fill each column's NaNs with that column's mean; c5 has no mean, so it stays NaN.
df.fillna(value=df.mean(axis=0))

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20,
b,3.0,4.0,5.0,19,
c,6.0,7.0,8.0,19,
d,9.0,10.0,11.0,19,
e,12.0,13.0,14.0,19,
f,15.0,16.0,17.0,18,
g,7.5,8.5,9.5,19,


In [54]:
# Three consecutive gaps between 1 and 2; linear interpolation spaces them evenly.
s = pd.Series([1] + [np.nan] * 3 + [2])
s.interpolate(method='linear')

0    1.00
1    1.25
2    1.50
3    1.75
4    2.00
dtype: float64

Time-based interpolation

In [57]:
# Unevenly spaced dates with a gap in the middle observation.
# NOTE(review): the year 2104 looks like a typo for 2014; it is kept as-is
# because the recorded outputs below were computed with it.
ts = pd.Series(
    [1, np.nan, 2],
    index=[datetime.datetime(2104, month, 1) for month in (1, 2, 4)],
)
ts

2104-01-01     1
2104-02-01   NaN
2104-04-01     2
dtype: float64

In [58]:
# Default (linear) interpolation ignores the index and treats points as evenly spaced.
ts.interpolate(method='linear')

2104-01-01    1.0
2104-02-01    1.5
2104-04-01    2.0
dtype: float64

In [59]:
# method="time" weights the estimate by the elapsed time between index dates.
ts.interpolate(method="time")

2104-01-01    1.000000
2104-02-01    1.340659
2104-04-01    2.000000
dtype: float64

In [61]:
# Integer index with uneven spacing: the gap sits at label 1, between labels 0 and 10.
s = pd.Series(data=[0, np.nan, 100], index=[0, 1, 10])
s

0       0
1     NaN
10    100
dtype: float64

In [62]:
# Linear interpolation ignores index spacing, so the gap becomes the midpoint.
s.interpolate(method='linear')

0       0
1      50
10    100
dtype: float64

In [63]:
# method="values" interpolates relative to the numeric index labels (0, 1, 10).
s.interpolate(method="values")

0       0
1      10
10    100
dtype: float64

## Handling duplicate data

In [65]:
# Small frame with repeated (a, b) pairs for the duplicate-handling examples.
data = pd.DataFrame({
    'a': list('xxxyyyy'),
    'b': [1, 1, 2, 3, 3, 4, 4],
})
data

Unnamed: 0,a,b
0,x,1
1,x,1
2,x,2
3,y,3
4,y,3
5,y,4
6,y,4


In [66]:
# Flag rows that repeat an earlier row; the first occurrence is not flagged.
data.duplicated(keep='first')

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [67]:
# Remove repeated rows, keeping the first occurrence of each.
data.drop_duplicates(keep='first')

Unnamed: 0,a,b
0,x,1
2,x,2
3,y,3
5,y,4


In [72]:
# Keep the LAST occurrence of each repeated row instead of the first.
# (keep='last' is the current spelling of the old take_last=True keyword.)
data.loc[~data.duplicated(keep='last')]

Unnamed: 0,a,b
1,x,1
2,x,2
4,y,3
6,y,4


In [73]:
# A column of unique values makes every full row distinct, so nothing is flagged.
data['c'] = list(range(7))
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
dtype: bool

In [75]:
# Show the frame including the newly added column 'c'.
data

Unnamed: 0,a,b,c
0,x,1,0
1,x,1,1
2,x,2,2
3,y,3,3
4,y,3,4
5,y,4,5
6,y,4,6


In [76]:
# Judge duplicates on columns a and b only, ignoring the unique column c.
data.drop_duplicates(subset=['a', 'b'])

Unnamed: 0,a,b,c
0,x,1,0
2,x,2,2
3,y,3,3
5,y,4,5


# Transforming Data

In [77]:
# x maps names to numbers; y maps those numbers to letters (used as a lookup below).
x = pd.Series(dict(one=1, two=2, three=3))
y = pd.Series(dict(zip((1, 2, 3), 'abc')))
x

one      1
three    3
two      2
dtype: int64

In [78]:
# Show the lookup Series (number -> letter).
y

1    a
2    b
3    c
dtype: object

In [79]:
# Each value of x is looked up in y's index and replaced by the matching value.
x.map(y)

one      a
three    c
two      b
dtype: object

In [80]:
# With no entry for 3 in the lookup, the unmatched value maps to NaN.
x = pd.Series(dict(one=1, two=2, three=3))
y = pd.Series(dict(zip((1, 2), 'ab')))
x.map(y)

one        a
three    NaN
two        b
dtype: object

In [81]:
# Float Series containing a repeated value (2) for the replace() examples.
s = pd.Series([0, 1, 2, 3, 2, 4], dtype='float64')
s

0    0
1    1
2    2
3    3
4    2
5    4
dtype: float64

In [82]:
# Replace every occurrence of 2 with 5.
s.replace(to_replace=2, value=5)

0    0
1    1
2    5
3    3
4    5
5    4
dtype: float64

In [83]:
# Pairwise replacement: each value in the first list becomes the value at the
# same position in the second list.
s.replace(to_replace=[0, 1, 2, 3, 4], value=[4, 3, 2, 1, 0])

0    4
1    3
2    2
3    1
4    2
5    0
dtype: float64

In [84]:
# Dict form: keys are the values to find, dict values are their replacements.
s.replace({0: 10, 1: 100})

0     10
1    100
2      2
3      3
4      2
5      4
dtype: float64

In [85]:
# Two integer columns: a holds 0..4 and b holds 5..9.
df = pd.DataFrame({'a': list(range(5)), 'b': list(range(5, 10))})
df

Unnamed: 0,a,b
0,0,5
1,1,6
2,2,7
3,3,8
4,4,9


In [88]:
# Replace specific values in given columns: 1 in column a and 8 in column b become 100.
df.replace(to_replace={'a': 1, 'b': 8}, value=100)

Unnamed: 0,a,b
0,0,5
1,100,6
2,2,7
3,3,100
4,4,9


In [89]:
# Overwrite the entry labelled 0 so the padding example below has a distinct seed value.
s.loc[0] = 10
s

0    10
1     1
2     2
3     3
4     2
5     4
dtype: float64

In [90]:
# Replace 1, 2 and 3 with the most recent preceding value that is not one of them.
# Written as mask + ffill because replace(method='pad') is deprecated in recent
# pandas. s has no NaN of its own here, so the forward fill only touches the
# masked positions.
s.mask(s.isin([1, 2, 3])).ffill()

0    10
1    10
2    10
3    10
4    10
5     4
dtype: float64

Applying functions to transform data

In [91]:
# apply() calls the function once per element of the Series.
s = pd.Series(np.arange(5))
s.apply(lambda value: 2 * value)

0    0
1    2
2    4
3    6
4    8
dtype: int64

In [92]:
# 4x3 frame holding 0..11 with columns a, b, c.
df = pd.DataFrame(data=np.arange(12).reshape(4, 3), columns=list('abc'))
df

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [93]:
# With the default axis the function receives one column at a time.
df.apply(lambda column: column.sum(), axis=0)

a    18
b    22
c    26
dtype: int64

In [94]:
# With axis=1 ('columns') the function receives one row at a time.
df.apply(lambda record: record.sum(), axis='columns')

0     3
1    12
2    21
3    30
dtype: int64

In [96]:
# Row-wise product of columns a and b, stored as a new column.
df['interim'] = df.apply(lambda row: row['a'] * row['b'], axis=1)
df

Unnamed: 0,a,b,c,interim
0,0,1,2,0
1,3,4,5,12
2,6,7,8,42
3,9,10,11,90


In [98]:
# Build on the interim column: add column c row by row.
df['result'] = df.apply(lambda row: row['interim'] + row['c'], axis=1)
df

Unnamed: 0,a,b,c,interim,result
0,0,1,2,0,2
1,3,4,5,12,17
2,6,7,8,42,50
3,9,10,11,90,101


In [99]:
# 3x5 integer frame with a single missing cell at row 1, column 2.
df = pd.DataFrame(np.arange(0, 15).reshape(3, 5))
# Make column 2 float before inserting NaN: writing NaN into an int64 column
# relies on silent upcasting, which recent pandas deprecates. The other
# columns stay integer, as before.
df[2] = df[2].astype('float64')
df.loc[1, 2] = np.nan
df

Unnamed: 0,0,1,2,3,4
0,0,1,2.0,3,4
1,5,6,,8,9
2,10,11,12.0,13,14


In [100]:
# Discard rows containing NaN first, then sum each remaining row.
df.dropna(how='any').apply(lambda row: row.sum(), axis=1)

0    10
2    60
dtype: float64

In [101]:
# Format every cell as a string with two decimals using %-formatting.
# Mapping each column with Series.map avoids DataFrame.applymap, which is
# deprecated in recent pandas (its replacement there is DataFrame.map).
df.apply(lambda column: column.map(lambda x: '%.2f' % x))

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0


In [105]:
# Same element-wise formatting using str.format; in '{0:.2f}' the 'f' selects
# fixed-point notation and '.2' the number of decimals.
# Mapping each column with Series.map avoids DataFrame.applymap, which is
# deprecated in recent pandas (its replacement there is DataFrame.map).
df.apply(lambda column: column.map(lambda x: '{0:.2f}'.format(x)))

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
