In [1]:
# %load rapid_imports.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

%matplotlib inline

## DataFrames

In [2]:
# Column-oriented sample records: every key names a column and maps to
# that column's six values.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)

In [12]:
# `data` is already a dict of equal-length columns, so it can be handed to
# the constructor directly; there is no need to zip it into per-row dicts.
# `columns` fixes the column order and `index` supplies the row labels.
# NOTE(review): the label 'two' appears twice in the index; confirm the
# duplicate is intentional (the recorded outputs below rely on it).
df = pd.DataFrame(
    data,
    columns=['state', 'year', 'pop'],
    index=['one', 'two', 'two', 'four', 'five', 'six'],
)

In [13]:
# Pull out a single column as a Series (every row, column 'state').
df.loc[:, 'state']

one       Ohio
two       Ohio
two       Ohio
four    Nevada
five    Nevada
six     Nevada
Name: state, dtype: object

In [15]:
# Cross-section by row label: the single row labelled 'four' comes back
# as a Series whose index is the column names.
df.xs('four')

state    Nevada
year       2001
pop         2.4
Name: four, dtype: object

In [20]:
# Values for a new column, keyed by a subset of df's row labels.
val = pd.Series(
    data=[-1.2, -1.5, -1.7],
    index=['two', 'four', 'five'],
)

In [21]:
# Assigning a Series as a column aligns it on the row labels: in the
# recorded output both rows labelled 'two' receive -1.2, and the labels
# missing from `val` ('one', 'six') are left empty.
# This mutates `df` in place.
df['newln'] = val
df

Unnamed: 0,state,year,pop,newln
one,Ohio,2000,1.5,
two,Ohio,2001,1.7,-1.2
two,Ohio,2002,3.6,-1.2
four,Nevada,2001,2.4,-1.5
five,Nevada,2002,2.9,-1.7
six,Nevada,2003,3.2,


In [22]:
# Remove the demonstration column again; `del` drops it from `df` in place,
# so the frame shown below is back to its three original columns.
del df['newln']
df

Unnamed: 0,state,year,pop
one,Ohio,2000,1.5
two,Ohio,2001,1.7
two,Ohio,2002,3.6
four,Nevada,2001,2.4
five,Nevada,2002,2.9
six,Nevada,2003,3.2


## Series

In [9]:
# A small integer Series with explicit string labels.
obj = pd.Series(data=[5, -6, 7], index=list('opq'))
obj

o    5
p   -6
q    7
dtype: int64

In [10]:
# The Index object that holds the Series' labels.
obj.index

Index(['o', 'p', 'q'], dtype='object')

In [11]:
# Membership on a Series tests the labels, not the values; spelling out
# `.index` makes that explicit.
'o' in obj.index

True

### Create from dict

In [16]:
# Build a Series from a dict, then select and order the labels we want.
# 'b' is dropped, and 'x' has no entry in the dict so it becomes NaN,
# which is why the result is float64.
d = {'d': 4, 'a': 1, 'b': 2}
wanted_labels = ['a', 'd', 'x']
s = pd.Series(d).reindex(wanted_labels)
s

a    1.0
d    4.0
x    NaN
dtype: float64

In [18]:
# Element-wise mask: True where a value is present, False where it is NaN.
s.notna()

a     True
d     True
x    False
dtype: bool

In [20]:
# Element-wise missing-value mask: True only where `s` holds NaN.
s.isna()

a    False
d    False
x     True
dtype: bool

In [26]:
# Name the index axis itself (this is separate from the Series' own `name`).
s.index.name = 'lbl'

In [28]:
# Replace every label at once; the new labels are matched to the existing
# values by position. The recorded output below no longer shows the 'lbl'
# index name, because the index object has been replaced.
s.index = ['one', 'two', 'three']

In [29]:
# Display the relabelled Series.
# NOTE(review): the recorded output shows `Name: tst`, but no visible cell
# assigns `s.name`. It is presumably left over from a removed cell; confirm
# with Restart & Run All.
s

one      1.0
two      4.0
three    NaN
Name: tst, dtype: float64

In [2]:
# Integer Series 0..2 whose labels only partly overlap the columns of `frame`.
series2 = pd.Series(data=range(3), index=list('bef'))

In [3]:
# Display series2 (labels 'b', 'e', 'f').
series2

b    0
e    1
f    2
dtype: int64

In [4]:
# 4x3 float frame holding 0.0 .. 11.0 row by row, labelled by state name.
frame = pd.DataFrame(
    data=np.arange(12, dtype=float).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)

In [5]:
# Display the frame (columns 'b', 'd', 'e').
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [22]:
# Broadcast series2 across every row, matching its labels to the frame's
# columns; labels present on only one side produce NaN columns.
frame2 = frame.add(series2, axis='columns')

In [26]:
# Label-based slicing on both axes: rows 'Utah' through 'Texas' and columns
# from 'd' onward. The recorded output includes 'Texas', so the end label is
# part of the slice. Columns 'd' and 'f' are all-NaN there: 'd' has no entry
# in series2 and 'f' is not a column of frame.
frame2.loc['Utah':'Texas', 'd':]

Unnamed: 0,d,e,f
Utah,,3.0,
Ohio,,6.0,
Texas,,9.0,


In [18]:
# Subtract the 'Ohio' row from every row, matching on column labels.
frame.sub(frame.loc['Ohio'], axis='columns')

Unnamed: 0,b,d,e
Utah,-3.0,-3.0,-3.0
Ohio,0.0,0.0,0.0
Texas,3.0,3.0,3.0
Oregon,6.0,6.0,6.0


In [21]:
# Select column 'b' (all rows) as a Series.
frame.loc[:, 'b']

Utah      0.0
Ohio      3.0
Texas     6.0
Oregon    9.0
Name: b, dtype: float64

In [27]:
# 3x4 frame of standard-normal draws used by the apply/map examples below.
# A dedicated, seeded RandomState makes the values identical on every
# Restart & Run All without touching NumPy's global random state.
# NOTE: the outputs recorded in this notebook came from an earlier unseeded
# run, so their numbers will differ until the cells are re-executed.
fr = pd.DataFrame(
    np.random.RandomState(42).randn(12).reshape(3, 4),
    index=list('abc'),
    columns=list('qwer'),
)
fr

Unnamed: 0,q,w,e,r
a,-0.94734,-0.738383,0.643218,0.5436
b,0.642427,-0.188975,-0.412392,0.88256
c,-0.352883,0.344089,-0.854805,0.879818


In [29]:
# Column-wise apply: each column arrives as a Series indexed by row label,
# so the result is (row 'a' - row 'c') for every column.
fr.apply(lambda col: col['a'] - col['c'], axis=0)

q   -0.594457
w   -1.082472
e    1.498023
r   -0.336218
dtype: float64

In [34]:
# Row-wise apply: each row arrives as a Series indexed by column name.
# The result is how far column 'e' sits below that row's maximum.
fr.apply(lambda row: row.max() - row['e'], axis=1)

a    0.000000
b    1.294952
c    1.734622
dtype: float64

In [56]:
# Row-wise apply returning a Series per row, which expands into a new
# two-column frame:
#   column 0 - 'e' as a dollar amount with the minus sign before the '$'
#   column 1 - 'q' rendered with three decimals and a trailing '%'
fr.apply(
    lambda row: pd.Series([
        '%s$%.2f' % ('-' if row['e'] < 0 else '', abs(row['e'])),
        '%.3f%%' % row['q'],
    ]),
    axis=1,
)

Unnamed: 0,0,1
a,$0.64,-0.947%
b,-$0.41,0.642%
c,-$0.85,-0.353%


In [47]:
# Column-wise apply returning a labelled Series per column; the labels
# become the rows of the result. 'mm' is the column mean pre-formatted as
# a two-decimal string, so each result column mixes floats and strings.
fr.apply(
    lambda col: pd.Series(
        data=[col.min(), col.max(), '{:.2f}'.format(col.mean())],
        index=['min', 'max', 'mm'],
    )
)

Unnamed: 0,q,w,e,r
min,-0.94734,-0.738383,-0.854805,0.5436
max,0.642427,0.344089,0.643218,0.88256
mm,-0.22,-0.19,-0.21,0.77


In [63]:
# Series.map runs the function once per element: square every value in 'w'.
fr.loc[:, 'w'].map(lambda w: w ** 2)

a    0.545209
b    0.035711
c    0.118398
Name: w, dtype: float64

In [52]:
# Same column-wise pattern as the min/max summary, but the first entry is
# row 'a' rendered as a dollar string with four decimals.
fr.apply(
    lambda col: pd.Series(
        data=['${:.4f}'.format(col['a']), col.max(), '{:.2f}'.format(col.mean())],
        index=['a', 'max', 'mm'],
    )
)

Unnamed: 0,q,w,e,r
a,$-0.9473,$-0.7384,$0.6432,$0.5436
max,0.642427,0.344089,0.643218,0.88256
mm,-0.22,-0.19,-0.21,0.77


In [44]:
# Quick check of the two-decimal format spec on a plain Python float.
'{:.2f}'.format(2.55)

'2.55'

In [46]:
# Column means rendered as two-decimal strings; the bound `format` method
# is used directly as the per-element function.
fr.mean().map('{:.2f}'.format)

q    -0.22
w    -0.19
e    -0.21
r     0.77
dtype: object

In [48]:
def fmt(x):
    """Return ``x`` formatted as a string with exactly two decimal places.

    A named ``def`` is used instead of binding a lambda to a name so the
    function shows up as ``fmt`` in tracebacks and can carry this docstring.
    """
    return '{:.2f}'.format(x)

In [49]:
# Format every cell of the frame with `fmt`.
# DataFrame.applymap is deprecated in recent pandas releases, and its
# replacement (DataFrame.map) does not exist in older ones. Mapping each
# column with Series.map gives the same element-wise result on any version.
fr.apply(lambda col: col.map(fmt))

Unnamed: 0,q,w,e,r
a,-0.95,-0.74,0.64,0.54
b,0.64,-0.19,-0.41,0.88
c,-0.35,0.34,-0.85,0.88


## Value counts

In [2]:
# One-letter categories with repeats, used to demonstrate value_counts.
obj = pd.Series(list('cadaabbcc'))

In [3]:
# Tally how often each distinct value occurs; in the recorded output the
# most frequent values are listed first.
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [15]:
# Answers to three questions (one column each) from five respondents.
data = pd.DataFrame(
    dict(
        Qu1=[1, 3, 4, 3, 4],
        Qu2=[2, 3, 1, 2, 3],
        Qu3=[1, 5, 2, 4, 4],
    )
)
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [13]:
# Count the occurrences of each answer per question. The row labels of the
# result are the distinct answers across all columns; an answer that never
# occurs in a column shows up as NaN there.
# The top-level `pd.value_counts` helper is deprecated in recent pandas;
# the Series method is the supported spelling and works on older versions too.
data.apply(pd.Series.value_counts)

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0
