## Learning Pandas

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# A Series built from a plain list; the np.nan entry marks a missing value.
values = [1, 3, 5, np.nan, 6, 8]
s = pd.Series(values)
s

0     1
1     3
2     5
3   NaN
4     6
5     8
dtype: float64

In [4]:
# Fixed seed so the demo matrix is identical on every Restart & Run All.
# RandomState (rather than the global np.random state) keeps the draw
# independent of whatever other cells have consumed random numbers.
SEED = 0
rng = np.random.RandomState(SEED)
# 6 rows x 4 columns of samples from the standard normal distribution.
A = rng.randn(6, 4)

In [5]:
# Wrap the matrix A in a labelled table: one name per matrix column and
# one letter label per matrix row. The frame is only displayed, not stored.
column_labels = ["age", "height", "weight", "color"]
row_labels = ["a", "b", "c", "d", "e", "f"]
pd.DataFrame(A, columns=column_labels, index=row_labels)

Unnamed: 0,age,height,weight,color
a,1.352097,-1.211018,-0.432757,1.10222
b,-0.201758,-1.2795,1.386882,2.510658
c,1.426105,-0.400167,0.596179,0.519832
d,0.720333,0.848322,0.250955,0.064121
e,-0.088224,-0.862046,0.16366,1.038924
f,-0.846496,-1.487301,0.899628,-1.196989


In [6]:
# Build a demo frame from a dict of columns. Scalars ('A', 'B', 'F') are
# broadcast to the length of the array-like columns (4 rows), and each
# column keeps its own dtype.
df2_columns = {
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
}
df2 = pd.DataFrame(df2_columns)

df2

Unnamed: 0,A,B,C,D,E,F
0,1,2013-01-02,1,3,test,foo
1,1,2013-01-02,1,3,train,foo
2,1,2013-01-02,1,3,test,foo
3,1,2013-01-02,1,3,train,foo


In [7]:
# dtype of each column of df2, indexed by column label.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E            object
F            object
dtype: object

In [8]:
# Column labels of df2, returned as an Index object.
df2.columns

Index([u'A', u'B', u'C', u'D', u'E', u'F'], dtype='object')

In [9]:
# Walk the column labels and report each column's dtype next to its name.
for label in df2.columns:
    column = df2[label]
    print(label, column.dtype)

('A', dtype('float64'))
('B', dtype('<M8[ns]'))
('C', dtype('float32'))
('D', dtype('int32'))
('E', dtype('O'))
('F', dtype('O'))


In [10]:
# Print every row label of df2, one per line.
row_labels = df2.index
for label in row_labels:
    print(label)

0
1
2
3


In [11]:
# Select the single column "A"; the result is a Series named A.
df2["A"]

0    1
1    1
2    1
3    1
Name: A, dtype: float64

In [12]:
# Transpose: column labels become the row index and row labels become columns.
df2.T

Unnamed: 0,0,1,2,3
A,1,1,1,1
B,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00
C,1,1,1,1
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo


In [13]:
# Add a new column "T" to df2 in place; its values mix integers and strings.
df2["T"] = [1,2,"bob","joe"]

In [14]:
# Transpose again: the column "T" now shows up as an extra row.
df2.T

Unnamed: 0,0,1,2,3
A,1,1,1,1
B,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00
C,1,1,1,1
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo
T,1,2,bob,joe


In [15]:
# Column-wise maximum: axis=0 reduces over the rows, giving one value per column.
# NOTE(review): column T mixes ints and strings and E/F hold strings; the
# recorded result relies on those values being comparable with each other --
# confirm this still works on the Python/pandas version you run.
df2.max(axis=0)

A                      1
B    2013-01-02 00:00:00
C                      1
D                      3
E                  train
F                    foo
T                    joe
dtype: object

In [16]:
# Row-wise maximum: axis=1 reduces across the columns of each row.
# numeric_only=True states explicitly that only the numeric columns take
# part, instead of relying on pandas silently skipping the date and string
# columns (newer pandas releases default to numeric_only=False and fail on
# the mixed-type rows).
df2.max(axis=1, numeric_only=True)

0    3
1    3
2    3
3    3
dtype: float64

In [17]:
# Column-wise mean: axis=0 reduces over the rows, giving one value per column.
# numeric_only=True states explicitly that only the numeric columns are
# averaged, instead of relying on pandas silently dropping the date and
# string columns (newer pandas releases default to numeric_only=False and
# raise on them).
df2.mean(axis=0, numeric_only=True)

A    1
C    1
D    3
dtype: float64

In [18]:
# Row-wise mean: axis=1 averages across the columns of each row.
# numeric_only=True states explicitly that only the numeric columns are
# averaged, instead of relying on pandas silently dropping the date and
# string columns (newer pandas releases default to numeric_only=False and
# raise on them).
df2.mean(axis=1, numeric_only=True)

0    1.666667
1    1.666667
2    1.666667
3    1.666667
dtype: float64

In [19]:
# head(n) returns the first n rows; here only the first row.
df2.head(1)

Unnamed: 0,A,B,C,D,E,F,T
0,1,2013-01-02,1,3,test,foo,1


In [20]:
# tail(n) returns the last n rows; here the last two.
df2.tail(2)

Unnamed: 0,A,B,C,D,E,F,T
2,1,2013-01-02,1,3,test,foo,bob
3,1,2013-01-02,1,3,train,foo,joe


In [21]:
# Label-based lookup: the row labelled 0 with all columns (":"), as a Series.
df2.loc[0, :]

A                      1
B    2013-01-02 00:00:00
C                      1
D                      3
E                   test
F                    foo
T                      1
Name: 0, dtype: object

In [22]:
# Label-based lookup: the row labelled 1 with all columns (":"), as a Series.
df2.loc[1, :]

A                      1
B    2013-01-02 00:00:00
C                      1
D                      3
E                  train
F                    foo
T                      2
Name: 1, dtype: object

In [23]:
# All rows (":") of the single column "D", returned as a Series.
df2.loc[:,"D"]

0    3
1    3
2    3
3    3
Name: D, dtype: int32

In [24]:
# Lists on both axes select a sub-frame: rows labelled 0 and 1, columns C and D.
df2.loc[[0,1], ["C","D"]]

Unnamed: 0,C,D
0,1,3
1,1,3


In [25]:
# All rows (":") of the single column "E", returned as a Series.
df2.loc[:,"E"]

0     test
1    train
2     test
3    train
Name: E, dtype: object

In [26]:
# Element-wise comparison: a boolean Series that is True where E equals "test".
df2.loc[:,"E"] == "test"

0     True
1    False
2     True
3    False
Name: E, dtype: bool

In [27]:
# Boolean-mask row selection: keep only the rows whose E value is "test".
is_test = df2.loc[:, "E"] == "test"
df2.loc[is_test, :]

Unnamed: 0,A,B,C,D,E,F,T
0,1,2013-01-02,1,3,test,foo,1
2,1,2013-01-02,1,3,test,foo,bob


In [28]:
# Combine two row filters with & (element-wise AND): keep the rows where
# E equals "test" and T equals "bob".
is_test = df2.loc[:, "E"] == "test"
is_bob = df2.loc[:, "T"] == "bob"
df2.loc[is_test & is_bob, :]

Unnamed: 0,A,B,C,D,E,F,T
2,1,2013-01-02,1,3,test,foo,bob


## Friday's Material

In [29]:
# Iterating over a groupby yields (group key, sub-frame) pairs: one pair
# per distinct value of column E, holding the rows that share that value.
groups_by_e = df2.groupby("E")
for key, rows in groups_by_e:
    header = "Variable value:" + key
    print(header)
    print("Data:")
    print(rows)
    print("")

Variable value:test
Data:
   A          B  C  D     E    F    T
0  1 2013-01-02  1  3  test  foo    1
2  1 2013-01-02  1  3  test  foo  bob

Variable value:train
Data:
   A          B  C  D      E    F    T
1  1 2013-01-02  1  3  train  foo    2
3  1 2013-01-02  1  3  train  foo  joe



In [30]:
# Mean of each column over all rows (the default axis is the row axis).
# numeric_only=True states explicitly that only the numeric columns are
# averaged, instead of relying on pandas silently dropping the date and
# string columns (newer pandas releases default to numeric_only=False and
# raise on them).
df2.mean(numeric_only=True)

A    1
C    1
D    3
dtype: float64

In [32]:
# Group the rows by the value of E, then count the entries in every other
# column within each group; the group keys become the index of the result.
df2.groupby("E").count()

Unnamed: 0_level_0,A,B,C,D,F,T
E,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
test,2,2,2,2,2,2
train,2,2,2,2,2,2
