### Pandas

In [92]:
import numpy as np
import pandas as pd

In [93]:
# sample inputs reused below: index labels, a plain list, the same values as an ndarray,
# and a label -> value dict
labels = list('abc')
my_data = [10, 20, 30]
arr = np.array(my_data)
d = dict(zip(labels, my_data))

In [94]:
d

{'a': 10, 'b': 20, 'c': 30}

### Series

In [95]:
pd.Series(data = my_data)

0    10
1    20
2    30
dtype: int64

In [96]:
pd.Series(data = my_data, index = labels)

a    10
b    20
c    30
dtype: int64

In [97]:
#same as the cell above, but passing data and index positionally

pd.Series(my_data, labels)

a    10
b    20
c    30
dtype: int64

In [98]:
pd.Series(arr, labels)

a    10
b    20
c    30
dtype: int32

In [99]:
#a dictionary converts directly to a pd.Series: keys become the index, values the data

pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [100]:
d

{'a': 10, 'b': 20, 'c': 30}

In [101]:
pd.Series(data=labels)

0    a
1    b
2    c
dtype: object

In [102]:
# integer Series labelled by country name
ser1 = pd.Series(
    data=[1, 2, 5, 4],
    index=['USA', 'Germany', 'USSR', 'France'],
)
ser1

USA        1
Germany    2
USSR       5
France     4
dtype: int64

In [103]:
ser1['USA']

1

In [104]:
# second Series: same values, but only 'USA' and 'France' are shared with ser1's labels
ser2 = pd.Series(
    data=[1, 2, 5, 4],
    index=['USA', 'Japan', 'Aus', 'France'],
)

In [105]:
# addition aligns on index labels: a label present in only one Series gives NaN,
# and the result is float64 even though both inputs hold integers
ser1 + ser2

Aus        NaN
France     8.0
Germany    NaN
Japan      NaN
USA        2.0
USSR       NaN
dtype: float64

### DataFrames

In [106]:
from numpy.random import randn

In [107]:
np.random.seed(101)

In [108]:
# 5x4 frame of random draws: rows labelled A-E, columns labelled W-Z
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)

In [109]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [110]:
#grab one column

df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [111]:
type(df['W'])

pandas.core.series.Series

In [112]:
type(df)

pandas.core.frame.DataFrame

In [113]:
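#grab one column via attribute access - not recommended: prefer df['W'], because
#attribute access fails when a column name clashes with a DataFrame method or is not a valid identifier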

df.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [114]:
#grab two columns

df[['W','X']]

Unnamed: 0,W,X
A,2.70685,0.628133
B,0.651118,-0.319318
C,-2.018168,0.740122
D,0.188695,-0.758872
E,0.190794,1.978757


In [115]:
#define a new column

df['new'] = df['W'] + df['Y']

In [116]:
#drop a column - this fails because no axis is given: drop() then looks for 'new'
#among the row labels; the KeyError is caught and printed so the demonstration
#does not stop a top-to-bottom run of the notebook

try:
    df.drop('new')
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: "['new'] not found in axis"

In [117]:
#try again to drop a column, this time naming the columns axis explicitly;
#reassigning (instead of inplace=True) and ignoring an already-missing label
#keeps this cell safe to re-run

df = df.drop(columns='new', errors='ignore')

In [118]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [119]:
#drop a row by its index label (returns a new frame; df itself is not modified)

df.drop(index='E')

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [120]:
#df.shape is a (rows, columns) tuple
#here df has 5 rows and 4 columns

df.shape

(5, 4)

In [121]:
#grab a row using loc (label-based location)

df.loc['D']

W    0.188695
X   -0.758872
Y   -0.933237
Z    0.955057
Name: D, dtype: float64

In [122]:
#grab a row using iloc (integer-position-based location)

df.iloc[0]

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [123]:
#select subset of dataframe

df.loc['B','Y']

-0.8480769834036315

In [124]:
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


In [125]:
#Conditional selection with DataFrames

df[df > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [126]:
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [127]:
df[df['Z']<0]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [128]:
# step by step: build the row mask, filter the rows, then pick the columns of interest
boolser = df['W'].gt(0)
result = df.loc[boolser]
col = ['Y', 'X']
result.loc[:, col]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


#### Multiple conditions

In [129]:
#this raises an error: Python's `and` needs a single True/False from each operand,
#which a whole boolean Series cannot provide; the ValueError is caught and printed
#so the demonstration does not stop a top-to-bottom run of the notebook

try:
    df[(df['W']>0) and (df['Y'] >1)]
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

#### and

In [130]:
# Combine multiple conditions element-wise with & instead of `and`, wrapping each condition in parentheses

df[(df['W'] > 0) & (df['Y'] < 1)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057


#### or

In [131]:
#use | instead of `or`

df[(df['W'] > 0) | (df['Y'] <1)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


#### Reset index

In [143]:
#the old index becomes a regular column
#NOTE(review): the output shown for this cell has a 'States' column and no A-E labels,
#but 'States' is only created in the cells further down - the notebook appears to have
#been executed out of order; TODO confirm and re-run top to bottom

df.reset_index()

Unnamed: 0,States,W,X,Y,Z
0,CA,2.70685,0.628133,0.907969,0.503826
1,NY,0.651118,-0.319318,-0.848077,0.605965
2,WY,-2.018168,0.740122,0.528813,-0.589001
3,OR,0.188695,-0.758872,-0.933237,0.955057
4,CO,0.190794,1.978757,2.605967,0.683509


#### Add a new column and use it as the index

In [144]:
# one state code per row of df
newind = ['CA', 'NY', 'WY', 'OR', 'CO']

In [145]:
newind

['CA', 'NY', 'WY', 'OR', 'CO']

In [146]:
df['States'] = newind

In [148]:
# use one of the columns as the index
# set_index replaces the current index, so no reset_index is needed first

df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509


In [151]:
# Index levels - index hierarchy
# outer level: group name; inner level: number within the group

outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [152]:
# 6x2 frame of random draws indexed by the two-level hier_index
df = pd.DataFrame(
    data=randn(6, 2),
    index=hier_index,
    columns=['A', 'B'],
)

In [153]:
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [155]:
# two-step lookup: .loc['G1'] returns the rows of group G1, then .loc[1] picks inner label 1
df.loc['G1'].loc[1]

A    0.302665
B    1.693723
Name: 1, dtype: float64

In [156]:
df.index.names

FrozenList([None, None])

In [157]:
df.index.names = ['Groups', 'Num']

In [158]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [161]:
df.loc['G2'].loc[2]

A    0.807706
B    0.072960
Name: 2, dtype: float64

#### cross section

In [166]:
# cross section on the inner level: the rows whose 'Num' equals 1, taken from every group
df.xs(1, level = 'Num')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.302665,1.693723
G2,0.166905,0.184502


### The End