## Pandas

#### [https://pandas.pydata.org/pandas-docs/stable/user_guide/index.html](https://pandas.pydata.org/pandas-docs/stable/user_guide/index.html)

In [4]:
import pandas as pd
import numpy as np

## pd.Serise

In [3]:
data = pd.Series([1,2,3,4,5])
type(data)

pandas.core.series.Series

In [108]:
data = pd.Series(['A', 'B', 'C', 'D', 'E'], index = range(5))
data

0    A
1    B
2    C
3    D
4    E
dtype: object

In [9]:
data.values

array(['A', 'B', 'C', 'D', 'E'], dtype=object)

In [11]:
data.index

RangeIndex(start=0, stop=5, step=1)

In [13]:
dup = lambda x: x*2
data.map(dup)

0    AA
1    BB
2    CC
3    DD
4    EE
dtype: object

In [109]:
data + '-' + data

0    A-A
1    B-B
2    C-C
3    D-D
4    E-E
dtype: object

In [111]:
s1 = pd.Series([3.5, 4.8, -3.1, 9.3], index = ['a', 'b', 'c', 'd'])
s2 = pd.Series([-4.2, 3.9, -4.7, 6], index = ['a', 'c', 'd', 'e'])

In [1]:
s1 + s2

NameError: name 's1' is not defined

## pd.DataFrame

In [35]:
data = {'Pop': [92, 99, 86, 90, 91], 'Eco': [2.6, 2.8, 2.1, 2.4, 1.8]}
cities = ['Beijing', 'Shanghai', 'Guangzhou', 'Shenzhen', 'Hangzhou']
frame = pd.DataFrame(data, index=cities)

In [36]:
frame

Unnamed: 0,Pop,Eco
Beijing,92,2.6
Shanghai,99,2.8
Guangzhou,86,2.1
Shenzhen,90,2.4
Hangzhou,91,1.8


In [37]:
frame.head()

Unnamed: 0,Pop,Eco
Beijing,92,2.6
Shanghai,99,2.8
Guangzhou,86,2.1
Shenzhen,90,2.4
Hangzhou,91,1.8


In [38]:
frame['Eco']

Beijing      2.6
Shanghai     2.8
Guangzhou    2.1
Shenzhen     2.4
Hangzhou     1.8
Name: Eco, dtype: float64

In [39]:
frame[['Eco', 'Pop']]

Unnamed: 0,Eco,Pop
Beijing,2.6,92
Shanghai,2.8,99
Guangzhou,2.1,86
Shenzhen,2.4,90
Hangzhou,1.8,91


In [70]:
frame.Pop

Beijing      92
Shanghai     99
Guangzhou    86
Shenzhen     90
Hangzhou     91
Name: Pop, dtype: int64

In [72]:
frame.loc['Shanghai']

Pop    99.0
Eco     2.8
Name: Shanghai, dtype: float64

In [76]:
frame['avg_sal'] = pd.Series([12000, 11000, 10000, 9000], index=['Beijing', 'Shanghai', 'Hangzhou', 'Chengdu'])

In [94]:
frame

Unnamed: 0,Pop,Eco,avg_sal
Beijing,92,2.6,12000.0
Shanghai,99,2.8,11000.0
Guangzhou,86,2.1,
Shenzhen,90,2.4,
Hangzhou,91,1.8,10000.0


In [6]:
frame[['Pop', 'avg_sal']]

NameError: name 'frame' is not defined

#### Add new data

In [26]:
df = pd.DataFrame(np.random.randn(5,4), index=['A', 'B', 'C', 'D', 'E'], columns=['C1', 'C2', 'C3', 'C4'])
df

Unnamed: 0,C1,C2,C3,C4
A,-1.420881,-0.678947,0.533885,0.743974
B,2.22505,0.117181,0.244615,-0.177299
C,-0.40573,0.781775,0.353478,-0.207279
D,-1.079697,-0.12307,-0.390982,1.255174
E,0.947126,-1.022311,1.167168,-0.571977


In [33]:
df['Max'] = df.max(axis=1)
df['Min'] = df.min(axis=1)
df['Mean'] = df.mean(axis=1)

### frame.drop

In [87]:
drop_index_frame = frame.drop('Guangzhou')
drop_index_frame.index

Index(['Beijing', 'Shanghai', 'Shenzhen', 'Hangzhou'], dtype='object')

In [90]:
drop_col_frame = frame.drop('Eco', axis=1)
dropped_frrame.columns

Index(['Pop', 'Eco', 'avg_sal'], dtype='object')

In [92]:
frame

Unnamed: 0,Pop,Eco,avg_sal
Beijing,92,2.6,12000.0
Shanghai,99,2.8,11000.0
Guangzhou,86,2.1,
Shenzhen,90,2.4,
Hangzhou,91,1.8,10000.0


In [45]:
frame2 = pd.DataFrame({'Com_num': [23000, 21000, 12000, 16000, 13000]}, index=cities)

In [115]:
frame2.head()

Unnamed: 0,Com_num
Beijing,23000
Shanghai,21000
Guangzhou,12000
Shenzhen,16000
Hangzhou,13000


### Arithmetic and Data alignment

In [154]:
frame + frame2

Unnamed: 0,Com_num,a,b,c,d
001,,,,,
002,,,,,
Beijing,,,,,
Guangzhou,,,,,
Hangzhou,,,,,
Shanghai,,,,,
Shenzhen,,,,,


In [116]:
frame.add(frame2, fill_value=0)

Unnamed: 0,Com_num,Eco,Pop,avg_sal
Beijing,23000.0,2.6,92.0,12000.0
Shanghai,21000.0,2.8,99.0,11000.0
Guangzhou,12000.0,2.1,86.0,
Shenzhen,16000.0,2.4,90.0,
Hangzhou,13000.0,1.8,91.0,10000.0


### Dataframe and Series

In [119]:
frame  = pd.DataFrame(np.arange(12).reshape((4,3)), columns = list('bde'), index=range(4))
series = frame.iloc[0]

In [121]:
frame

Unnamed: 0,b,d,e
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [123]:
series

b    0
d    1
e    2
Name: 0, dtype: int32

In [124]:
frame - series

Unnamed: 0,b,d,e
0,0,0,0
1,3,3,3
2,6,6,6
3,9,9,9


In [131]:
frame

Unnamed: 0,b,d,e
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


### Function Application and Mapping

In [129]:
def f(x):
    return pd.Series([x.min(), x.max()], index = ['min', 'max'])

def f2(x):
    return (x.max() - x.min()) / x.max()

frame.apply(f)


Unnamed: 0,b,d,e
min,0,1,2
max,9,10,11


In [132]:
frame.apply(f2, axis=1)

0    1.000000
1    0.400000
2    0.250000
3    0.181818
dtype: float64

In [143]:
fmt = lambda x: "%.2f" % x
frame.applymap(fmt)

Unnamed: 0,b,d,e
0,0.0,1.0,2.0
1,3.0,4.0,5.0
2,6.0,7.0,8.0
3,9.0,10.0,11.0


### Sorting and Ranking

In [146]:
## sort_index, sort_values

obj = pd.Series(range(4), index = ['d', 'b', 'c', 'a'])
obj.sort_values()
# obj.sort_index()

d    0
b    1
c    2
a    3
dtype: int64

In [150]:
frame = pd.DataFrame(np.arange(8).reshape((2,4)), index = ['001', '002'], columns=['d', 'b', 'c', 'a'])
frame.sort_index()

Unnamed: 0,d,b,c,a
1,0,1,2,3
2,4,5,6,7


In [160]:
frame.sort_values(by=['a', 'b'])

Unnamed: 0,d,b,c,a
1,0,1,2,3
2,4,5,6,7


### Data Loading, Storage and file formats

In [159]:
pd.read_csv

Object `frame.to_excel()` not found.
