# Summary of Pandas
## By Allen Huang

In [9]:
import pandas as pd

### 1. Create Pandas Object

#### 1.1 Create Series
``` Python
pd.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)
# pass in a list to create a Series
pd.Series([0.25, 0.5, 0.75, 1.0]) 
# set the index argument
pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
# pass in a dict to create a Series, key->index, value->value
pd.Series({2:'a', 1:'b', 3:'c'})
# attribute of a Series
Series.values
Series.index
Series.index.names
# change a value using index
data['e'] = 1.25
# reindex
Series.reindex(index)
```
#### 1.2 Create DataFrame
``` Python
# you can set the columns and index
pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)
# create from Series (population and area)
# key->column name, value->value, keep index as before
pd.DataFrame({'population': population,'area': area})
# from a list of dict. 
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

# attribute of a DataFrame
DataFrame.values
DataFrame.index
DataFrame.index.names
DataFrame.columns
DataFrame.columns.names
```

### 2. Index

#### 2.1 Slicing by explicit index or implicit index
- Explicit index: when using data[1] or loc. 
- Implicit index: when using data[1:3] or iloc
- ix: a combination of loc and iloc (deprecated in pandas 0.20 and removed in 1.0; prefer loc or iloc)

```Python
# for a Series, just use the index if index is a string
data['a':'c']
data.iloc[1:3]
data.loc[1:3]
data.ix[1:3]
# for a DataFrame, [slicing row, slicing column] 
data.iloc[:3, :2]
data.loc[:'Florida', :'pop']
data.ix[:3, :'pop']
```
#### 2.2 MultiIndex
```Python
# build a MultiIndex from tuples, from arrays, from a Cartesian product, or from explicit levels
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])
pd.MultiIndex(levels=[['a', 'b'], [1, 2]], labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
# we can set the index as a list
index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]]
# MultiIndex of a Series: Series[the first level, the second level]
pop[:, 2000]
# MultiIndex of a DataFrame: df[column level1, column level2]
# df.loc[(row level1, row level2), (column level1, column level2)]
health_data.loc[:, ('Bob', 'HR')]
health_data.loc[(2013,1), ('Bob', 'HR')]
# unstack(): multiply indexed Series -> DataFrame
pop_df = pop.unstack()
pop_df.stack()
pop.unstack(level=0)
# the best way: using IndexSlice
idx = pd.IndexSlice
health_data.loc[idx[2013:2014,1], idx['Bob':'Guido', 'Temp']]
health_data.loc[idx[:, 1], idx[:, 'HR']]
# build a MultiIndex from the column values
pop_flat.set_index(['state', 'year'])
```
#### 2.3 Sort and set index
```Python
# sort index
data = data.sort_index()
# move the index of a Series into columns (name= labels the values column), which turns it into a df
pop_flat = pop.reset_index(name='population')
```

### 3. Selection

#### 3.1 selection of Series

```Python
# on a Series, [] selects by index label
data['area']
# masking of a Series
data[(data > 0.3) & (data < 0.8)]
# fancy indexing, take care of double []
data[['a', 'e']]


```
#### 3.2 selection of DataFrame

```Python
# on a DataFrame, [] selects a column
data['area']
# data.density is data['density'] only when column is a string
data.loc[data.density > 100]
data['density'] = data['pop'] / data['area']
# fancy indexing with iloc, selects rows by position
data.iloc[[0,1,2,3]]
```

### 4. Basic Operations

#### 4.1 UFuncs

```Python
# transposition
data.T
# divide between two Series
Series1 / Series2
# union of the indexes of two Series
area.index | population.index
# add
A+B
A.add(B, fill_value = ?)
# operation between df and Series
df.subtract(df['R'], axis=0)
```
#### 4.2 Deal with missing value

```Python
# check for missing value
data.isnull()
data.notnull()
# Boolean masks can be used directly as a ``Series`` or ``DataFrame`` index
data[data.notnull()]
# drop missing value, rows by default
data.dropna()
df.dropna(axis='columns')
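# how='any' (default) drops a row/column if any value is missing; how='all' drops it only if all values are missing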
df.dropna(axis='columns', how='all')
# the thresh parameter lets you specify a minimum number of non-null values for the row/column to be kept
df.dropna(axis='rows', thresh=3)
# fill null value
data.fillna(0)
# forward fill and back fill
data.fillna(method='ffill')
data.fillna(method='bfill')
df.fillna(method='ffill', axis=1)
```
#### 4.3 Data Aggregations
```Python
data_mean = health_data.mean(level='year')
data_mean.sum(axis=1, level='type')
```

### 5. Advanced Operations

#### 5.1 Concat and Append
```Python
pd.concat([ser1, ser2])
pd.concat([df3, df4], axis= 1)
# ignore the original (possibly overlapping) indexes and create a new integer index
pd.concat([x, y], ignore_index=True)
# raise an error when the indexes overlap
pd.concat([x, y], verify_integrity=True)
# specify a label for the data sources
pd.concat([x, y], keys=['x', 'y'])
# use inner join to avoid NaN
pd.concat([df5, df6], join='inner')
# specify the index of the remaining columns
pd.concat([df5, df6], join_axes=[df5.columns])
# append 
df1.append(df2)
```
#### 5.2 Merge
```Python
pd.merge(df1, df2)