# The Pandas Library, an Introduction

In [None]:
import pandas as pd
import numpy as np

## Introduction to Pandas Data Structures

### Declaring a Series 

In [None]:
s = pd.Series([12, -4, 7, 9])
s

In [None]:
s = pd.Series([12, -4, 7, 9], index=['a', 'b', 'c', 'd'])
s

In [None]:
s.values

In [None]:
s.index

### Selecting the Internal Elements

In [None]:
# Select the third element by position. s is labelled 'a'..'d', so the
# explicit positional accessor avoids the integer being read as a label.
s.iloc[2]

In [None]:
s['b']

In [None]:
s[0:2]

In [None]:
s[['b','c']]

### Assigning Values to the Elements

In [None]:
# Overwrite the second element by position. With string labels on s, a bare
# integer key in [] is ambiguous (position vs. label), so use .iloc.
s.iloc[1] = 0
s

In [None]:
s['b'] = 1
s

### Defining Series from NumPy Arrays and Other Series

In [None]:
arr = np.array([1, 2, 3, 4])
s3 = pd.Series(arr)
s3

In [None]:
s4 = pd.Series(s)
s4

In [None]:
s3

### Filtering Values 

In [None]:
s[s > 8]

### Operations and Mathematical Functions

In [None]:
s / 2

In [None]:
np.log(s)

### Evaluating Values

In [None]:
serd = pd.Series([1,0,2,1,2,3], index=['white', 'white', 'blue', 'green', 'green', 'yellow'])
serd

In [None]:
serd.unique()

In [None]:
serd.value_counts()

In [None]:
serd.isin([0,3])

In [None]:
serd[serd.isin([0,3])]

### NaN Values

In [None]:
# np.nan is the supported spelling of the missing-value marker
# (the np.NaN alias is not available in NumPy 2.x).
s2 = pd.Series([5, -3, np.nan, 14])
s2

In [None]:
s2.isnull()

In [None]:
s2.notnull()

In [None]:
s2[s2.notnull()]

In [None]:
s2[s2.isnull()]

### Series as Dictionaries

In [None]:
mydict = {'red': 2000, 'blue': 1000, 'yellow': 500, 'orange': 1000}
myseries = pd.Series(mydict)
myseries

In [None]:
colors = ['red', 'yellow', 'orange', 'blue', 'green']
myseries = pd.Series(mydict, index=colors)
myseries

### Operations between Series


In [None]:
mydict2 = {'red': 400, 'yellow': 1000, 'black': 700}
myseries2 = pd.Series(mydict2)
myseries + myseries2

## The DataFrame

### Defining a DataFrame

In [None]:
# Column-oriented input: every key is a column name, every list its values.
data = {
    'color': ['blue', 'green', 'yellow', 'red', 'white'],
    'object': ['ball', 'pen', 'pencil', 'paper', 'mug'],
    'price': [1.2, 1.0, 0.6, 0.9, 1.7],
}
frame = pd.DataFrame(data)
frame

In [None]:
frame2 = pd.DataFrame(data, columns=['object', 'price'])
frame2

In [None]:
frame2 = pd.DataFrame(data, index=['one', 'two', 'three', 'four', 'five'])
frame2

In [None]:
frame3 = pd.DataFrame(np.arange(16).reshape((4,4)),
                     index=['red', 'blue', 'yellow', 'white'],
                     columns=['ball', 'pen', 'pencil', 'paper'])
frame3

### Selecting Elements

In [None]:
frame.columns

In [None]:
frame.index

In [None]:
frame.values

In [None]:
frame['price']

In [None]:
frame.price

In [None]:
# The .ix accessor no longer exists in pandas; .loc is the label-based
# replacement (frame still has its default integer row labels here).
frame.loc[2]

In [None]:
a = frame.loc[2]
a

In [None]:
frame.iloc[2]

In [None]:
frame.iloc[[2,4]]

In [None]:
# .ix has been removed from pandas; select the rows labelled 2 and 4 with .loc
frame.loc[[2, 4]]

In [None]:
frame[0:1]

In [None]:
frame[1:3]

In [None]:
frame['object'][3]

### Assigning Values

In [None]:
# Name the row index and the column index of frame, then display it.
frame.index.name = 'id'
frame.columns.name = 'item'
frame

In [None]:
frame['new'] = 12
frame

In [None]:
frame['new'] = [3.0, 1.3, 2.2, 0.8, 1.1]
frame

In [None]:
ser = pd.Series(np.arange(5))
ser

In [None]:
frame['new'] = ser
frame

In [None]:
# Set one cell with a single .loc call (row label 2, column 'price').
# Chained indexing (frame['price'][2] = ...) may write to a temporary copy
# and leave frame itself unchanged.
frame.loc[2, 'price'] = 3.3

### Membership of a Value

In [None]:
frame.isin([1.0, 'pen'])

In [None]:
frame[frame.isin([1.0, 'pen'])]

### Deleting a Column

In [None]:
del frame['new']
frame

### Filtering 

In [None]:
frame[frame['price'] < 1.2]

### DataFrame from Nested dict 

In [None]:
# Nested dict: outer keys name the columns, inner keys name the rows.
# 'red' has no entry for 2011.
nestdict = {
    'red': {2012: 22, 2013: 33},
    'white': {2011: 13, 2012: 22, 2013: 16},
    'blue': {2011: 17, 2012: 27, 2013: 18},
}
frame2 = pd.DataFrame(nestdict)
frame2

### Transposition of a DataFrame 

In [None]:
frame2.T

## The Index Objects

In [None]:
ser = pd.Series([5, 0, 3, 8, 4], index=['red', 'blue', 'yellow', 'white', 'green'])
ser.index

### Methods on Index

In [None]:
ser.idxmin()

In [None]:
ser.idxmax()

### Index with Duplicate Labels

In [None]:
serd = pd.Series(range(6), index=['white', 'white', 'blue', 'green', 'green', 'yellow'])
serd

In [None]:
serd['white']

In [None]:
serd.index.is_unique

In [None]:
frame.index.is_unique

### Other Functionalities on Indexes

In [None]:
ser = pd.Series([2, 5, 7, 4], index=['one', 'two', 'three', 'four'])
ser

In [None]:
ser.reindex(['three', 'four', 'five', 'one'])

In [None]:
ser3 = pd.Series([1, 5, 6, 3], index=[0, 3, 5, 6])
ser3

In [None]:
ser3.reindex(range(6), method='ffill')

In [None]:
ser3.reindex(range(6), method='bfill')

In [None]:
# NOTE(review): at this point frame's columns are 'color', 'object' and 'price'
# ('new' was deleted earlier), so 'colors' and 'new' do not name existing columns.
# Presumably method='ffill' also fills them along the column axis — confirm this
# is the intended demonstration and that 'colors' is not a typo for 'color'.
frame.reindex(range(5), method='ffill', columns=['colors', 'price', 'new', 'object'])

### Dropping

In [None]:
ser = pd.Series(np.arange(4.), index=['red', 'blue', 'yellow', 'white'])
ser

In [None]:
ser.drop('yellow')

In [None]:
ser.drop(['blue','white'])

In [None]:
frame = pd.DataFrame(np.arange(16).reshape((4,4)), 
                    index=['red', 'blue', 'yellow', 'white'],
                    columns=['ball', 'pen', 'pencil', 'paper'])
frame

In [None]:
frame.drop(['blue','yellow'])

In [None]:
frame.drop(['pen','pencil'], axis=1)

### Arithmetic and Data Alignment

In [None]:
# Two Series whose label sets only partly overlap.
s1 = pd.Series(
    [3, 2, 5, 1],
    index=['white', 'yellow', 'green', 'blue'],
)
s2 = pd.Series(
    [1, 4, 7, 2, 1],
    index=['white', 'yellow', 'black', 'blue', 'brown'],
)

In [None]:
s1 + s2

In [None]:
# Two frames that share only some row labels and some column labels.
frame1 = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['red', 'blue', 'yellow', 'white'],
    columns=['ball', 'pen', 'pencil', 'paper'],
)
frame2 = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=['blue', 'green', 'white', 'yellow'],
    columns=['mug', 'pen', 'ball'],
)
frame1

In [None]:
frame2

In [None]:
frame1 + frame2

## Operations between Data Structures

### Flexible Arithmetic Methods

In [None]:
frame1.add(frame2)

### Operations between DataFrame and Series

In [None]:
frame = pd.DataFrame(np.arange(16).reshape((4,4)),
          index=['red', 'blue', 'yellow', 'white'],
          columns=['ball','pen','pencil','paper'])
frame

In [None]:
ser = pd.Series(np.arange(4), index=['ball','pen','pencil','paper'])
ser

In [None]:
frame - ser

In [None]:
ser['mug'] = 9
ser

In [None]:
frame - ser

## Function Application and Mapping

### Functions by Element

In [None]:
frame = pd.DataFrame(np.arange(16).reshape((4,4)),
          index=['red', 'blue', 'yellow', 'white'],
          columns=['ball','pen','pencil','paper'])
np.sqrt(frame)

### Functions by Row or Column 

In [None]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    Same computation as the one-liner ``lambda x: x.max() - x.min()``.
    ``x`` only needs to provide ``max()`` and ``min()`` methods.
    """
    return x.max() - x.min()

frame.apply(f)

In [None]:
frame.apply(f, axis=1)

In [None]:
def f(x):
    """Return the minimum and maximum of ``x`` as a Series labelled 'min' and 'max'."""
    lowest = x.min()
    highest = x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])
frame.apply(f)

In [None]:
frame.sum()

In [None]:
frame.mean()

In [None]:
frame.describe()

## Sorting and Ranking

In [None]:
ser = pd.Series([5, 0, 3, 8, 4], index=['red','blue','yellow','white','green'])
ser

In [None]:
ser.sort_index()

In [None]:
ser.sort_index(ascending=False)

In [None]:
frame = pd.DataFrame(np.arange(16).reshape((4,4)),
          index=['red', 'blue', 'yellow', 'white'],
          columns=['ball','pen','pencil','paper'])
frame

In [None]:
frame.sort_index()

In [None]:
frame.sort_index(axis=1)

In [None]:
ser.sort_values()

In [None]:
frame.sort_values(by='pen')

In [None]:
frame.sort_values(by=['pen','pencil'])

In [None]:
ser.rank()

In [None]:
ser.rank(method='first')

In [None]:
ser.rank(ascending=False)

## Correlation and Covariance

In [None]:
# Two yearly series over the same labels, then their correlation.
seq = pd.Series(
    [1, 2, 3, 4, 4, 3, 2, 1],
    index=['2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013'],
)
seq2 = pd.Series(
    [3, 4, 3, 4, 5, 4, 3, 2],
    index=['2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013'],
)
seq.corr(seq2)

In [None]:
seq.cov(seq2)

In [None]:
frame2 = pd.DataFrame([[1,4,3,6],[4,5,6,1],[3,3,1,5],[4,1,6,4]],
          index=['red', 'blue', 'yellow', 'white'],
          columns=['ball','pen','pencil','paper'])
frame2

In [None]:
frame2.corr()

In [None]:
frame2.cov()

In [None]:
ser = pd.Series([0, 1, 2, 3, 9], index=['red','blue','yellow','white','green'])
ser

In [None]:
frame2.corrwith(ser)

In [None]:
frame2.corrwith(frame)

## "Not a Number" Data

### Assigning a NaN Value

In [None]:
# np.nan is the supported spelling of the missing-value marker
# (the np.NaN alias is not available in NumPy 2.x).
ser = pd.Series([0, 1, 2, np.nan, 9], index=['red', 'blue', 'yellow', 'white', 'green'])
ser

In [None]:
ser['white'] = None
ser

### Filtering Out NaN Values

In [None]:
ser.dropna()

In [None]:
ser[ser.notnull()]

In [None]:
# Frame with one all-NaN row ('green') and one all-NaN column ('mug').
frame3 = pd.DataFrame(
    [
        [6, np.nan, 6],
        [np.nan, np.nan, np.nan],
        [2, np.nan, 5],
    ],
    index=['blue', 'green', 'red'],
    columns=['ball', 'mug', 'pen'],
)
frame3

In [None]:
frame3.dropna()

In [None]:
frame3.dropna(how='all')

### Filling in NaN Occurrences

In [None]:
frame3.fillna(0)

In [None]:
frame3.fillna({'ball':1, 'mug':0, 'pen': 99})

## Hierarchical Indexing and Leveling

In [None]:
mser = pd.Series(np.random.rand(8),
                index=[['white','white','white','blue','blue','red','red','red'],
                      ['up','down','right','up','down','up','down','left']])
mser

In [None]:
mser.index

In [None]:
mser['white']

In [None]:
mser[:,'up']

In [None]:
mser['white','up']

In [None]:
mser.unstack()

In [None]:
frame

In [None]:
frame.stack()

In [None]:
mframe = pd.DataFrame(np.random.randn(16).reshape(4,4),
                     index=[['white','white','red','red'], ['up','down','up','down']],
                     columns=[['pen','pen','paper','paper'],[1,2,1,2]])
mframe

### Reordering and Sorting Levels

In [None]:
mframe.columns.names = ['objects','id']
mframe.index.names = ['colors','status']
mframe

In [None]:
mframe.swaplevel('colors','status')

In [None]:
mframe.sort_index(level='colors')

### Summary Statistics by Level

In [None]:
# Sum the rows within each 'colors' group. sum() no longer accepts a level
# argument, so group on the index level first; sort=False keeps the groups
# in their order of appearance.
mframe.groupby(level='colors', sort=False).sum()

In [None]:
# Sum the columns within each 'id' group. sum() no longer accepts a level
# argument, so transpose, group on the (former) column level, sum, and
# transpose back; sort=False keeps the groups in their order of appearance.
mframe.T.groupby(level='id', sort=False).sum().T