In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression statement in a cell,
# not only the last one. The cells below rely on this to show several results each.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# pandas' two core data structures are Series (1-D) and DataFrame (2-D).
# (The 3-D Panel found in older releases has since been removed from pandas.)

# Seed for the random demo data, so that re-running the notebook reproduces the
# same frame. Outputs recorded from an earlier unseeded run will show other values.
SEED = 42

# Creating a Series from a list; np.nan marks a missing value
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s  # the default integer index starts at 0, not 1

# Creating a DataFrame by passing a numpy array with a datetime index and labelled columns
dates = pd.date_range('20130101', periods=6)  # 6 consecutive daily timestamps
dates
# 6 rows (one per date) x 4 columns (A-D) of standard-normal draws from a seeded generator
df = pd.DataFrame(np.random.RandomState(SEED).randn(6, 4), index=dates, columns=list('ABCD'))
df

# Creating a DataFrame by passing a dict of objects that can be converted to Series-like.
# The scalar entries ('A', 'B', 'F') are repeated for each of the 4 rows.
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})
df2

# The columns of the resulting DataFrame have different dtypes
df2.dtypes
# In IPython, tab completion works for column names and public attributes:
# type `df2.` and press <TAB> to list them.

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

Unnamed: 0,A,B,C,D
2013-01-01,0.483842,1.389233,-1.604447,0.686504
2013-01-02,-0.245957,0.424874,0.683387,-1.003888
2013-01-03,-1.438666,0.021363,0.025582,-1.372246
2013-01-04,0.371391,-0.065038,1.336967,-0.135345
2013-01-05,-0.592473,-0.322534,-0.281605,1.060983
2013-01-06,0.299948,0.329729,0.473723,0.888519


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [3]:
# Peek at the first rows (up to 5) and at the last 3 rows of a DataFrame
df2.head(n=5)
df2.tail(n=3)

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


Unnamed: 0,A,B,C,D,E,F
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [4]:
# Inspect the pieces of a DataFrame: row index, column labels, underlying numpy values
df.index
df.columns
df.values
# Summary statistics for each column
df.describe()

# Transpose: swap rows and columns
df.transpose()

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

Index(['A', 'B', 'C', 'D'], dtype='object')

array([[ 0.48384223,  1.38923292, -1.60444713,  0.68650368],
       [-0.24595679,  0.4248742 ,  0.68338749, -1.00388756],
       [-1.43866564,  0.02136282,  0.02558235, -1.37224637],
       [ 0.37139074, -0.06503825,  1.33696713, -0.13534527],
       [-0.59247338, -0.32253431, -0.28160506,  1.06098323],
       [ 0.29994796,  0.32972933,  0.47372305,  0.88851935]])

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.186986,0.296271,0.105601,0.020755
std,0.739392,0.60021,1.006599,1.028885
min,-1.438666,-0.322534,-1.604447,-1.372246
25%,-0.505844,-0.043438,-0.204808,-0.786752
50%,0.026996,0.175546,0.249653,0.275579
75%,0.35353,0.401088,0.630971,0.838015
max,0.483842,1.389233,1.336967,1.060983


Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.483842,-0.245957,-1.438666,0.371391,-0.592473,0.299948
B,1.389233,0.424874,0.021363,-0.065038,-0.322534,0.329729
C,-1.604447,0.683387,0.025582,1.336967,-0.281605,0.473723
D,0.686504,-1.003888,-1.372246,-0.135345,1.060983,0.888519


In [5]:
# Sort along an axis: put the column labels in descending order
df.sort_index(axis='columns', ascending=False)

# Sort the rows by the values in column B
df.sort_values('B')

Unnamed: 0,D,C,B,A
2013-01-01,0.686504,-1.604447,1.389233,0.483842
2013-01-02,-1.003888,0.683387,0.424874,-0.245957
2013-01-03,-1.372246,0.025582,0.021363,-1.438666
2013-01-04,-0.135345,1.336967,-0.065038,0.371391
2013-01-05,1.060983,-0.281605,-0.322534,-0.592473
2013-01-06,0.888519,0.473723,0.329729,0.299948


Unnamed: 0,A,B,C,D
2013-01-05,-0.592473,-0.322534,-0.281605,1.060983
2013-01-04,0.371391,-0.065038,1.336967,-0.135345
2013-01-03,-1.438666,0.021363,0.025582,-1.372246
2013-01-06,0.299948,0.329729,0.473723,0.888519
2013-01-02,-0.245957,0.424874,0.683387,-1.003888
2013-01-01,0.483842,1.389233,-1.604447,0.686504


In [6]:
# Standard Python/numpy indexing expressions work for selecting and setting, but the
# optimized pandas accessors .at, .iat, .loc and .iloc are preferred.
# (.ix, listed in older tutorials, has been removed from pandas.)
# Select one column; the result is a Series
df['A']

# Slicing with [] selects rows
df[:3]  # positions 0, 1 and 2 -- the end position is excluded
df['20130102':'20130104']

# --- Selection by label ---
# Cross-section for a single row label
df.loc[dates[0]]

# Select on both axes by label
df.loc[:, ['A', 'B']]

# Label slices include both endpoints
df.loc['20130102':'20130104', ['A', 'B']]

# A single row label reduces the dimensionality of the returned object
df.loc['20130102', ['A', 'B']]

# Scalar lookup by label
df.loc[dates[0], 'A']

# Fast scalar lookup (same result as the previous expression)
df.at[dates[0], 'A']

2013-01-01    0.483842
2013-01-02   -0.245957
2013-01-03   -1.438666
2013-01-04    0.371391
2013-01-05   -0.592473
2013-01-06    0.299948
Freq: D, Name: A, dtype: float64

Unnamed: 0,A,B,C,D
2013-01-01,0.483842,1.389233,-1.604447,0.686504
2013-01-02,-0.245957,0.424874,0.683387,-1.003888
2013-01-03,-1.438666,0.021363,0.025582,-1.372246


Unnamed: 0,A,B,C,D
2013-01-02,-0.245957,0.424874,0.683387,-1.003888
2013-01-03,-1.438666,0.021363,0.025582,-1.372246
2013-01-04,0.371391,-0.065038,1.336967,-0.135345


A    0.483842
B    1.389233
C   -1.604447
D    0.686504
Name: 2013-01-01 00:00:00, dtype: float64

Unnamed: 0,A,B
2013-01-01,0.483842,1.389233
2013-01-02,-0.245957,0.424874
2013-01-03,-1.438666,0.021363
2013-01-04,0.371391,-0.065038
2013-01-05,-0.592473,-0.322534
2013-01-06,0.299948,0.329729


Unnamed: 0,A,B
2013-01-02,-0.245957,0.424874
2013-01-03,-1.438666,0.021363
2013-01-04,0.371391,-0.065038


A   -0.245957
B    0.424874
Name: 2013-01-02 00:00:00, dtype: float64

0.48384222505192825

0.48384222505192825

In [7]:
# --- Selection by position ---
# A single integer picks the row at that position
df.iloc[3]  # position 3 is the 4th row

# Integer slices behave as in numpy/Python: the end position is excluded
df.iloc[3:5, :2]  # rows at positions 3 and 4, columns at positions 0 and 1

# Lists of integer positions, numpy/Python style
df.iloc[[1, 2, 4], [0, 2]]

# Slice rows only
df.iloc[1:3]

# Slice columns only
df.iloc[:, 1:3]

# A single value by position
df.iloc[1, 1]

# Fast scalar lookup by position (same result as the previous expression)
df.iat[1, 1]

A    0.371391
B   -0.065038
C    1.336967
D   -0.135345
Name: 2013-01-04 00:00:00, dtype: float64

Unnamed: 0,A,B
2013-01-04,0.371391,-0.065038
2013-01-05,-0.592473,-0.322534


Unnamed: 0,A,C
2013-01-02,-0.245957,0.683387
2013-01-03,-1.438666,0.025582
2013-01-05,-0.592473,-0.281605


Unnamed: 0,A,B,C,D
2013-01-02,-0.245957,0.424874,0.683387,-1.003888
2013-01-03,-1.438666,0.021363,0.025582,-1.372246


Unnamed: 0,B,C
2013-01-01,1.389233,-1.604447
2013-01-02,0.424874,0.683387
2013-01-03,0.021363,0.025582
2013-01-04,-0.065038,1.336967
2013-01-05,-0.322534,-0.281605
2013-01-06,0.329729,0.473723


0.42487420299527767

0.42487420299527767

In [8]:
# --- Boolean indexing ---
# Keep the rows where column A is positive
df[df['A'] > 0]

# Element-wise condition on the whole frame: entries that fail it (<= 0) become NaN
df[df > 0]

# Filtering with isin(): work on a copy that gets an extra label column E
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D
2013-01-01,0.483842,1.389233,-1.604447,0.686504
2013-01-04,0.371391,-0.065038,1.336967,-0.135345
2013-01-06,0.299948,0.329729,0.473723,0.888519


Unnamed: 0,A,B,C,D
2013-01-01,0.483842,1.389233,,0.686504
2013-01-02,,0.424874,0.683387,
2013-01-03,,0.021363,0.025582,
2013-01-04,0.371391,,1.336967,
2013-01-05,,,,1.060983
2013-01-06,0.299948,0.329729,0.473723,0.888519


Unnamed: 0,A,B,C,D,E
2013-01-01,0.483842,1.389233,-1.604447,0.686504,one
2013-01-02,-0.245957,0.424874,0.683387,-1.003888,one
2013-01-03,-1.438666,0.021363,0.025582,-1.372246,two
2013-01-04,0.371391,-0.065038,1.336967,-0.135345,three
2013-01-05,-0.592473,-0.322534,-0.281605,1.060983,four
2013-01-06,0.299948,0.329729,0.473723,0.888519,three


Unnamed: 0,A,B,C,D,E
2013-01-03,-1.438666,0.021363,0.025582,-1.372246,two
2013-01-05,-0.592473,-0.322534,-0.281605,1.060983,four


In [9]:
# --- Setting ---
# Assigning a Series as a new column aligns it on the index.
# NOTE(review): s1 is only built here; this cell never attaches it to df.
s1 = pd.Series(list(range(1, 7)), index=pd.date_range('20130102', periods=6))

# Set a single value by label
df.at[dates[0], 'A'] = 0

# Set a single value by position (row 0, column 1)
df.iat[0, 1] = 0

# Overwrite a whole column with a numpy array
df.loc[:, 'D'] = np.array([5] * len(df))

# where-style assignment on a copy: every positive entry is replaced by its negation
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-1.604447,-5
2013-01-02,-0.245957,-0.424874,-0.683387,-5
2013-01-03,-1.438666,-0.021363,-0.025582,-5
2013-01-04,-0.371391,-0.065038,-1.336967,-5
2013-01-05,-0.592473,-0.322534,-0.281605,-5
2013-01-06,-0.299948,-0.329729,-0.473723,-5


In [10]:
# --- Missing data ---
# pandas mainly uses np.nan to represent missing data; by default it is left out of computations.
# reindex() lets you add/change/delete labels on a specified axis and returns a copy of the data.
df1 = df.reindex(index=dates[:4], columns=df.columns.tolist() + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

# Drop every row that contains at least one missing value
df1.dropna(how='any')

# Replace missing values with 5
df1.fillna(value=5)

# Boolean mask that is True where a value is NaN
df1.isnull()

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-1.604447,5,1.0
2013-01-02,-0.245957,0.424874,0.683387,5,1.0
2013-01-03,-1.438666,0.021363,0.025582,5,
2013-01-04,0.371391,-0.065038,1.336967,5,


Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-1.604447,5,1.0
2013-01-02,-0.245957,0.424874,0.683387,5,1.0


Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-1.604447,5,1.0
2013-01-02,-0.245957,0.424874,0.683387,5,1.0
2013-01-03,-1.438666,0.021363,0.025582,5,5.0
2013-01-04,0.371391,-0.065038,1.336967,5,5.0


Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


In [11]:
# --- Operations ---
# Operations in general exclude missing data
# Descriptive statistic: the mean of each column (NA values are ignored)
df.mean()

# The same statistic on the other axis: one mean per row (index label)
df.mean(axis=1)

# Operands with different dimensionality need alignment; pandas also broadcasts
# automatically along the specified dimension
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(periods=2)  # like lag/lead in data.table
s

df.sub(s, axis=0)

A   -0.267626
B    0.064732
C    0.105601
D    5.000000
dtype: float64

In [18]:
# --- Apply ---
# Show the frame, then hand each column to a function (here numpy's cumulative sum)
df
df.apply(np.cumsum, axis=0)

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-1.604447,5
2013-01-02,-0.245957,0.424874,0.683387,5
2013-01-03,-1.438666,0.021363,0.025582,5
2013-01-04,0.371391,-0.065038,1.336967,5
2013-01-05,-0.592473,-0.322534,-0.281605,5
2013-01-06,0.299948,0.329729,0.473723,5


Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-1.604447,5
2013-01-02,-0.245957,0.424874,-0.92106,10
2013-01-03,-1.684622,0.446237,-0.895477,15
2013-01-04,-1.313232,0.381199,0.44149,20
2013-01-05,-1.905705,0.058664,0.159885,25
2013-01-06,-1.605757,0.388394,0.633608,30


In [None]:
# Apply a custom reduction to each column: its range (max - min).
# Both max() and min() must be *called*; subtracting the bound method `x.min`
# from a number raises a TypeError.
df.apply(lambda x: x.max() - x.min())