In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # display the result of every top-level expression in a cell, not just the last one

In [36]:
# pandas has two core data structures: Series (1-D) and DataFrame (2-D).
# (Older pandas releases also had a 3-D Panel; it was deprecated and later removed.)

# Seed NumPy's global RNG so the random frame below -- and every output that
# depends on it -- is identical on each Restart Kernel -> Run All.
np.random.seed(0)

# Creating a Series from a list; pandas supplies a default integer index
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s # Remember that indexes in Python start at 0, not 1

# Creating a DataFrame by passing a NumPy array with a datetime index and labelled columns
dates = pd.date_range('20130101', periods=6) # 6 consecutive daily timestamps, used as the row index
dates
df = pd.DataFrame(np.random.randn(6, 4), index = dates, columns = list('ABCD')) # 6 rows x 4 columns
df

# Creating a DataFrame by passing a dict of objects that can be converted to Series-like.
# Scalar entries (A, B, F) are repeated for every row of the resulting frame.
df2 = pd.DataFrame({'A' : 1.,
                   'B' : pd.Timestamp('20130102'),
                   'C' : pd.Series(1, index = list(range(4)), dtype = 'float32'),
                   'D' : np.array([3] * 4, dtype = 'int32'),
                   'E' : pd.Categorical(["test", "train", "test", "train"]),
                   'F' : 'foo'})
df2

# Each column of the resulting DataFrame keeps its own dtype
df2.dtypes
# Tab completion for attributes is built into IPython: type `df2.` and press <TAB> to list them

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

Unnamed: 0,A,B,C,D
2013-01-01,-0.293688,-0.473759,0.288288,-0.02545
2013-01-02,-0.264828,-1.621612,0.618854,-0.870892
2013-01-03,-0.408055,-1.233055,0.215659,1.462381
2013-01-04,-0.889685,-1.058136,-0.735844,0.605095
2013-01-05,0.266253,-0.027101,-0.174008,-0.359295
2013-01-06,1.583522,0.255655,1.94118,-0.238714


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [21]:
# Peek at the first rows (5 is the default count) and at the last 3 rows
df2.head(n = 5)
df2.tail(n = 3)

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


Unnamed: 0,A,B,C,D,E,F
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [28]:
# Inspect the pieces of the frame: row labels, column labels and the raw NumPy values
df.index
df.columns
df.values
# Per-column summary statistics (count, mean, std, min, quartiles, max)
df.describe()

# Swap rows and columns (the long form of the .T shorthand)
df.transpose()

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

Index(['A', 'B', 'C', 'D'], dtype='object')

array([[ 0.04031971,  1.64970906,  1.96297017, -0.67401388],
       [-0.40793297,  0.98439977,  0.787499  , -0.18044256],
       [ 1.03063351, -1.25312546, -1.24929537, -1.70154745],
       [-1.32273186, -1.06957318,  0.19923243,  0.88937514],
       [-0.17144   , -0.86395418, -0.83428291, -1.54512931],
       [-0.30796198,  1.80438547,  2.23015043, -0.32912377]])

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.189852,0.20864,0.516046,-0.590147
std,0.759977,1.424495,1.424473,0.956997
min,-1.322732,-1.253125,-1.249295,-1.701547
25%,-0.38294,-1.018168,-0.575904,-1.32735
50%,-0.239701,0.060223,0.493366,-0.501569
75%,-0.01262,1.483382,1.669102,-0.217613
max,1.030634,1.804385,2.23015,0.889375


Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.04032,-0.407933,1.030634,-1.322732,-0.17144,-0.307962
B,1.649709,0.9844,-1.253125,-1.069573,-0.863954,1.804385
C,1.96297,0.787499,-1.249295,0.199232,-0.834283,2.23015
D,-0.674014,-0.180443,-1.701547,0.889375,-1.545129,-0.329124


In [29]:
# Order the columns by their labels, descending ("columns" is axis 1)
df.sort_index(axis = "columns", ascending = False)

# Order the rows by the values held in column B
df.sort_values("B")

Unnamed: 0,D,C,B,A
2013-01-01,-0.674014,1.96297,1.649709,0.04032
2013-01-02,-0.180443,0.787499,0.9844,-0.407933
2013-01-03,-1.701547,-1.249295,-1.253125,1.030634
2013-01-04,0.889375,0.199232,-1.069573,-1.322732
2013-01-05,-1.545129,-0.834283,-0.863954,-0.17144
2013-01-06,-0.329124,2.23015,1.804385,-0.307962


In [31]:
# While selecting and setting can be done with standard Python/NumPy expressions,
# prefer the optimized pandas data access methods: .at, .iat, .loc and .iloc
# (the old .ix accessor is deprecated and has been removed from current pandas releases)
# Selecting a single column, which gives a Series
df['A']

# Selecting via [] with a slice, which slices the rows
df[0:3] # positional slice: the stop position is not included
df['20130102':'20130104'] # the same [] syntax, sliced by date labels

# Selection by label
# Getting a cross-section (one row) using a label
df.loc[dates[0]]

# Selecting on both axes by labels
df.loc[:, ['A', 'B']]

# Label slicing: both endpoints are included
df.loc['20130102':'20130104', ['A', 'B']]

# A single row label reduces the dimensions of the returned object
df.loc['20130102', ['A', 'B']]

# Getting a scalar value
df.loc[dates[0],'A']

# Getting fast access to a scalar (equivalent to the previous expression)
df.at[dates[0], 'A']

2013-01-01    0.040320
2013-01-02   -0.407933
2013-01-03    1.030634
2013-01-04   -1.322732
2013-01-05   -0.171440
2013-01-06   -0.307962
Freq: D, Name: A, dtype: float64

In [44]:
# Selection by position
# Select a row via its integer position
df.iloc[3] # position 3 is the 4th row, because positions are 0-based

# By integer slices, acting like NumPy/Python slicing
df.iloc[3:5, 0:2] # rows at positions 3 and 4, columns at positions 0 and 1; the stop position is excluded

# By lists of integer positions, similar to NumPy/Python style
df.iloc[[1, 2, 4], [0, 2]]

# Slicing rows explicitly (all columns)
df.iloc[1:3,:]

# Slicing columns explicitly (all rows)
df.iloc[:, 1:3]

# Getting a single value by row and column position
df.iloc[1, 1]

# Getting fast access to a scalar (equivalent to the previous expression)
df.iat[1, 1]

A   -0.889685
B   -1.058136
C   -0.735844
D    0.605095
Name: 2013-01-04 00:00:00, dtype: float64

In [52]:
# Boolean indexing
# Keep only the rows in which column A is positive
df[df['A'] > 0]

# Element-wise mask over the whole frame: entries that fail the test (<= 0) come back as NaN
df[df > 0]

# Filtering with isin(): start from a copy of df carrying an extra label column E ...
df2 = df.assign(E = ['one', 'one', 'two', 'three', 'four', 'three'])
df2
# ... then keep the rows whose E value is one of the listed labels
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D
2013-01-05,0.266253,-0.027101,-0.174008,-0.359295
2013-01-06,1.583522,0.255655,1.94118,-0.238714


In [None]:
# Setting
# setting a new column automatically aligns the data by indexes