#### Importing the pandas and Numpy

In [1]:
import numpy as np
import pandas as pd

#### Creating a Series Object

In [2]:
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

#### Creating a DataFrame 

#### By passing a Numpy array, with a datetime index and labeled columns

In [4]:
dates = pd.date_range("20130101",periods=6)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list("ABCD"))

In [5]:
df

Unnamed: 0,A,B,C,D
2013-01-01,1.24724,-0.298577,-2.409358,-0.026067
2013-01-02,-1.617793,0.243773,-0.133399,-0.035955
2013-01-03,-1.736064,0.098297,0.463387,-0.304373
2013-01-04,-0.628743,1.492278,2.419546,0.186677
2013-01-05,1.187242,-0.847169,0.311317,-0.582514
2013-01-06,1.80409,-0.821284,1.979296,0.413671


##### Creating a DataFrame by passing a dict of objects 

In [8]:
dic = {
    "A": 1.0,
    "B":pd.Timestamp(20130102),
    "C":pd.Series(1,index=list(range(4)),dtype='float32'),
    "D":np.array([3]*4,dtype='int32'),
    "E":pd.Categorical(["test","train","test","train"]),
    "F":"foo"
      }
df2 = pd.DataFrame(dic)
df2.head()

Unnamed: 0,A,B,C,D,E,F
0,1.0,1970-01-01 00:00:00.020130102,1.0,3,test,foo
1,1.0,1970-01-01 00:00:00.020130102,1.0,3,train,foo
2,1.0,1970-01-01 00:00:00.020130102,1.0,3,test,foo
3,1.0,1970-01-01 00:00:00.020130102,1.0,3,train,foo


In [9]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

### Viewing Data

In [10]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,1.24724,-0.298577,-2.409358,-0.026067
2013-01-02,-1.617793,0.243773,-0.133399,-0.035955
2013-01-03,-1.736064,0.098297,0.463387,-0.304373
2013-01-04,-0.628743,1.492278,2.419546,0.186677
2013-01-05,1.187242,-0.847169,0.311317,-0.582514


In [12]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.628743,1.492278,2.419546,0.186677
2013-01-05,1.187242,-0.847169,0.311317,-0.582514
2013-01-06,1.80409,-0.821284,1.979296,0.413671


In [13]:
print(df.index)
print(df.columns)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')


**Numpy arrays have one dtype for the entire array, while pandas DataFrame have one dtype per column**

**Showing quick statistic summary of data**

In [14]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.042662,-0.022114,0.438465,-0.058093
std,1.564319,0.868838,1.717717,0.35204
min,-1.736064,-0.847169,-2.409358,-0.582514
25%,-1.37053,-0.690607,-0.02222,-0.237269
50%,0.27925,-0.10014,0.387352,-0.031011
75%,1.232241,0.207404,1.600319,0.133491
max,1.80409,1.492278,2.419546,0.413671


**Transposing data**

In [15]:
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,1.24724,-1.617793,-1.736064,-0.628743,1.187242,1.80409
B,-0.298577,0.243773,0.098297,1.492278,-0.847169,-0.821284
C,-2.409358,-0.133399,0.463387,2.419546,0.311317,1.979296
D,-0.026067,-0.035955,-0.304373,0.186677,-0.582514,0.413671


**Sorting by axes**

In [16]:
df.sort_index(axis=1,ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.026067,-2.409358,-0.298577,1.24724
2013-01-02,-0.035955,-0.133399,0.243773,-1.617793
2013-01-03,-0.304373,0.463387,0.098297,-1.736064
2013-01-04,0.186677,2.419546,1.492278,-0.628743
2013-01-05,-0.582514,0.311317,-0.847169,1.187242
2013-01-06,0.413671,1.979296,-0.821284,1.80409


**Sorting by values**

In [17]:
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-05,1.187242,-0.847169,0.311317,-0.582514
2013-01-06,1.80409,-0.821284,1.979296,0.413671
2013-01-01,1.24724,-0.298577,-2.409358,-0.026067
2013-01-03,-1.736064,0.098297,0.463387,-0.304373
2013-01-02,-1.617793,0.243773,-0.133399,-0.035955
2013-01-04,-0.628743,1.492278,2.419546,0.186677


### Selection

Pandas Data selections methods are : .at, .iat, .loc, .iloc

**Selecting a single column, which yields a Series**

In [18]:
df["A"]

2013-01-01    1.247240
2013-01-02   -1.617793
2013-01-03   -1.736064
2013-01-04   -0.628743
2013-01-05    1.187242
2013-01-06    1.804090
Freq: D, Name: A, dtype: float64

**Selecting via [ ]**

In [19]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,1.24724,-0.298577,-2.409358,-0.026067
2013-01-02,-1.617793,0.243773,-0.133399,-0.035955
2013-01-03,-1.736064,0.098297,0.463387,-0.304373


In [20]:
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,-1.617793,0.243773,-0.133399,-0.035955
2013-01-03,-1.736064,0.098297,0.463387,-0.304373
2013-01-04,-0.628743,1.492278,2.419546,0.186677


#### Selecting by Label

**For getting a cross section using a label**

In [22]:
df.loc[dates[0]] #selecting first data row values

A    1.247240
B   -0.298577
C   -2.409358
D   -0.026067
Name: 2013-01-01 00:00:00, dtype: float64

**Selecting on a multi-axis label**

In [23]:
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2013-01-01,1.24724,-0.298577
2013-01-02,-1.617793,0.243773
2013-01-03,-1.736064,0.098297
2013-01-04,-0.628743,1.492278
2013-01-05,1.187242,-0.847169
2013-01-06,1.80409,-0.821284


**Showing label slicing, both endpoints are included**

In [24]:
df.loc["20130102":"20130104",["A","B"]]

Unnamed: 0,A,B
2013-01-02,-1.617793,0.243773
2013-01-03,-1.736064,0.098297
2013-01-04,-0.628743,1.492278


**Reduction in the dimensions of the returned object**

In [25]:
df.loc["20130102",["A","B"]]

A   -1.617793
B    0.243773
Name: 2013-01-02 00:00:00, dtype: float64

In [27]:
#Getting scalar value
df.loc[dates[0],"A"]

1.2472404724507258

**For getting fast access to a scalar**

In [28]:
df.at[dates[0],"A"]

1.2472404724507258

### Selection by Position

**Select via the position of the passed integers**

In [29]:
df.iloc[3]

A   -0.628743
B    1.492278
C    2.419546
D    0.186677
Name: 2013-01-04 00:00:00, dtype: float64

**By integer slices**

In [30]:
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,-0.628743,1.492278
2013-01-05,1.187242,-0.847169


**By list of integer position locations**

In [31]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-1.617793,-0.133399
2013-01-03,-1.736064,0.463387
2013-01-05,1.187242,0.311317


**For getting a value explicitly**

In [32]:
df.iloc[1,2]

-0.1333987025685469

In [34]:
# for getting fast access to a scalar 
df.iat[1,2]

-0.1333987025685469

### Boolean Indexing

**Using a single column's values to select data**

In [35]:
df[df["A"]>0]

Unnamed: 0,A,B,C,D
2013-01-01,1.24724,-0.298577,-2.409358,-0.026067
2013-01-05,1.187242,-0.847169,0.311317,-0.582514
2013-01-06,1.80409,-0.821284,1.979296,0.413671


**Selecting values from a DataFrame where a boolean condition is met**

In [36]:
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,1.24724,,,
2013-01-02,,0.243773,,
2013-01-03,,0.098297,0.463387,
2013-01-04,,1.492278,2.419546,0.186677
2013-01-05,1.187242,,0.311317,
2013-01-06,1.80409,,1.979296,0.413671


In [37]:
df2 = df.copy()
df2["E"] = ["one","one","two","three","four","three"]
df2 

Unnamed: 0,A,B,C,D,E
2013-01-01,1.24724,-0.298577,-2.409358,-0.026067,one
2013-01-02,-1.617793,0.243773,-0.133399,-0.035955,one
2013-01-03,-1.736064,0.098297,0.463387,-0.304373,two
2013-01-04,-0.628743,1.492278,2.419546,0.186677,three
2013-01-05,1.187242,-0.847169,0.311317,-0.582514,four
2013-01-06,1.80409,-0.821284,1.979296,0.413671,three


In [39]:
df2[df2["E"].isin(["two","four"])] #.isin() method is best suitable for selecting column values 

Unnamed: 0,A,B,C,D,E
2013-01-03,-1.736064,0.098297,0.463387,-0.304373,two
2013-01-05,1.187242,-0.847169,0.311317,-0.582514,four


### Setting

**Setting a new column automatically aligns the data by the indexes**

In [41]:
s1 = pd.Series([1,2,3,4,5,6],index=pd.date_range("20130102",periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

**Setting values by label**

In [43]:
df["F"]=s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1.24724,-0.298577,-2.409358,-0.026067,
2013-01-02,-1.617793,0.243773,-0.133399,-0.035955,1.0
2013-01-03,-1.736064,0.098297,0.463387,-0.304373,2.0
2013-01-04,-0.628743,1.492278,2.419546,0.186677,3.0
2013-01-05,1.187242,-0.847169,0.311317,-0.582514,4.0
2013-01-06,1.80409,-0.821284,1.979296,0.413671,5.0


**Setting Values by label**

In [45]:
df.at[dates[0],"A"]=0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.298577,-2.409358,-0.026067,
2013-01-02,-1.617793,0.243773,-0.133399,-0.035955,1.0
2013-01-03,-1.736064,0.098297,0.463387,-0.304373,2.0
2013-01-04,-0.628743,1.492278,2.419546,0.186677,3.0
2013-01-05,1.187242,-0.847169,0.311317,-0.582514,4.0
2013-01-06,1.80409,-0.821284,1.979296,0.413671,5.0


**Setting values by position**

In [47]:
df.iat[0,4]=0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,-0.298577,-2.409358,-0.026067,0.0
2013-01-02,-1.617793,0.243773,-0.133399,-0.035955,1.0
2013-01-03,-1.736064,0.098297,0.463387,-0.304373,2.0
2013-01-04,-0.628743,1.492278,2.419546,0.186677,3.0
2013-01-05,1.187242,-0.847169,0.311317,-0.582514,4.0
2013-01-06,1.80409,-0.821284,1.979296,0.413671,5.0


### Handling Missing Data