___

<p style="text-align: center;"><img src="https://docs.google.com/uc?id=1lY0Uj5R04yMY3-ZppPWxqCr5pvBLYPnV" class="img-fluid" alt="CLRSWY"></p>

___

<h1 style="text-align: center;">Pandas Lesson, Session - 4</h1>
    

# Data Frames

 - ### ``DataFrames`` are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index. Let's use pandas to explore this topic!

In [34]:
import pandas as pd
import numpy as np

 - ### Creating a DataFrame using the ``list``s of data and columns

In [35]:
datas = [1, 3, 5, 7, 9, 18]
datas

[1, 3, 5, 7, 9, 18]

In [36]:
pd.DataFrame(datas, columns=['column1'])

Unnamed: 0,column1
0,1
1,3
2,5
3,7
4,9
5,18


 - ### Creating a DataFrame using a ``NumPy`` array

In [37]:
# Odd numbers 1, 3, ..., 23 laid out row by row as a 3x4 matrix
m = np.reshape(np.arange(1, 24, 2), (3, 4))
m

array([[ 1,  3,  5,  7],
       [ 9, 11, 13, 15],
       [17, 19, 21, 23]])

In [38]:
pd.DataFrame(m, columns=['var1','var2','var3','var4'])

Unnamed: 0,var1,var2,var3,var4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [39]:
df=pd.DataFrame(data=m, columns=['var1','var2','var3','var4'])
df

Unnamed: 0,var1,var2,var3,var4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [40]:
df.head(2)

Unnamed: 0,var1,var2,var3,var4
0,1,3,5,7
1,9,11,13,15


In [41]:
df.tail(2)

Unnamed: 0,var1,var2,var3,var4
1,9,11,13,15
2,17,19,21,23


In [44]:
df.sample(2)

Unnamed: 0,var1,var2,var3,var4
1,9,11,13,15
2,17,19,21,23


In [45]:
df.columns

Index(['var1', 'var2', 'var3', 'var4'], dtype='object')

In [46]:
for i in df.columns:
    print(i)

var1
var2
var3
var4


In [47]:
# Assigning a list to the `columns` attribute renames every column at once;
# the list must provide one name per existing column.
df.columns = ['new1', 'new2', 'new3', 'new4']
df

Unnamed: 0,new1,new2,new3,new4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [48]:
type(df)

pandas.core.frame.DataFrame

In [49]:
df.shape

(3, 4)

In [50]:
df.shape[1]

4

In [51]:
df.ndim

2

In [52]:
df.size

12

In [53]:
len(df)

3

In [56]:
df.values

array([[ 1,  3,  5,  7],
       [ 9, 11, 13, 15],
       [17, 19, 21, 23]])

In [57]:
type(df.values)

numpy.ndarray

 - ### Creating a DataFrame using a ``dict``

In [58]:
# Three random integer arrays of length 4; `high` is exclusive.
s1 = np.random.randint(low=2, high=10, size=4)
s2 = np.random.randint(low=3, high=10, size=4)
s3 = np.random.randint(low=4, high=15, size=4)

In [59]:
s1

array([7, 5, 9, 9])

In [60]:
s2

array([7, 8, 8, 6])

In [61]:
s3

array([ 8, 11, 12, 14])

In [62]:
myDict= {'var1':s1, 'var2':s2, 'var3':s3}

In [63]:
df1 = pd.DataFrame(myDict)
df1

Unnamed: 0,var1,var2,var3
0,7,7,8
1,5,8,11
2,9,8,12
3,9,6,14


In [64]:
df1.index

RangeIndex(start=0, stop=4, step=1)

In [66]:
# Materialise the index labels as a plain Python list
list(df1.index)

[0, 1, 2, 3]

In [67]:
df1.index = ["a", "b", "c", "d"]

In [68]:
df1

Unnamed: 0,var1,var2,var3
a,7,7,8
b,5,8,11
c,9,8,12
d,9,6,14


In [69]:
# we can check any column name whether it belongs to the DataFrame or not

"var2" in df1

True

In [70]:
'var5' in df1

False

In [71]:
df1

Unnamed: 0,var1,var2,var3
a,7,7,8
b,5,8,11
c,9,8,12
d,9,6,14


### Now, let's examine again the ***indexing, selection*** and ***slicing*** methods and several ***attributes*** using a different DataFrame

In [1]:
from numpy.random import randn
np.random.seed(101)

<IPython.core.display.Javascript object>

In [None]:
# df3 = pd.DataFrame(randn(5,4), index = ['A','B','C','D','E'], columns = 'W X Y Z'.split())

In [5]:
df3 = pd.DataFrame(randn(5, 4), index = 'A B C D E'.split(), columns = 'W X Y Z'.split())

<IPython.core.display.Javascript object>

In [336]:
df3

Unnamed: 0,W,X,Y,Z
A,0.74556,-0.298864,0.140029,1.094066
B,-0.946043,0.065646,0.15616,-0.643741
C,-0.538651,0.386578,1.271608,-1.666574
D,0.605344,-1.525565,0.038347,0.093195
E,0.337926,0.087206,0.379886,0.966958


In [77]:
# creating a DataFrame by "positional arguments"

pd.DataFrame(randn(5, 4), 'a b c d e'.split(), 'w x y z'.split())

Unnamed: 0,w,x,y,z
a,-0.993263,0.1968,-1.136645,0.000366
b,1.025984,-0.156598,-0.031579,0.649826
c,2.154846,-0.610259,-0.755325,-0.346419
d,0.147027,-0.479448,0.558769,1.02481
e,-0.925874,1.862864,-1.133817,0.610478


In [4]:
# creating a DataFrame by "keyword arguments"
# np.random.seed(101)
pd.DataFrame(data=randn(5, 4), columns='w x y z'.split(), index='a b c d e'.split())

<IPython.core.display.Javascript object>

Unnamed: 0,w,x,y,z
a,-0.993263,0.1968,-1.136645,0.000366
b,1.025984,-0.156598,-0.031579,0.649826
c,2.154846,-0.610259,-0.755325,-0.346419
d,0.147027,-0.479448,0.558769,1.02481
e,-0.925874,1.862864,-1.133817,0.610478


## Selection and Indexing

Let's learn the various methods to grab data from a DataFrame

In [6]:
df3

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
B,0.681209,1.035125,-0.03116,1.939932
C,-1.005187,-0.74179,0.187125,-0.732845
D,-1.38292,1.482495,0.961458,-2.141212
E,0.992573,1.192241,-1.04678,1.292765


In [7]:
df3['Y']

A   -0.376519
B   -0.031160
C    0.187125
D    0.961458
E   -1.046780
Name: Y, dtype: float64

In [8]:
# Attribute (dot) access to a column (NOT RECOMMENDED!): it only works when the
# column name is a valid Python identifier and does not clash with an existing
# DataFrame attribute or method. Prefer the bracket form df3['Y'].

df3.Y

A   -0.376519
B   -0.031160
C    0.187125
D    0.961458
E   -1.046780
Name: Y, dtype: float64

#### DataFrame Columns are just Series

In [340]:
type(df3['Y'])

pandas.core.series.Series

In [341]:
df3[['Y']]

Unnamed: 0,Y
A,0.140029
B,0.15616
C,1.271608
D,0.038347
E,0.379886


In [342]:
type(df3[['Y']])

pandas.core.frame.DataFrame

In [9]:
# Pass a list of column names

df3[['Z','X']]

Unnamed: 0,Z,X
A,0.230336,2.084019
B,1.939932,1.035125
C,-0.732845,-0.74179
D,-2.141212,1.482495
E,1.292765,1.192241


In [10]:
sam_list = ['Z','X', 'Y']

In [11]:
df3[sam_list]

Unnamed: 0,Z,X,Y
A,0.230336,2.084019,-0.376519
B,1.939932,1.035125,-0.03116
C,-0.732845,-0.74179,0.187125
D,-2.141212,1.482495,0.961458
E,1.292765,1.192241,-1.04678


In [330]:
# df3['Z','X'] raises a KeyError: without the inner list, the tuple ('Z', 'X') is
# looked up as ONE column label. Use a list of names instead: df3[['Z', 'X']]

Unnamed: 0_level_0,Z,X
newidx,Unnamed: 1_level_1,Unnamed: 2_level_1
CA,0.0,0.0
NY,0.0,0.0
WY,-0.589001,0.740122
OR,0.0,0.0
CO,0.683509,1.978757


In [346]:
df3["X":"Z"] 

Unnamed: 0,W,X,Y,Z


In [347]:
df3

Unnamed: 0,W,X,Y,Z
A,0.74556,-0.298864,0.140029,1.094066
B,-0.946043,0.065646,0.15616,-0.643741
C,-0.538651,0.386578,1.271608,-1.666574
D,0.605344,-1.525565,0.038347,0.093195
E,0.337926,0.087206,0.379886,0.966958


In [348]:
df3['B':'D']

Unnamed: 0,W,X,Y,Z
B,-0.946043,0.065646,0.15616,-0.643741
C,-0.538651,0.386578,1.271608,-1.666574
D,0.605344,-1.525565,0.038347,0.093195


In [None]:
#df3['C','D'] gives error

In [349]:
df3["A":"C"][["Y", "Z"]]

Unnamed: 0,Y,Z
A,0.140029,1.094066
B,0.15616,-0.643741
C,1.271608,-1.666574


**Creating a new column:**

In [96]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [97]:
# feature engineering

df3['X*Y'] = df3['X'] * df3['Y']
df3

Unnamed: 0,W,X,Y,Z,X*Y
A,2.70685,0.628133,0.907969,0.503826,0.570325
B,0.651118,-0.319318,-0.848077,0.605965,0.270806
C,-2.018168,0.740122,0.528813,-0.589001,0.391387
D,0.188695,-0.758872,-0.933237,0.955057,0.708208
E,0.190794,1.978757,2.605967,0.683509,5.156577


In [98]:
df3["T"] = [1, 2, 3, 4, 5]
df3

Unnamed: 0,W,X,Y,Z,X*Y,T
A,2.70685,0.628133,0.907969,0.503826,0.570325,1
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4
E,0.190794,1.978757,2.605967,0.683509,5.156577,5


### [Removing Columns & Rows](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html)

 - ### Removing Columns

In [99]:
df3.drop('X*Y', axis=1)

Unnamed: 0,W,X,Y,Z,T
A,2.70685,0.628133,0.907969,0.503826,1
B,0.651118,-0.319318,-0.848077,0.605965,2
C,-2.018168,0.740122,0.528813,-0.589001,3
D,0.188695,-0.758872,-0.933237,0.955057,4
E,0.190794,1.978757,2.605967,0.683509,5


In [100]:
df3

Unnamed: 0,W,X,Y,Z,X*Y,T
A,2.70685,0.628133,0.907969,0.503826,0.570325,1
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4
E,0.190794,1.978757,2.605967,0.683509,5.156577,5


In [101]:
df3.drop(["X*Y", "T"], axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [102]:
df3

Unnamed: 0,W,X,Y,Z,X*Y,T
A,2.70685,0.628133,0.907969,0.503826,0.570325,1
B,0.651118,-0.319318,-0.848077,0.605965,0.270806,2
C,-2.018168,0.740122,0.528813,-0.589001,0.391387,3
D,0.188695,-0.758872,-0.933237,0.955057,0.708208,4
E,0.190794,1.978757,2.605967,0.683509,5.156577,5


In [103]:
# Not inplace unless specified!

df3.drop(["X*Y", "T"], axis=1, inplace=True)

In [104]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


 - ### Removing rows

In [105]:
df3.drop('C', axis=0)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [106]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [107]:
# the default value of axis is 0 (axis = 0)

df4 = df3.drop('C', axis=0)
df4

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [108]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### Selecting Rows

- ### First, let's take a quick look at [`.loc[]`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html) | [`.iloc[]`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html)

#### `.loc[]` → allows us to select data using **labels** (names) of rows (index) & columns

#### `.iloc[]` → allows us to select data using **integer positions** of rows (index) & columns. It follows classic Python/NumPy indexing logic (positions start at 0 and the end of a slice is excluded)

In [12]:
m = np.random.randint(1, 40, size=(8, 4))

df4 = pd.DataFrame(m, columns = ["var1", "var2", "var3", 'var4'])
df4

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,var1,var2,var3,var4
0,12,28,34,25
1,37,39,10,4
2,2,16,5,5
3,37,19,15,29
4,18,24,17,7
5,8,21,11,33
6,24,23,22,27
7,13,3,18,2


In [13]:
# df4.index = np.arange(1, len(df4)+1)
# df4

<IPython.core.display.Javascript object>

Unnamed: 0,var1,var2,var3,var4
1,12,28,34,25
2,37,39,10,4
3,2,16,5,5
4,37,19,15,29
5,18,24,17,7
6,8,21,11,33
7,24,23,22,27
8,13,3,18,2


In [117]:
# df4.index = pd.RangeIndex(120, 120 + len(df4))
# df4

Unnamed: 0,var1,var2,var3,var4
120,8,11,39,10
121,19,8,16,1
122,13,18,12,16
123,34,30,25,37
124,20,36,31,11
125,21,28,9,23
126,27,24,38,23
127,10,3,19,29


In [14]:
# df4.index = pd.RangeIndex(start=0, stop=0+len(df4), step=1)
# df4

<IPython.core.display.Javascript object>

Unnamed: 0,var1,var2,var3,var4
0,12,28,34,25
1,37,39,10,4
2,2,16,5,5
3,37,19,15,29
4,18,24,17,7
5,8,21,11,33
6,24,23,22,27
7,13,3,18,2


In [122]:
df4.loc[4]

var1    20
var2    36
var3    31
var4    11
Name: 4, dtype: int32

In [123]:
df4.loc[[4]]

Unnamed: 0,var1,var2,var3,var4
4,20,36,31,11


In [15]:
# Slicing returns the same type as the sliced object. Here, a DataFrame.
# Note: .loc slices by label and includes BOTH endpoints (rows 2, 3, 4 and 5).

df4.loc[2:5]

Unnamed: 0,var1,var2,var3,var4
2,2,16,5,5
3,37,19,15,29
4,18,24,17,7
5,8,21,11,33


In [16]:
df4.iloc[2:5]

Unnamed: 0,var1,var2,var3,var4
2,2,16,5,5
3,37,19,15,29
4,18,24,17,7


In [17]:
df4

Unnamed: 0,var1,var2,var3,var4
0,12,28,34,25
1,37,39,10,4
2,2,16,5,5
3,37,19,15,29
4,18,24,17,7
5,8,21,11,33
6,24,23,22,27
7,13,3,18,2


In [18]:
df4.index = 'a b c d e f g h'.split()
df4

Unnamed: 0,var1,var2,var3,var4
a,12,28,34,25
b,37,39,10,4
c,2,16,5,5
d,37,19,15,29
e,18,24,17,7
f,8,21,11,33
g,24,23,22,27
h,13,3,18,2


In [19]:
df4.iloc[[1]]

Unnamed: 0,var1,var2,var3,var4
b,37,39,10,4


In [21]:
df4.loc[['b']]

Unnamed: 0,var1,var2,var3,var4
b,37,39,10,4


In [20]:
df4.iloc[1:4]

Unnamed: 0,var1,var2,var3,var4
b,37,39,10,4
c,2,16,5,5
d,37,19,15,29


In [None]:
#df4.loc[1:4] gives error

In [129]:
df4.loc['c':'g']

Unnamed: 0,var1,var2,var3,var4
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23


In [130]:
df4

Unnamed: 0,var1,var2,var3,var4
a,8,11,39,10
b,19,8,16,1
c,13,18,12,16
d,34,30,25,37
e,20,36,31,11
f,21,28,9,23
g,27,24,38,23
h,10,3,19,29


In [133]:
df4.iloc[4, 1]

36

In [136]:
df4.loc['d':'g', 'var3']

d    25
e    31
f     9
g    38
Name: var3, dtype: int32

In [139]:
df4.loc['d':'g']['var3']

d    25
e    31
f     9
g    38
Name: var3, dtype: int32

In [140]:
# how can we select these data as a DataFrame not a series

df4.loc['d':'g'][['var3']]

Unnamed: 0,var3
d,25
e,31
f,9
g,38


In [141]:
df4.loc['d':'g', ["var3"]]

Unnamed: 0,var3
d,25
e,31
f,9
g,38


In [24]:
df4.iloc[2:5, 2]

c     5
d    15
e    17
Name: var3, dtype: int32

In [23]:
df4.iloc[2:5, [2]]

Unnamed: 0,var3
c,5
d,15
e,17


In [144]:
df4.iloc[2:5][['var3']]

Unnamed: 0,var3
c,12
d,25
e,31


#### Let's continue to examine `.loc[]` and `.iloc[]` using ``df3`` again

In [145]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [27]:
df3.loc['C']

W   -1.005187
X   -0.741790
Y    0.187125
Z   -0.732845
Name: C, dtype: float64

Or select by position instead of by label:

In [28]:
df3.iloc[2]

W   -1.005187
X   -0.741790
Y    0.187125
Z   -0.732845
Name: C, dtype: float64

In [149]:
# returns as a DataFrame

df3.loc[['C']]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [150]:
# returns as a DataFrame

df3.iloc[[2]]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [151]:
# Well, how can we select entire column "Y" using ".iloc[]"

df3.iloc[:, 2]

A    0.907969
B   -0.848077
C    0.528813
D   -0.933237
E    2.605967
Name: Y, dtype: float64

In [152]:
df3.iloc[:, [2]]

Unnamed: 0,Y
A,0.907969
B,-0.848077
C,0.528813
D,-0.933237
E,2.605967


In [158]:
df3[["Y"]]

Unnamed: 0,Y
A,0.907969
B,-0.848077
C,0.528813
D,-0.933237
E,2.605967


In [159]:
df3[['Y', 'X']]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
C,0.528813,0.740122
D,-0.933237,-0.758872
E,2.605967,1.978757


### Selecting subset of rows and columns

 - ### `.loc[[row labels|names], [column labels|names]]`

 - ### `.iloc[[row index numbers], [column index numbers]]`

In [29]:
df3

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
B,0.681209,1.035125,-0.03116,1.939932
C,-1.005187,-0.74179,0.187125,-0.732845
D,-1.38292,1.482495,0.961458,-2.141212
E,0.992573,1.192241,-1.04678,1.292765


In [30]:
df3.loc['C','Z']

-0.7328451475428807

In [31]:
# let's select the same data as a DataFrame

df3.loc[['C'],['Z']]

Unnamed: 0,Z
C,-0.732845


In [33]:
df3.loc[['A','C'], ['W','Z']]

Unnamed: 0,W,Z
A,0.38603,0.230336
C,-1.005187,-0.732845


In [34]:
df3.iloc[[0,2], [0,3]]

Unnamed: 0,W,Z
A,0.38603,0.230336
C,-1.005187,-0.732845


### Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [169]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [170]:
# Element-wise comparison: returns a DataFrame of booleans with the same shape as df3

df3 > 0.5

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,False,False,False,True
E,False,True,True,True


In [171]:
df3[df3 > 0.5]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,,,,0.955057
E,,1.978757,2.605967,0.683509


In [172]:
# A boolean Series used as a mask filters rows: only the rows where the
# condition on column 'Z' is True are returned, with all of their columns.

df3[df3['Z'] > 0.5]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [173]:
df3[['Z']]

Unnamed: 0,Z
A,0.503826
B,0.605965
C,-0.589001
D,0.955057
E,0.683509


In [174]:
df3

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [175]:
df3[df3['X'] < 1][['W']]

Unnamed: 0,W
A,2.70685
B,0.651118
C,-2.018168
D,0.188695


In [176]:
# how can we select the data as a DataFrame

In [177]:
df3[df3['Y'] > 0][['Z','W','Y']]

Unnamed: 0,Z,W,Y
A,0.503826,2.70685,0.907969
C,-0.589001,-2.018168,0.528813
E,0.683509,0.190794,2.605967


#### To combine conditions, use **|** (`or`) and **&** (`and`), wrapping each condition in parentheses:

In [35]:
df3

Unnamed: 0,W,X,Y,Z
A,0.38603,2.084019,-0.376519,0.230336
B,0.681209,1.035125,-0.03116,1.939932
C,-1.005187,-0.74179,0.187125,-0.732845
D,-1.38292,1.482495,0.961458,-2.141212
E,0.992573,1.192241,-1.04678,1.292765


In [36]:
# Assignment through a boolean row mask: every column of each row where
# W > 0 and Y < 1 is overwritten with 0. This modifies df3 in place.
df3[(df3['W'] > 0) & (df3['Y'] < 1)] = 0

In [37]:
df3

Unnamed: 0,W,X,Y,Z
A,0.0,0.0,0.0,0.0
B,0.0,0.0,0.0,0.0
C,-1.005187,-0.74179,0.187125,-0.732845
D,-1.38292,1.482495,0.961458,-2.141212
E,0.0,0.0,0.0,0.0


#### Conditional selection using ``.loc[]`` and ``.iloc[]``

In [181]:
df3

Unnamed: 0,W,X,Y,Z
A,0.0,0.0,0.0,0.0
B,0.0,0.0,0.0,0.0
C,-2.018168,0.740122,0.528813,-0.589001
D,0.0,0.0,0.0,0.0
E,0.190794,1.978757,2.605967,0.683509


In [182]:
df3.loc[(df3.X>0), ['X','Z']]

Unnamed: 0,X,Z
C,0.740122,-0.589001
E,1.978757,0.683509


In [183]:
df3.loc[((df3.W > 1) | (df3.Y < 1)), ['Y','Z']]

Unnamed: 0,Y,Z
A,0.0,0.0
B,0.0,0.0
C,0.528813,-0.589001
D,0.0,0.0


## More Index Details

Let's discuss some more features of indexing, including resetting the index or setting it to something else. We'll also talk about index hierarchy!

In [184]:
df3

Unnamed: 0,W,X,Y,Z
A,0.0,0.0,0.0,0.0
B,0.0,0.0,0.0,0.0
C,-2.018168,0.740122,0.528813,-0.589001
D,0.0,0.0,0.0,0.0
E,0.190794,1.978757,2.605967,0.683509


In [185]:
# Reset to default 0,1...n index

df3.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,0.0,0.0,0.0,0.0
1,B,0.0,0.0,0.0,0.0
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.0,0.0,0.0,0.0
4,E,0.190794,1.978757,2.605967,0.683509


In [186]:
df3

Unnamed: 0,W,X,Y,Z
A,0.0,0.0,0.0,0.0
B,0.0,0.0,0.0,0.0
C,-2.018168,0.740122,0.528813,-0.589001
D,0.0,0.0,0.0,0.0
E,0.190794,1.978757,2.605967,0.683509


In [187]:
df3.reset_index(drop=True)

Unnamed: 0,W,X,Y,Z
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,-2.018168,0.740122,0.528813,-0.589001
3,0.0,0.0,0.0,0.0
4,0.190794,1.978757,2.605967,0.683509


In [188]:
newindx = 'CA NY WY OR CO'.split()
newindx

['CA', 'NY', 'WY', 'OR', 'CO']

In [189]:
df3['newidx'] = newindx

In [191]:
df3

Unnamed: 0,W,X,Y,Z,newidx
A,0.0,0.0,0.0,0.0,CA
B,0.0,0.0,0.0,0.0,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.0,0.0,0.0,0.0,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [192]:
df3.set_index('newidx')

Unnamed: 0_level_0,W,X,Y,Z
newidx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,0.0,0.0,0.0,0.0
NY,0.0,0.0,0.0,0.0
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.0,0.0,0.0,0.0
CO,0.190794,1.978757,2.605967,0.683509


In [193]:
df3

Unnamed: 0,W,X,Y,Z,newidx
A,0.0,0.0,0.0,0.0,CA
B,0.0,0.0,0.0,0.0,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.0,0.0,0.0,0.0,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [194]:
df3.set_index('newidx', inplace=True)

In [195]:
df3

Unnamed: 0_level_0,W,X,Y,Z
newidx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,0.0,0.0,0.0,0.0
NY,0.0,0.0,0.0,0.0
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.0,0.0,0.0,0.0
CO,0.190794,1.978757,2.605967,0.683509


In [196]:
df3.reset_index()

Unnamed: 0,newidx,W,X,Y,Z
0,CA,0.0,0.0,0.0,0.0
1,NY,0.0,0.0,0.0,0.0
2,WY,-2.018168,0.740122,0.528813,-0.589001
3,OR,0.0,0.0,0.0,0.0
4,CO,0.190794,1.978757,2.605967,0.683509


In [197]:
df3

Unnamed: 0_level_0,W,X,Y,Z
newidx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,0.0,0.0,0.0,0.0
NY,0.0,0.0,0.0,0.0
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.0,0.0,0.0,0.0
CO,0.190794,1.978757,2.605967,0.683509


## Multi-Index and Index Hierarchy

Let us go over how to work with a Multi-Index. First, we'll create a quick example of what a Multi-Indexed DataFrame looks like:

In [258]:
# Index levels: two outer groups, three inner numbers per group (each repeated
# twice), and one unique letter per row. Zipping them gives one tuple per row.

outside = ['M1'] * 6 + ['M2'] * 6
inside = [1, 1, 2, 2, 3, 3] * 2
third = list('abcdefghijkl')
multi_index = [*zip(outside, inside, third)]
multi_index

[('M1', 1, 'a'),
 ('M1', 1, 'b'),
 ('M1', 2, 'c'),
 ('M1', 2, 'd'),
 ('M1', 3, 'e'),
 ('M1', 3, 'f'),
 ('M2', 1, 'g'),
 ('M2', 1, 'h'),
 ('M2', 2, 'i'),
 ('M2', 2, 'j'),
 ('M2', 3, 'k'),
 ('M2', 3, 'l')]

In [272]:
zip(outside, inside, third)

<zip at 0x2dd9a41f880>

In [273]:
list(zip(outside, inside, third))

[('M1', 1, 'a'),
 ('M1', 1, 'b'),
 ('M1', 2, 'c'),
 ('M1', 2, 'd'),
 ('M1', 3, 'e'),
 ('M1', 3, 'f'),
 ('M2', 1, 'g'),
 ('M2', 1, 'h'),
 ('M2', 2, 'i'),
 ('M2', 2, 'j'),
 ('M2', 3, 'k'),
 ('M2', 3, 'l')]

In [259]:
hier_index = pd.MultiIndex.from_tuples(multi_index)

In [260]:
hier_index

MultiIndex([('M1', 1, 'a'),
            ('M1', 1, 'b'),
            ('M1', 2, 'c'),
            ('M1', 2, 'd'),
            ('M1', 3, 'e'),
            ('M1', 3, 'f'),
            ('M2', 1, 'g'),
            ('M2', 1, 'h'),
            ('M2', 2, 'i'),
            ('M2', 2, 'j'),
            ('M2', 3, 'k'),
            ('M2', 3, 'l')],
           )

In [262]:
df5 = pd.DataFrame(np.random.randn(12, 4), index = hier_index, columns=['A', 'B', 'C', 'D'])
df5

Unnamed: 0,Unnamed: 1,Unnamed: 2,A,B,C,D
M1,1,a,-0.266598,0.288172,-0.186946,-0.07285
M1,1,b,0.360293,-0.253136,1.424846,-1.148209
M1,2,c,-1.745976,-0.851874,-0.148627,0.478169
M1,2,d,-2.079632,0.364785,-0.389643,1.054263
M1,3,e,0.193175,0.866667,1.912587,1.212039
M1,3,f,-0.828568,0.508801,1.812898,0.438464
M2,1,g,0.184212,0.088795,-0.448151,2.25707
M2,1,h,0.030853,-0.268911,2.770488,-0.573197
M2,2,i,0.014738,1.267547,0.368468,1.02288
M2,2,j,0.344081,-0.904709,0.227171,0.142235


Now let's show how to index this! For index hierarchy we use ``df.loc[]``, if this was on the columns axis, you would just use normal bracket notation ``df[]``. Calling one level of the index returns the sub-dataframe:

In [263]:
df5.loc['M1']

Unnamed: 0,Unnamed: 1,A,B,C,D
1,a,-0.266598,0.288172,-0.186946,-0.07285
1,b,0.360293,-0.253136,1.424846,-1.148209
2,c,-1.745976,-0.851874,-0.148627,0.478169
2,d,-2.079632,0.364785,-0.389643,1.054263
3,e,0.193175,0.866667,1.912587,1.212039
3,f,-0.828568,0.508801,1.812898,0.438464


In [264]:
df5.iloc[8]

A    0.014738
B    1.267547
C    0.368468
D    1.022880
Name: (M2, 2, i), dtype: float64

In [265]:
df5.loc['M1'].loc[2]

Unnamed: 0,A,B,C,D
c,-1.745976,-0.851874,-0.148627,0.478169
d,-2.079632,0.364785,-0.389643,1.054263


In [269]:
df5.loc['M1'].loc[2].loc['c']

A   -1.745976
B   -0.851874
C   -0.148627
D    0.478169
Name: c, dtype: float64

In [268]:
df5.loc['M1'].loc[2].loc[['c']]

Unnamed: 0,A,B,C,D
c,-1.745976,-0.851874,-0.148627,0.478169


In [271]:
df5.loc['M1'].loc[2].iloc[[0]]

Unnamed: 0,A,B,C,D
c,-1.745976,-0.851874,-0.148627,0.478169


All of the MultiIndex constructors accept a names argument which stores string names for the levels themselves. If no names are provided, None will be assigned:

https://pandas.pydata.org/docs/user_guide/advanced.html

In [274]:
df5.index.names

FrozenList([None, None, None])

In [275]:
df5.index.names = ['Group','Num', "class"]

In [276]:
df5

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,A,B,C,D
Group,Num,class,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M1,1,a,-0.266598,0.288172,-0.186946,-0.07285
M1,1,b,0.360293,-0.253136,1.424846,-1.148209
M1,2,c,-1.745976,-0.851874,-0.148627,0.478169
M1,2,d,-2.079632,0.364785,-0.389643,1.054263
M1,3,e,0.193175,0.866667,1.912587,1.212039
M1,3,f,-0.828568,0.508801,1.812898,0.438464
M2,1,g,0.184212,0.088795,-0.448151,2.25707
M2,1,h,0.030853,-0.268911,2.770488,-0.573197
M2,2,i,0.014738,1.267547,0.368468,1.02288
M2,2,j,0.344081,-0.904709,0.227171,0.142235


In [277]:
df5.index.names

FrozenList(['Group', 'Num', 'class'])

### Let's take a quick look at [``.xs()``](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.xs.html)

In [278]:
#This method takes a `key` argument to select data at a particular level of a MultiIndex.

df5.xs('M1')

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Num,class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,a,-0.266598,0.288172,-0.186946,-0.07285
1,b,0.360293,-0.253136,1.424846,-1.148209
2,c,-1.745976,-0.851874,-0.148627,0.478169
2,d,-2.079632,0.364785,-0.389643,1.054263
3,e,0.193175,0.866667,1.912587,1.212039
3,f,-0.828568,0.508801,1.812898,0.438464


In [279]:
df5.loc['M1']

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Num,class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,a,-0.266598,0.288172,-0.186946,-0.07285
1,b,0.360293,-0.253136,1.424846,-1.148209
2,c,-1.745976,-0.851874,-0.148627,0.478169
2,d,-2.079632,0.364785,-0.389643,1.054263
3,e,0.193175,0.866667,1.912587,1.212039
3,f,-0.828568,0.508801,1.812898,0.438464


In [280]:
df5

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,A,B,C,D
Group,Num,class,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M1,1,a,-0.266598,0.288172,-0.186946,-0.07285
M1,1,b,0.360293,-0.253136,1.424846,-1.148209
M1,2,c,-1.745976,-0.851874,-0.148627,0.478169
M1,2,d,-2.079632,0.364785,-0.389643,1.054263
M1,3,e,0.193175,0.866667,1.912587,1.212039
M1,3,f,-0.828568,0.508801,1.812898,0.438464
M2,1,g,0.184212,0.088795,-0.448151,2.25707
M2,1,h,0.030853,-0.268911,2.770488,-0.573197
M2,2,i,0.014738,1.267547,0.368468,1.02288
M2,2,j,0.344081,-0.904709,0.227171,0.142235


In [282]:
# A multi-level key must be passed as a tuple: list keys to .xs() were
# deprecated and raise a TypeError in pandas >= 2.0.
df5.xs(('M1', 2, 'c'))

A   -1.745976
B   -0.851874
C   -0.148627
D    0.478169
Name: (M1, 2, c), dtype: float64

In [283]:
# Partial key (first two levels) as a tuple; list keys are rejected by pandas >= 2.0.
# The matched levels are dropped, leaving only the 'class' level in the index.
df5.xs(('M1', 2))

Unnamed: 0_level_0,A,B,C,D
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c,-1.745976,-0.851874,-0.148627,0.478169
d,-2.079632,0.364785,-0.389643,1.054263


In [284]:
df5.xs(('M2', 1, 'g'))

A    0.184212
B    0.088795
C   -0.448151
D    2.257070
Name: (M2, 1, g), dtype: float64

In [285]:
df5.xs(('M2', 1, 'g'), level=[0, 1, 2])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,A,B,C,D
Group,Num,class,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M2,1,g,0.184212,0.088795,-0.448151,2.25707


In [290]:
df5.xs('g', level = 'class')

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M2,1,0.184212,0.088795,-0.448151,2.25707


In [292]:
df5

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,A,B,C,D
Group,Num,class,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M1,1,a,-0.266598,0.288172,-0.186946,-0.07285
M1,1,b,0.360293,-0.253136,1.424846,-1.148209
M1,2,c,-1.745976,-0.851874,-0.148627,0.478169
M1,2,d,-2.079632,0.364785,-0.389643,1.054263
M1,3,e,0.193175,0.866667,1.912587,1.212039
M1,3,f,-0.828568,0.508801,1.812898,0.438464
M2,1,g,0.184212,0.088795,-0.448151,2.25707
M2,1,h,0.030853,-0.268911,2.770488,-0.573197
M2,2,i,0.014738,1.267547,0.368468,1.02288
M2,2,j,0.344081,-0.904709,0.227171,0.142235


In [291]:
df5.xs(2, level = 'Num')

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Group,class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M1,c,-1.745976,-0.851874,-0.148627,0.478169
M1,d,-2.079632,0.364785,-0.389643,1.054263
M2,i,0.014738,1.267547,0.368468,1.02288
M2,j,0.344081,-0.904709,0.227171,0.142235


In [293]:
df5.xs('M1', level = 'Group')

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
Num,class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,a,-0.266598,0.288172,-0.186946,-0.07285
1,b,0.360293,-0.253136,1.424846,-1.148209
2,c,-1.745976,-0.851874,-0.148627,0.478169
2,d,-2.079632,0.364785,-0.389643,1.054263
3,e,0.193175,0.866667,1.912587,1.212039
3,f,-0.828568,0.508801,1.812898,0.438464


In [297]:
df5.xs('C', axis=1)

Group  Num  class
M1     1    a       -0.186946
            b        1.424846
       2    c       -0.148627
            d       -0.389643
       3    e        1.912587
            f        1.812898
M2     1    g       -0.448151
            h        2.770488
       2    i        0.368468
            j        0.227171
       3    k        0.475452
            l       -0.540079
Name: C, dtype: float64

In [296]:
# .xs() no longer accepts list keys (TypeError in pandas >= 2.0). To get column
# 'C' back as a one-column DataFrame, select it with a list of column labels.
df5.loc[:, ['C']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,C
Group,Num,class,Unnamed: 3_level_1
M1,1,a,-0.186946
M1,1,b,1.424846
M1,2,c,-0.148627
M1,2,d,-0.389643
M1,3,e,1.912587
M1,3,f,1.812898
M2,1,g,-0.448151
M2,1,h,2.770488
M2,2,i,0.368468
M2,2,j,0.227171


### Let's learn new functions/attributes/methods on "iris dataset" 

In [38]:
import seaborn as sns

In [40]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'taxis',
 'tips',
 'titanic']

In [42]:
df = sns.load_dataset('iris')
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [44]:
df.loc[((df['species'] == 'setosa') & (df['sepal_length'] > 5)), ['sepal_length','sepal_width']]

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
5,5.4,3.9
10,5.4,3.7
14,5.8,4.0
15,5.7,4.4
16,5.4,3.9
17,5.1,3.5
18,5.7,3.8
19,5.1,3.8
20,5.4,3.4


In [300]:
df.shape

(150, 5)

In [301]:
df.ndim

2

In [302]:
df.size

750

In [303]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [304]:
df.sample(4)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
33,5.5,4.2,1.4,0.2,setosa
135,7.7,3.0,6.1,2.3,virginica
132,6.4,2.8,5.6,2.2,virginica
136,6.3,3.4,5.6,2.4,virginica


In [305]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [306]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [309]:
df.species.value_counts()

virginica     50
versicolor    50
setosa        50
Name: species, dtype: int64

In [310]:
df['species'].value_counts()

virginica     50
versicolor    50
setosa        50
Name: species, dtype: int64

In [311]:
# Column-wise mean of the numeric columns; numeric_only=True skips the
# string column 'species', which cannot be averaged.
df.mean(numeric_only=True)

sepal_length    5.843333
sepal_width     3.057333
petal_length    3.758000
petal_width     1.199333
dtype: float64

In [313]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [312]:
df.sum(axis=0)

sepal_length                                                876.5
sepal_width                                                 458.6
petal_length                                                563.7
petal_width                                                 179.9
species         setosasetosasetosasetosasetosasetosasetosaseto...
dtype: object

In [None]:
# Row-wise sum of the four measurements. numeric_only=True is required because
# the string column 'species' cannot be added to floats (TypeError in pandas >= 2.0).
df.sum(axis=1, numeric_only=True)

0      10.2
1       9.5
2       9.4
3       9.4
4      10.2
       ... 
145    17.2
146    15.7
147    16.7
148    17.3
149    15.8
Length: 150, dtype: float64

In [314]:
df.sepal_length.sum()

876.5

In [315]:
df.species.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [316]:
df.isnull()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
145,False,False,False,False,False
146,False,False,False,False,False
147,False,False,False,False,False
148,False,False,False,False,False


In [317]:
df.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [318]:
len(df)

150

In [319]:
df.head(9)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa


In [320]:
df.iloc[0:6, 0:]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa


In [322]:
df.loc[0:6, :]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa


In [323]:
df.drop('species', axis=1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [324]:
df[(df.sepal_length > 5) & (df.sepal_width > 3)].head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
10,5.4,3.7,1.5,0.2,setosa
14,5.8,4.0,1.2,0.2,setosa
15,5.7,4.4,1.5,0.4,setosa


In [None]:
df[(df.sepal_length>5) | (df.sepal_width>3)].tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [325]:
df.sort_values(by = 'species', ascending = True)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
27,5.2,3.5,1.5,0.2,setosa
28,5.2,3.4,1.4,0.2,setosa
29,4.7,3.2,1.6,0.2,setosa
30,4.8,3.1,1.6,0.2,setosa
...,...,...,...,...,...
119,6.0,2.2,5.0,1.5,virginica
120,6.9,3.2,5.7,2.3,virginica
121,5.6,2.8,4.9,2.0,virginica
111,6.4,2.7,5.3,1.9,virginica


# End of the Session