___

<p style="text-align: center;"><img src="https://docs.google.com/uc?id=1lY0Uj5R04yMY3-ZppPWxqCr5pvBLYPnV" class="img-fluid" alt="CLRSWY"></p>

___

<h1><p style="text-align: center;">Pandas Lesson, Session - 4</p><h1>
    

# Data Frames

 - ### ``DataFrames`` are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index. Let's use pandas to explore this topic!

In [1]:
import pandas as pd
import numpy as np

 - ### Creating a DataFrame using the ``list``s of data and columns

In [3]:
datas=[1,3,5,7,9,10]
datas

[1, 3, 5, 7, 9, 10]

 - ### Creating a DataFrame using a ``NumPy Arrays``

In [5]:
pd.DataFrame(datas,columns=['Column1'])

Unnamed: 0,Column1
0,1
1,3
2,5
3,7
4,9
5,10


In [9]:
m=np.arange(1,24,2).reshape(3,4)
m

array([[ 1,  3,  5,  7],
       [ 9, 11, 13, 15],
       [17, 19, 21, 23]])

In [17]:
df = pd.DataFrame(m,columns=['Col1','Col2','Col3','Col4'])
df

Unnamed: 0,Col1,Col2,Col3,Col4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [18]:
df.head()

Unnamed: 0,Col1,Col2,Col3,Col4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [20]:
df.tail(2)

Unnamed: 0,Col1,Col2,Col3,Col4
1,9,11,13,15
2,17,19,21,23


In [21]:
df.columns

Index(['Col1', 'Col2', 'Col3', 'Col4'], dtype='object')

In [22]:
for i in df.columns:
    print(i)

Col1
Col2
Col3
Col4


In [33]:
df.columns=['varr1','varr2','varr3','varr4']
df

Unnamed: 0,varr1,varr2,varr3,varr4
0,1,3,5,7
1,9,11,13,15
2,17,19,21,23


In [24]:
type(df)

pandas.core.frame.DataFrame

In [25]:
df.shape

(3, 4)

In [26]:
df.shape[0]

3

In [27]:
df.size

12

In [28]:
df.values

array([[ 1,  3,  5,  7],
       [ 9, 11, 13, 15],
       [17, 19, 21, 23]])

In [29]:
type(df.values)

numpy.ndarray

 - ### Creating a DataFrame using a ``dict``

In [30]:
s1 = np.random.randint(2, 10, size = 4)
s2 = np.random.randint(3, 10, size = 4)
s3 = np.random.randint(4, 15, size = 4)

In [32]:
print(s1,s2,s3)

[5 6 5 8] [4 4 8 5] [13 11 13 13]


In [34]:
myDict={'var1':s1,'var2':s2,'var3':s3}
myDict

{'var1': array([5, 6, 5, 8]),
 'var2': array([4, 4, 8, 5]),
 'var3': array([13, 11, 13, 13])}

In [36]:
df1 = pd.DataFrame(myDict)
df1

Unnamed: 0,var1,var2,var3
0,5,4,13
1,6,4,11
2,5,8,13
3,8,5,13


In [37]:
df1.index

RangeIndex(start=0, stop=4, step=1)

In [38]:
[i for i in df1.index]

[0, 1, 2, 3]

In [39]:
print(* df1.index)

0 1 2 3


In [40]:
['a','b','c','d']

['a', 'b', 'c', 'd']

In [41]:
df1.index=['a', 'b', 'c', 'd']

In [42]:
print(* df1.index)

a b c d


In [43]:
df1

Unnamed: 0,var1,var2,var3
a,5,4,13
b,6,4,11
c,5,8,13
d,8,5,13


In [44]:
'var1' in df1

True

In [45]:
'var5' in df1

False

### Now, let's examine again the ***indexing, selection*** and ***slicing*** methods and several ***attributes*** using a different DataFrame

In [46]:
from numpy.random import randn
np.random.seed(101)

In [48]:
df3 = pd.DataFrame(randn(5, 4),\
    index = 'A B C D E'.split(), columns = 'W X Y Z'.split())

In [49]:
df3

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [50]:
'A B C D E'.split()

['A', 'B', 'C', 'D', 'E']

In [51]:
pd.DataFrame(data=randn(5, 4),\
    columns='w x y z'.split(), index='a b c d e'.split())

Unnamed: 0,w,x,y,z
a,-0.993263,0.1968,-1.136645,0.000366
b,1.025984,-0.156598,-0.031579,0.649826
c,2.154846,-0.610259,-0.755325,-0.346419
d,0.147027,-0.479448,0.558769,1.02481
e,-0.925874,1.862864,-1.133817,0.610478


In [52]:
pd.DataFrame(randn(5, 4), 'a b c d e'.split(), 'w x y z'.split())

Unnamed: 0,w,x,y,z
a,0.38603,2.084019,-0.376519,0.230336
b,0.681209,1.035125,-0.03116,1.939932
c,-1.005187,-0.74179,0.187125,-0.732845
d,-1.38292,1.482495,0.961458,-2.141212
e,0.992573,1.192241,-1.04678,1.292765


## Selection and Indexing

Let's learn the various methods to grab data from a DataFrame

In [53]:
df3

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [54]:
df3['Y']

A   -1.706086
B    0.166905
C    0.638787
D   -0.943406
E    0.238127
Name: Y, dtype: float64

In [55]:
type(df3['Y'])

pandas.core.series.Series

In [56]:
df3[['Y']]

Unnamed: 0,Y
A,-1.706086
B,0.166905
C,0.638787
D,-0.943406
E,0.238127


In [57]:
df3.Y

A   -1.706086
B    0.166905
C    0.638787
D   -0.943406
E    0.238127
Name: Y, dtype: float64

In [58]:
type(df3.Y)

pandas.core.series.Series

In [59]:
df3.T

Unnamed: 0,A,B,C,D,E
W,0.302665,-0.134841,0.807706,-0.497104,-0.116773
X,1.693723,0.390528,0.07296,-0.75407,1.901755
Y,-1.706086,0.166905,0.638787,-0.943406,0.238127
Z,-1.159119,0.184502,0.329646,0.484752,1.996652


In [60]:
df3['Y','Z'] # dataframe oldugu icin iki parantez gerekir

KeyError: ('Y', 'Z')

In [62]:
df3[['Y','Z']]

Unnamed: 0,Y,Z
A,-1.706086,-1.159119
B,0.166905,0.184502
C,0.638787,0.329646
D,-0.943406,0.484752
E,0.238127,1.996652


In [63]:
sample_list = ['Y','Z']

In [65]:
df3[sample_list]

Unnamed: 0,Y,Z
A,-1.706086,-1.159119
B,0.166905,0.184502
C,0.638787,0.329646
D,-0.943406,0.484752
E,0.238127,1.996652


df3

In [66]:
df3

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [67]:
df3['W':'Z']

Unnamed: 0,W,X,Y,Z


In [69]:
df3['A':'B']

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502


In [72]:
df3[['W','X']]

Unnamed: 0,W,X
A,0.302665,1.693723
B,-0.134841,0.390528
C,0.807706,0.07296
D,-0.497104,-0.75407
E,-0.116773,1.901755


In [74]:
df3['A':'C'][['W','Y']]

Unnamed: 0,W,Y
A,0.302665,-1.706086
B,-0.134841,0.166905
C,0.807706,0.638787


#### DataFrame Columns are just Series

In [75]:
df3

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [76]:
df3['X'] * df3['Y']

A   -2.889637
B    0.065181
C    0.046606
D    0.711394
E    0.452859
dtype: float64

In [77]:
df3['Multiply'] = df3['X'] * df3['Y']
df3

Unnamed: 0,W,X,Y,Z,Multiply
A,0.302665,1.693723,-1.706086,-1.159119,-2.889637
B,-0.134841,0.390528,0.166905,0.184502,0.065181
C,0.807706,0.07296,0.638787,0.329646,0.046606
D,-0.497104,-0.75407,-0.943406,0.484752,0.711394
E,-0.116773,1.901755,0.238127,1.996652,0.452859


In [91]:
df3['sample'] = [1,2,3,4,5]
df3

Unnamed: 0,W,X,Y,Z,Multiply,sample
A,0.302665,1.693723,-1.706086,-1.159119,-2.889637,1
B,-0.134841,0.390528,0.166905,0.184502,0.065181,2
C,0.807706,0.07296,0.638787,0.329646,0.046606,3
D,-0.497104,-0.75407,-0.943406,0.484752,0.711394,4
E,-0.116773,1.901755,0.238127,1.996652,0.452859,5


In [81]:
df3['empty']=""
df3

Unnamed: 0,W,X,Y,Z,Multiply,sample,empty
A,0.302665,1.693723,-1.706086,-1.159119,-2.889637,1,
B,-0.134841,0.390528,0.166905,0.184502,0.065181,2,
C,0.807706,0.07296,0.638787,0.329646,0.046606,3,
D,-0.497104,-0.75407,-0.943406,0.484752,0.711394,4,
E,-0.116773,1.901755,0.238127,1.996652,0.452859,5,


**Creating a new column:**

### [Removing Columns & Rows](http://localhost:8888/notebooks/pythonic/DAwPythonSessions/w3resource-pandas-dataframe-drop.ipynb)

 - ### Removing Columns

In [83]:
df3.drop('empty') # axis 0 da olmadigi icin hatali

KeyError: "['empty'] not found in axis"

In [89]:
df3 = df3.drop('empty',axis=1)
df3

Unnamed: 0,W,X,Y,Z,Multiply
A,0.302665,1.693723,-1.706086,-1.159119,-2.889637
B,-0.134841,0.390528,0.166905,0.184502,0.065181
C,0.807706,0.07296,0.638787,0.329646,0.046606
D,-0.497104,-0.75407,-0.943406,0.484752,0.711394
E,-0.116773,1.901755,0.238127,1.996652,0.452859


In [90]:
#df3.drop('sample',axis=1, inplace = True)
df3

Unnamed: 0,W,X,Y,Z,Multiply
A,0.302665,1.693723,-1.706086,-1.159119,-2.889637
B,-0.134841,0.390528,0.166905,0.184502,0.065181
C,0.807706,0.07296,0.638787,0.329646,0.046606
D,-0.497104,-0.75407,-0.943406,0.484752,0.711394
E,-0.116773,1.901755,0.238127,1.996652,0.452859


In [92]:
df3.drop(['Multiply','sample'],axis=1, inplace = True)
df3

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


 - ### Removing rows

In [93]:
df3

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [95]:
df3.drop('C') # axis=0 default


Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [97]:
df4 = df3.drop('C',axis=0) 
df4

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [98]:
df3

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


### Selecting Rows

- ### First, let's take a quick look at [`.loc[]`](http://localhost:8888/notebooks/pythonic/DAwPythonSessions/w3resource-pandas-dataframe-loc.ipynb) | [`.iloc[]`](http://localhost:8888/notebooks/pythonic/DAwPythonSessions/w3resource-pandas-dataframe-iloc.ipynb)

#### `.loc[]` → allows us to select data using **labels** (names) of rows (index) & columns

#### `.iloc[]` → allows us to select data using **index numbers** of rows (index) & columns. it's like classical indexing logic

In [100]:
m = np.random.randint(1, 40, size=(8, 4))
df4 = pd.DataFrame(m, columns = ["var1", "var2", "var3", 'var4'])
df4

Unnamed: 0,var1,var2,var3,var4
0,26,3,17,3
1,25,16,30,7
2,29,23,21,26
3,34,8,9,17
4,30,1,17,39
5,36,26,32,12
6,9,12,13,16
7,23,38,3,39


In [102]:
df4.loc[4]

var1    30
var2     1
var3    17
var4    39
Name: 4, dtype: int32

In [103]:
df4.iloc[2:5]

Unnamed: 0,var1,var2,var3,var4
2,29,23,21,26
3,34,8,9,17
4,30,1,17,39


In [106]:
df4.index='a b c d e f g h'.split()
df4

Unnamed: 0,var1,var2,var3,var4
a,26,3,17,3
b,25,16,30,7
c,29,23,21,26
d,34,8,9,17
e,30,1,17,39
f,36,26,32,12
g,9,12,13,16
h,23,38,3,39


In [107]:
df4.iloc[[1]]

Unnamed: 0,var1,var2,var3,var4
b,25,16,30,7


In [109]:
df4.loc['b']

var1    25
var2    16
var3    30
var4     7
Name: b, dtype: int32

In [110]:
df4.loc[['b']]

Unnamed: 0,var1,var2,var3,var4
b,25,16,30,7


In [112]:
df4

Unnamed: 0,var1,var2,var3,var4
a,26,3,17,3
b,25,16,30,7
c,29,23,21,26
d,34,8,9,17
e,30,1,17,39
f,36,26,32,12
g,9,12,13,16
h,23,38,3,39


In [111]:
df4.iloc[4,1]

1

In [113]:
df4.loc['d':'g']

Unnamed: 0,var1,var2,var3,var4
d,34,8,9,17
e,30,1,17,39
f,36,26,32,12
g,9,12,13,16


In [116]:
df4.loc['d':'g'][['var3']]

Unnamed: 0,var3
d,9
e,17
f,32
g,13


In [119]:
df4.iloc[2:5,2]

c    21
d     9
e    17
Name: var3, dtype: int32

In [120]:
df4.iloc[2:5,[2]]

Unnamed: 0,var3
c,21
d,9
e,17


#### Let's continue to examine `.loc[]` and `.iloc[]` using ``df3`` again

In [121]:
df3

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [122]:
df3.loc['C']

W    0.807706
X    0.072960
Y    0.638787
Z    0.329646
Name: C, dtype: float64

In [123]:
df3.iloc[2]

W    0.807706
X    0.072960
Y    0.638787
Z    0.329646
Name: C, dtype: float64

In [125]:
df3.loc[['C']]

Unnamed: 0,W,X,Y,Z
C,0.807706,0.07296,0.638787,0.329646


In [126]:
df3.iloc[[2]]

Unnamed: 0,W,X,Y,Z
C,0.807706,0.07296,0.638787,0.329646


In [129]:
df3.iloc[:,[2]]

Unnamed: 0,Y
A,-1.706086
B,0.166905
C,0.638787
D,-0.943406
E,0.238127


### Selecting subset of rows and columns

 - ### `.loc[[row labels|names], [column labels|names]]`

 - ### `.iloc[[row index numbers], [column index numbers]]`

In [130]:
df3

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [None]:
df3.loc['C']

In [131]:
df3.loc['C','Z']

0.32964629880452445

In [132]:
df3.loc[['C'],['Z']]

Unnamed: 0,Z
C,0.329646


In [133]:
df3.loc[['C','A'],['Z','W']]

Unnamed: 0,Z,W
C,0.329646,0.807706
A,-1.159119,0.302665


In [137]:
df3.iloc[[0,2],[0,3]]

Unnamed: 0,W,Z
A,0.302665,-1.159119
C,0.807706,0.329646


In [138]:
df3.iloc[[2,0],[3,0]]

Unnamed: 0,Z,W
C,0.329646,0.807706
A,-1.159119,0.302665


### Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [139]:
df3

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [140]:
df3>0.5

Unnamed: 0,W,X,Y,Z
A,False,True,False,False
B,False,False,False,False
C,True,False,True,False
D,False,False,False,False
E,False,True,False,True


In [141]:
df3[df3>0.5]

Unnamed: 0,W,X,Y,Z
A,,1.693723,,
B,,,,
C,0.807706,,0.638787,
D,,,,
E,,1.901755,,1.996652


In [143]:
df3[df3['Z']>0.5]

Unnamed: 0,W,X,Y,Z
E,-0.116773,1.901755,0.238127,1.996652


In [150]:
df3[df3['Y']>0][['W','X']]

Unnamed: 0,W,X
B,-0.134841,0.390528
C,0.807706,0.07296
E,-0.116773,1.901755


In [151]:
df3[df3['Y']>0][['Y','X']]

Unnamed: 0,Y,X
B,0.166905,0.390528
C,0.638787,0.07296
E,0.238127,1.901755


#### For two conditions you can use **|** → `or`,  **&** →  `and` with parenthesis:

### 

In [155]:
df3

Unnamed: 0,W,X,Y,Z
A,0.0,0.0,0.0,0.0
B,-0.134841,0.390528,0.166905,0.184502
C,0.0,0.0,0.0,0.0
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [156]:
df3[(df3['W'] > 0) & (df3['Y'] < 1)] 


Unnamed: 0,W,X,Y,Z
A,0.0,0.0,0.0,0.0
B,-0.134841,0.390528,0.166905,0.184502
C,0.0,0.0,0.0,0.0
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


#### Conditional selection using ``.loc[]`` and ``.iloc[]``

In [157]:
df3.loc[df3.X > 0,['X','Z']]

Unnamed: 0,X,Z
B,0.390528,0.184502
E,1.901755,1.996652


In [158]:
df3.loc[((df3.X > 0)|(df3.Y<1)),['X','Z']]

Unnamed: 0,X,Z
A,0.0,0.0
B,0.390528,0.184502
C,0.0,0.0
D,-0.75407,0.484752
E,1.901755,1.996652


In [159]:
df3.loc[((df3['X'] > 0)|(df3.Y<1)),['X','Z']]

Unnamed: 0,X,Z
A,0.0,0.0
B,0.390528,0.184502
C,0.0,0.0
D,-0.75407,0.484752
E,1.901755,1.996652


## 

## More Index Details

Let's discuss some more features of indexing, including resetting the index or setting it something else. We'll also talk about index hierarchy!

In [160]:
df3

Unnamed: 0,W,X,Y,Z
A,0.0,0.0,0.0,0.0
B,-0.134841,0.390528,0.166905,0.184502
C,0.0,0.0,0.0,0.0
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [164]:
df3.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,0.0,0.0,0.0,0.0
1,B,-0.134841,0.390528,0.166905,0.184502
2,C,0.0,0.0,0.0,0.0
3,D,-0.497104,-0.75407,-0.943406,0.484752
4,E,-0.116773,1.901755,0.238127,1.996652


In [165]:
df3.reset_index(drop=True)

Unnamed: 0,W,X,Y,Z
0,0.0,0.0,0.0,0.0
1,-0.134841,0.390528,0.166905,0.184502
2,0.0,0.0,0.0,0.0
3,-0.497104,-0.75407,-0.943406,0.484752
4,-0.116773,1.901755,0.238127,1.996652


In [166]:
df3

Unnamed: 0,W,X,Y,Z
A,0.0,0.0,0.0,0.0
B,-0.134841,0.390528,0.166905,0.184502
C,0.0,0.0,0.0,0.0
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [162]:
newindx = 'CA NY WY OR CO'.split()
newindx

['CA', 'NY', 'WY', 'OR', 'CO']

In [168]:
df3['newindex']=newindx
df3


Unnamed: 0,W,X,Y,Z,newindex
A,0.0,0.0,0.0,0.0,CA
B,-0.134841,0.390528,0.166905,0.184502,NY
C,0.0,0.0,0.0,0.0,WY
D,-0.497104,-0.75407,-0.943406,0.484752,OR
E,-0.116773,1.901755,0.238127,1.996652,CO


In [169]:
df3.set_index('newindex')
df3

Unnamed: 0,W,X,Y,Z,newindex
A,0.0,0.0,0.0,0.0,CA
B,-0.134841,0.390528,0.166905,0.184502,NY
C,0.0,0.0,0.0,0.0,WY
D,-0.497104,-0.75407,-0.943406,0.484752,OR
E,-0.116773,1.901755,0.238127,1.996652,CO


In [170]:
df3.set_index('newindex',inplace=True)
df3

Unnamed: 0_level_0,W,X,Y,Z
newindex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,0.0,0.0,0.0,0.0
NY,-0.134841,0.390528,0.166905,0.184502
WY,0.0,0.0,0.0,0.0
OR,-0.497104,-0.75407,-0.943406,0.484752
CO,-0.116773,1.901755,0.238127,1.996652


## Multi-Index and Index Hierarchy

Let us go over how to work with Multi-Index, first we'll create a quick example of what a Multi-Indexed DataFrame would look like:

In [171]:
outside = ['M1', 'M1', 'M1', 'M1', 'M1', 'M1', 'M2', 'M2', 'M2', 'M2', 'M2', 'M2']
inside = [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3]
third = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l']
multi_index = list(zip(outside, inside, third))
multi_index


[('M1', 1, 'a'),
 ('M1', 1, 'b'),
 ('M1', 2, 'c'),
 ('M1', 2, 'd'),
 ('M1', 3, 'e'),
 ('M1', 3, 'f'),
 ('M2', 1, 'g'),
 ('M2', 1, 'h'),
 ('M2', 2, 'i'),
 ('M2', 2, 'j'),
 ('M2', 3, 'k'),
 ('M2', 3, 'l')]

In [172]:
hier_index = pd.MultiIndex.from_tuples(multi_index)
hier_index

MultiIndex([('M1', 1, 'a'),
            ('M1', 1, 'b'),
            ('M1', 2, 'c'),
            ('M1', 2, 'd'),
            ('M1', 3, 'e'),
            ('M1', 3, 'f'),
            ('M2', 1, 'g'),
            ('M2', 1, 'h'),
            ('M2', 2, 'i'),
            ('M2', 2, 'j'),
            ('M2', 3, 'k'),
            ('M2', 3, 'l')],
           )

In [173]:
df5 = pd.DataFrame(np.random.randn(12,4),index=hier_index, columns='A B C D'.split())

In [174]:
df5

Unnamed: 0,Unnamed: 1,Unnamed: 2,A,B,C,D
M1,1,a,1.836753,0.862782,1.506899,0.04927
M1,1,b,0.816299,-1.607242,-0.807731,-1.096354
M1,2,c,-0.789084,1.286486,0.274132,-0.880482
M1,2,d,1.066049,-1.394954,0.304737,-0.595581
M1,3,e,0.107576,0.453847,-0.756019,-1.118859
M1,3,f,1.828815,-0.720654,-2.052395,-1.325425
M2,1,g,-0.518817,-1.387928,-0.46677,-1.174098
M2,1,h,-0.077359,1.366345,1.241589,-2.919887
M2,2,i,0.547547,0.294506,-1.681709,-0.643638
M2,2,j,-1.504789,-0.018412,0.81892,0.921243


In [175]:
df5.loc['M1']

Unnamed: 0,Unnamed: 1,A,B,C,D
1,a,1.836753,0.862782,1.506899,0.04927
1,b,0.816299,-1.607242,-0.807731,-1.096354
2,c,-0.789084,1.286486,0.274132,-0.880482
2,d,1.066049,-1.394954,0.304737,-0.595581
3,e,0.107576,0.453847,-0.756019,-1.118859
3,f,1.828815,-0.720654,-2.052395,-1.325425


In [176]:
df5.iloc[8] # 8. satir

A    0.547547
B    0.294506
C   -1.681709
D   -0.643638
Name: (M2, 2, i), dtype: float64

In [177]:
df5.loc['M1'].loc[2]

Unnamed: 0,A,B,C,D
c,-0.789084,1.286486,0.274132,-0.880482
d,1.066049,-1.394954,0.304737,-0.595581


In [180]:
df5.loc['M1'].loc[2].loc[['d']]

Unnamed: 0,A,B,C,D
d,1.066049,-1.394954,0.304737,-0.595581


In [181]:
df5.loc['M1'].loc[2].iloc[[1]]

Unnamed: 0,A,B,C,D
d,1.066049,-1.394954,0.304737,-0.595581


In [185]:
df5.index.names

FrozenList([None, None, None])

In [186]:
df5.index.names=['Group','Num','Class']

In [188]:
df5


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,A,B,C,D
Group,Num,Class,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M1,1,a,1.836753,0.862782,1.506899,0.04927
M1,1,b,0.816299,-1.607242,-0.807731,-1.096354
M1,2,c,-0.789084,1.286486,0.274132,-0.880482
M1,2,d,1.066049,-1.394954,0.304737,-0.595581
M1,3,e,0.107576,0.453847,-0.756019,-1.118859
M1,3,f,1.828815,-0.720654,-2.052395,-1.325425
M2,1,g,-0.518817,-1.387928,-0.46677,-1.174098
M2,1,h,-0.077359,1.366345,1.241589,-2.919887
M2,2,i,0.547547,0.294506,-1.681709,-0.643638
M2,2,j,-1.504789,-0.018412,0.81892,0.921243


### let's take a quick look at the [``.xs()``](http://localhost:8888/notebooks/pythonic/DAwPythonSessions/w3resource-pandas-dataframe-xs.ipynb)

In [191]:
df5.xs(['M1',2])

  df5.xs(['M1',2])


Unnamed: 0_level_0,A,B,C,D
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c,-0.789084,1.286486,0.274132,-0.880482
d,1.066049,-1.394954,0.304737,-0.595581


### Let's learn new functions/attributes/methods on "iris dataset" 

In [192]:
import seaborn as sns

In [194]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'taxis',
 'tips',
 'titanic']

In [195]:
df=sns.load_dataset('iris')
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [196]:
df.shape

(150, 5)

In [197]:
df.ndim

2

In [198]:
df.size

750

In [199]:
df.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [200]:
df.sample(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
95,5.7,3.0,4.2,1.2,versicolor
61,5.9,3.0,4.2,1.5,versicolor
106,4.9,2.5,4.5,1.7,virginica
23,5.1,3.3,1.7,0.5,setosa
19,5.1,3.8,1.5,0.3,setosa
22,4.6,3.6,1.0,0.2,setosa
101,5.8,2.7,5.1,1.9,virginica
11,4.8,3.4,1.6,0.2,setosa
0,5.1,3.5,1.4,0.2,setosa
125,7.2,3.2,6.0,1.8,virginica


In [201]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [204]:
df.species.value_counts()

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [206]:
df['species'].value_counts()

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [207]:
df.mean()

  df.mean()


sepal_length    5.843333
sepal_width     3.057333
petal_length    3.758000
petal_width     1.199333
dtype: float64

In [209]:
df.sum(axis=0)

sepal_length                                                876.5
sepal_width                                                 458.6
petal_length                                                563.7
petal_width                                                 179.9
species         setosasetosasetosasetosasetosasetosasetosaseto...
dtype: object

In [210]:
df.sum(axis=1)

  df.sum(axis=1)


0      10.2
1       9.5
2       9.4
3       9.4
4      10.2
       ... 
145    17.2
146    15.7
147    16.7
148    17.3
149    15.8
Length: 150, dtype: float64

In [211]:
df.species.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [212]:
df.isnull()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
145,False,False,False,False,False
146,False,False,False,False,False
147,False,False,False,False,False
148,False,False,False,False,False


In [213]:
df.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [214]:
df.info

<bound method DataFrame.info of      sepal_length  sepal_width  petal_length  petal_width    species
0             5.1          3.5           1.4          0.2     setosa
1             4.9          3.0           1.4          0.2     setosa
2             4.7          3.2           1.3          0.2     setosa
3             4.6          3.1           1.5          0.2     setosa
4             5.0          3.6           1.4          0.2     setosa
..            ...          ...           ...          ...        ...
145           6.7          3.0           5.2          2.3  virginica
146           6.3          2.5           5.0          1.9  virginica
147           6.5          3.0           5.2          2.0  virginica
148           6.2          3.4           5.4          2.3  virginica
149           5.9          3.0           5.1          1.8  virginica

[150 rows x 5 columns]>

In [215]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [216]:
len(df)

150

In [217]:
df.iloc[0:6,:]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa


In [218]:
df.loc[((df['species']=='setosa')&(df['sepal_length']>5)),['sepal_length','sepal_width']]

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
5,5.4,3.9
10,5.4,3.7
14,5.8,4.0
15,5.7,4.4
16,5.4,3.9
17,5.1,3.5
18,5.7,3.8
19,5.1,3.8
20,5.4,3.4


In [220]:
df.loc[((df['species']=='setosa')&(df['sepal_length']>5)),['sepal_length','sepal_width','species']]

Unnamed: 0,sepal_length,sepal_width,species
0,5.1,3.5,setosa
5,5.4,3.9,setosa
10,5.4,3.7,setosa
14,5.8,4.0,setosa
15,5.7,4.4,setosa
16,5.4,3.9,setosa
17,5.1,3.5,setosa
18,5.7,3.8,setosa
19,5.1,3.8,setosa
20,5.4,3.4,setosa


In [223]:
df3.index=pd.RangeIndex(start=0,stop=0+len(df3),step=1)
df3

Unnamed: 0,W,X,Y,Z
0,0.0,0.0,0.0,0.0
1,-0.134841,0.390528,0.166905,0.184502
2,0.0,0.0,0.0,0.0
3,-0.497104,-0.75407,-0.943406,0.484752
4,-0.116773,1.901755,0.238127,1.996652


In [227]:
df3.index=pd.RangeIndex(start=5,stop=9+len(df3),step=2)
df3

Unnamed: 0,W,X,Y,Z
5,0.0,0.0,0.0,0.0
7,-0.134841,0.390528,0.166905,0.184502
9,0.0,0.0,0.0,0.0
11,-0.497104,-0.75407,-0.943406,0.484752
13,-0.116773,1.901755,0.238127,1.996652


In [229]:
df3.index=[np.arange(1,6)]
df3

Unnamed: 0,W,X,Y,Z
1,0.0,0.0,0.0,0.0
2,-0.134841,0.390528,0.166905,0.184502
3,0.0,0.0,0.0,0.0
4,-0.497104,-0.75407,-0.943406,0.484752
5,-0.116773,1.901755,0.238127,1.996652


# End of the Session

In [None]:
a