# DataFrames
A DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects. <br>
A DataFrame can also be seen as a collection of Series objects that share a common index. <br>
Ref: https://pandas.pydata.org/docs/user_guide/dsintro.html#dataframe

In [1]:
import numpy as np 
import pandas as pd 

In [2]:
# Import randn to generate normally distributed random data
from numpy.random import randn 
# Seed NumPy's global random generator so the values below are reproducible
np.random.seed(41)

In [3]:
# Build a 5x4 frame of random normal values: rows labelled A-E, columns W-Z.
# (The label lists could equally be written as 'A B C D E'.split() and 'W X Y Z'.split().)
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)

In [4]:
# Display the freshly created frame
df

Unnamed: 0,W,X,Y,Z
A,-0.270712,0.104848,0.250528,-0.9252
B,0.567144,-1.04018,-0.153676,0.789852
C,-1.226216,-0.948007,-0.569654,-0.97715
D,-0.770632,-0.033711,-1.032859,1.142427
E,-0.609778,1.469416,1.492679,0.707125


In [5]:
# Inspect the dtype of every column
df.dtypes

W    float64
X    float64
Y    float64
Z    float64
dtype: object

## Column-wise selection and indexing 
Getting, setting, and deleting columns works with the same syntax as the analogous dictionary operations. <br>
Ref: https://pandas.pydata.org/docs/user_guide/dsintro.html#indexing-selection

In [6]:
# Select a single column by label using dict-style brackets
df['W']

A   -0.270712
B    0.567144
C   -1.226216
D   -0.770632
E   -0.609778
Name: W, dtype: float64

In [7]:
# Attribute access selects the same column, but is not recommended:
# it cannot reach a column whose name is not a valid Python identifier
# (such as the 'sl no.' column added later in this notebook).
df.W

A   -0.270712
B    0.567144
C   -1.226216
D   -0.770632
E   -0.609778
Name: W, dtype: float64

In [8]:
# Select several columns by passing a list of labels; the result is a DataFrame
df[['W','X']]

Unnamed: 0,W,X
A,-0.270712,0.104848
B,0.567144,-1.04018
C,-1.226216,-0.948007
D,-0.770632,-0.033711
E,-0.609778,1.469416


In [9]:
# A single DataFrame column is a pandas Series
type(df['W'])

pandas.core.series.Series

## Creating a new column 

In [10]:
# Assigning to a new label adds a column; here it holds the element-wise difference W - X
df['new'] = df['W']-df['X']

In [11]:
# Display df, which now includes the 'new' column
df

Unnamed: 0,W,X,Y,Z,new
A,-0.270712,0.104848,0.250528,-0.9252,-0.37556
B,0.567144,-1.04018,-0.153676,0.789852,1.607324
C,-1.226216,-0.948007,-0.569654,-0.97715,-0.278209
D,-0.770632,-0.033711,-1.032859,1.142427,-0.73692
E,-0.609778,1.469416,1.492679,0.707125,-2.079194


## Removing a column 

In [12]:
# axis=1 tells drop() that 'new' is a column label; the result is returned as a new frame
df.drop('new', axis=1)

Unnamed: 0,W,X,Y,Z
A,-0.270712,0.104848,0.250528,-0.9252
B,0.567144,-1.04018,-0.153676,0.789852
C,-1.226216,-0.948007,-0.569654,-0.97715
D,-0.770632,-0.033711,-1.032859,1.142427
E,-0.609778,1.469416,1.492679,0.707125


In [13]:
# The drop above was not executed in place: df still contains the 'new' column
df

Unnamed: 0,W,X,Y,Z,new
A,-0.270712,0.104848,0.250528,-0.9252,-0.37556
B,0.567144,-1.04018,-0.153676,0.789852,1.607324
C,-1.226216,-0.948007,-0.569654,-0.97715,-0.278209
D,-0.770632,-0.033711,-1.032859,1.142427,-0.73692
E,-0.609778,1.469416,1.492679,0.707125,-2.079194


In [14]:
# Pass inplace=True to make drop() modify df itself instead of returning a new frame
df.drop('new',axis=1,inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,-0.270712,0.104848,0.250528,-0.9252
B,0.567144,-1.04018,-0.153676,0.789852
C,-1.226216,-0.948007,-0.569654,-0.97715
D,-0.770632,-0.033711,-1.032859,1.142427
E,-0.609778,1.469416,1.492679,0.707125


In [16]:
# Drop row 'A': axis=0 means the label refers to the row index.
# NOTE: inplace=True removes the row from df for every cell that follows.
df.drop('A',axis=0,inplace=True)
df

Unnamed: 0,W,X,Y,Z
B,0.567144,-1.04018,-0.153676,0.789852
C,-1.226216,-0.948007,-0.569654,-0.97715
D,-0.770632,-0.033711,-1.032859,1.142427
E,-0.609778,1.469416,1.492679,0.707125


In [None]:
# Another way of deleting a column: `del` works like deleting a key from a dict.
# The 'new' column was already dropped earlier in the notebook, so re-create it
# first; a bare `del df['new']` would raise KeyError on a fresh run.
df['new'] = df['W'] - df['X']
del df['new']

## Selecting rows

In [17]:
# Select a row by its index label with .loc; the row comes back as a Series
df.loc['B']

W    0.567144
X   -1.040180
Y   -0.153676
Z    0.789852
Name: B, dtype: float64

In [18]:
# Select a row by integer position with .iloc
# (position 0 is row 'B' now that row 'A' has been dropped)
df.iloc[0]

W    0.567144
X   -1.040180
Y   -0.153676
Z    0.789852
Name: B, dtype: float64

In [19]:
# Select a single element by row label and column label
df.loc['B','W']

0.567143660285906

In [20]:
# Select a single element by row position and column position
df.iloc[0,0]

0.567143660285906

In [21]:
# Display the current df for reference
df

Unnamed: 0,W,X,Y,Z
B,0.567144,-1.04018,-0.153676,0.789852
C,-1.226216,-0.948007,-0.569654,-0.97715
D,-0.770632,-0.033711,-1.032859,1.142427
E,-0.609778,1.469416,1.492679,0.707125


In [22]:
# Select a sub-frame by passing a list of row labels and a list of column labels
df.loc[['B','C'],['W','X']]

Unnamed: 0,W,X
B,0.567144,-1.04018
C,-1.226216,-0.948007


In [23]:
# Slice rows by position: 0:2 takes the first two rows (the end position is excluded)
df.iloc[0:2]

Unnamed: 0,W,X,Y,Z
B,0.567144,-1.04018,-0.153676,0.789852
C,-1.226216,-0.948007,-0.569654,-0.97715


In [24]:
# A plain slice on the frame, as with a NumPy array, also selects rows
df[:2]

Unnamed: 0,W,X,Y,Z
B,0.567144,-1.04018,-0.153676,0.789852
C,-1.226216,-0.948007,-0.569654,-0.97715


In [26]:
# Slice with a step of 2: every second row, starting from the first
df[::2]

Unnamed: 0,W,X,Y,Z
B,0.567144,-1.04018,-0.153676,0.789852
D,-0.770632,-0.033711,-1.032859,1.142427


### Conditional selection

In [137]:
# Display df before applying the conditions below
df

Unnamed: 0,W,X,Y,Z
B,0.567144,-1.04018,-0.153676,0.789852
C,-1.226216,-0.948007,-0.569654,-0.97715
D,-0.770632,-0.033711,-1.032859,1.142427
E,-0.609778,1.469416,1.492679,0.707125


In [138]:
# Element-wise comparison: a boolean frame that is True where the value is > 0
df>0

Unnamed: 0,W,X,Y,Z
B,True,False,False,True
C,False,False,False,False
D,False,False,False,True
E,False,True,True,True


In [139]:
# Mask df with the boolean frame: positive values are kept, all other cells become NaN
df[df>0]

Unnamed: 0,W,X,Y,Z
B,0.567144,,,0.789852
C,,,,
D,,,,1.142427
E,,1.469416,1.492679,0.707125


In [140]:
# Keep only the rows where column W is > 0
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
B,0.567144,-1.04018,-0.153676,0.789852


In [141]:
# Y values for the rows where column W is > 0.
# A single .loc[row_mask, column] lookup is used instead of the chained
# df[mask]['Y'], which indexes twice and is unreliable as soon as it is
# used for assignment.
df.loc[df['W']>0, 'Y']

B   -0.153676
Name: Y, dtype: float64

In [142]:
# Y and Z values for the rows where column W is > 0.
# A single .loc[row_mask, columns] lookup is used instead of the chained
# df[mask][['Y','Z']], which indexes twice and is unreliable as soon as it
# is used for assignment.
df.loc[df['W']>0, ['Y','Z']]

Unnamed: 0,Y,Z
B,-0.153676,0.789852


In [143]:
# Display df again before combining two conditions
df

Unnamed: 0,W,X,Y,Z
B,0.567144,-1.04018,-0.153676,0.789852
C,-1.226216,-0.948007,-0.569654,-0.97715
D,-0.770632,-0.033711,-1.032859,1.142427
E,-0.609778,1.469416,1.492679,0.707125


In [144]:
# Keep the rows where both W > 0 and Z > 0.
# The masks are combined element-wise with &; each comparison needs its own
# parentheses because & binds more tightly than >.
df[(df['W']>0) & (df['Z']>0)]

Unnamed: 0,W,X,Y,Z
B,0.567144,-1.04018,-0.153676,0.789852


### Resetting and setting the index

In [145]:
# Reset the index to 0, 1, 2, 3, ...; the old labels move into an 'index' column.
# The result is returned as a new frame; df itself is not changed.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,B,0.567144,-1.04018,-0.153676,0.789852
1,C,-1.226216,-0.948007,-0.569654,-0.97715
2,D,-0.770632,-0.033711,-1.032859,1.142427
3,E,-0.609778,1.469416,1.492679,0.707125


In [146]:
# Labels that will become the new index, one per remaining row of df
new_index = ['M', 'N', 'O', 'P']

In [147]:
# Check the list of new labels
new_index

['M', 'N', 'O', 'P']

In [148]:
# Store the labels as a regular column first; the list needs one entry per row of df
df['sl no.'] = new_index

In [149]:
# Display df, which now includes the 'sl no.' column
df

Unnamed: 0,W,X,Y,Z,sl no.
B,0.567144,-1.04018,-0.153676,0.789852,M
C,-1.226216,-0.948007,-0.569654,-0.97715,N
D,-0.770632,-0.033711,-1.032859,1.142427,O
E,-0.609778,1.469416,1.492679,0.707125,P


In [150]:
# Use the 'sl no.' column as the index; the result is returned as a new frame
df.set_index('sl no.')

Unnamed: 0_level_0,W,X,Y,Z
sl no.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M,0.567144,-1.04018,-0.153676,0.789852
N,-1.226216,-0.948007,-0.569654,-0.97715
O,-0.770632,-0.033711,-1.032859,1.142427
P,-0.609778,1.469416,1.492679,0.707125


In [151]:
# df is unchanged: the set_index() call above did not modify it
df

Unnamed: 0,W,X,Y,Z,sl no.
B,0.567144,-1.04018,-0.153676,0.789852,M
C,-1.226216,-0.948007,-0.569654,-0.97715,N
D,-0.770632,-0.033711,-1.032859,1.142427,O
E,-0.609778,1.469416,1.492679,0.707125,P


In [152]:
# inplace=True applies the new index to df itself; the previous B-E labels are replaced
df.set_index('sl no.',inplace=True)

In [153]:
# Display df, now indexed by 'sl no.'
df

Unnamed: 0_level_0,W,X,Y,Z
sl no.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M,0.567144,-1.04018,-0.153676,0.789852
N,-1.226216,-0.948007,-0.569654,-0.97715
O,-0.770632,-0.033711,-1.032859,1.142427
P,-0.609778,1.469416,1.492679,0.707125
