# Dataframes

In [1]:
# The DataFrame is the main data structure when working with pandas.
# It can be thought of as a collection of Series that share the same index.

In [2]:
import numpy as np
import pandas as pd
from numpy.random import randn

In [3]:
# Seed NumPy's legacy global RNG so the randn() draws used to build the frame are repeatable.
np.random.seed(seed=101)

In [5]:
# Build a 5x4 frame of standard-normal samples: rows labelled A-E, columns W-Z.
df = pd.DataFrame(
    randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [6]:
# A bare name as the last expression of a cell renders the frame as a table.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [7]:
# Each column above is itself a Series, and all of the columns share the same index.

In [8]:
# Bracket notation with a single column label returns that column as a Series.
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [10]:
# A single column pulled out of the frame is a pandas Series.
type(df['W'])

pandas.core.series.Series

In [11]:
# The whole table, by contrast, is a pandas DataFrame.
type(df)

pandas.core.frame.DataFrame

In [12]:
# Attribute-style access returns the same Series as df['W'].
# It only works when the column name is a valid Python identifier that does not
# clash with an existing DataFrame attribute or method, so df['W'] is the safer form.
df.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [13]:
# Passing a list of column labels (note the double brackets) selects several columns at once.
df[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [14]:
# When asking for multiple columns, the result is a DataFrame, not a Series.

# CREATE NEW COLUMN

In [15]:
# Assigning to a label that does not exist yet adds a new column to df.
# Here NEW is the element-wise sum of W and Y, aligned on the shared index.
df['NEW'] = df['W'] + df['Y']

In [16]:
# Confirm that NEW now appears as a fifth column.
df

Unnamed: 0,W,X,Y,Z,NEW
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [19]:
# drop() is NOT in place by default: it returns a new DataFrame and leaves the
# original untouched, which protects against losing data by mistake.
# To keep the change, reassign the result. This is preferred over inplace=True,
# which returns None and therefore cannot be used in a method chain.
# columns='NEW' is equivalent to drop('NEW', axis=1); the default axis=0 drops rows.
df = df.drop(columns='NEW')

In [20]:
# Confirm that the NEW column is gone after the drop above.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [22]:
# Drop rows by index label (axis=0 is drop's default).
# The result is only displayed, not assigned, so df itself still has all five rows.
df.drop(['A','C'])

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [23]:
# (number of rows, number of columns). Still 5 rows: the row drop above was not kept.
df.shape

(5, 4)

In [24]:
# df.shape is a tuple: index 0 is the number of rows,
# and index 1 is the number of columns.

# Selecting Rows

In [26]:
# Label-based selection: returns row 'A' as a Series indexed by the column names.
df.loc['A']

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

In [27]:
# Rows are Series as well!

In [28]:
# Position-based selection: integer position 2 is the third row (label 'C' here).
df.iloc[2]

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

<b> Choosing between df.loc[] and df.iloc[] </b>
<p> Use df.loc['LABEL'] to select by the actual index label, or df.iloc[0] to select by integer position (0-based). </p>

# SELECTING SUBSET

In [31]:
# Select one row label and one column label: df.loc[row, column].
# With a single label on each axis the result is a scalar, not a Series.
df.loc['B','Y']

-0.8480769834036315

In [32]:
# Show the full frame again as a reference for the subset selections that follow.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [33]:
# Lists of row labels and column labels select a sub-DataFrame (rows A-B, columns Y-Z).
df.loc[['A','B'],['Y','Z']]

Unnamed: 0,Y,Z
A,0.907969,0.503826
B,-0.848077,0.605965


In [37]:
# The same subset by integer position: rows 0-1 (the slice end is exclusive)
# and every column from position 2 onward.
df.iloc[0:2,2:]

Unnamed: 0,Y,Z
A,0.907969,0.503826
B,-0.848077,0.605965
