# pandas DataFrame
The pandas DataFrame is our main tool for working with data. A DataFrame is simply multiple pandas Series that share the same index. You can think of a DataFrame as being similar to a spreadsheet, just a lot more powerful!

In [None]:
import numpy as np
import pandas as pd


In [None]:
columns = ['W', 'X', 'Y', 'Z']

In [None]:
index = ['A', 'B', 'C', 'D', 'E']

In [None]:
from numpy.random import randint

In [None]:
# seed NumPy's global random generator so the integers below are reproducible
np.random.seed(42)
# 5 rows x 4 columns of random integers, low bound inclusive, high bound exclusive
data = randint(low=-100, high=100, size=(5, 4))
data

array([[  2,  79,  -8, -86],
       [  6, -29,  88, -80],
       [  2,  21, -26, -13],
       [ 16,  -1,   3,  51],
       [ 30,  49, -48, -99]])

In [None]:
# build the DataFrame: rows are labelled by `index`, columns by `columns`
df = pd.DataFrame(data=data, index=index, columns=columns)
df

Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
B,6,-29,88,-80
C,2,21,-26,-13
D,16,-1,3,51
E,30,49,-48,-99


In [None]:
# returns a series of column W
df['W']

A     2
B     6
C     2
D    16
E    30
Name: W, dtype: int64

In [None]:
type(df['W'])

pandas.core.series.Series

In [None]:
# select multiple columns. returns a DataFrame
df[['W', 'Z']]

Unnamed: 0,W,Z
A,2,-86
B,6,-80
C,2,-13
D,16,51
E,30,-99


In [None]:
# we can use any order for the columns
df[['Z', 'W']]

Unnamed: 0,Z,W
A,-86,2
B,-80,6
C,-13,2
D,51,16
E,-99,30


In [None]:
# raises a KeyError if we use a column that doesn't exist
# df['new']

In [None]:
# we create a new column in the existing DataFrame from the sum of two columns
# (use `+` here: a chained `df['new'] = df['W'] = df['Y']` would overwrite W with Y)
df['new'] = df['W'] + df['Y']
df

Unnamed: 0,W,X,Y,Z,new
A,-8,79,-8,-86,-8
B,88,-29,88,-80,88
C,-26,21,-26,-13,-26
D,3,-1,3,51,3
E,-48,49,-48,-99,-48


In [None]:
# remove a column
# raises a KeyError because axis=0 by default, so pandas looks for a row labelled 'new'
# df.drop('new')

# thus we have to set axis=1
# drop() returns a new DataFrame, so we have to reassign to df to change the original
df = df.drop('new', axis=1)

# or we can name the axis explicitly instead of passing axis=1
# df = df.drop(columns='new')
df

Unnamed: 0,W,X,Y,Z
A,-8,79,-8,-86
B,88,-29,88,-80
C,-26,21,-26,-13
D,3,-1,3,51
E,-48,49,-48,-99


In [None]:
# this raises a KeyError because df[...] only looks up columns: a column could have the same label as a row
# df['A']


In [None]:
# to get a row, we use .loc with the row's label
# this also returns a Series
df.loc['A']

W      -8
X      79
Y      -8
Z     -86
new    -8
Name: A, dtype: int64

In [None]:
# getting multiple rows. returns a DataFrame
df.loc[['A', 'E']]

Unnamed: 0,W,X,Y,Z,new
A,-8,79,-8,-86,-8
E,-48,49,-48,-99,-48


In [None]:
# returns the first row
df.iloc[0]

W      -8
X      79
Y      -8
Z     -86
new    -8
Name: A, dtype: int64

In [None]:
# returns the last row
df.iloc[-1]

W       3
X      -1
Y       3
Z      51
new     3
Name: D, dtype: int64

In [None]:
# use slice notation to get multiple rows
df.iloc[:3]

Unnamed: 0,W,X,Y,Z,new
A,-8,79,-8,-86,-8
B,88,-29,88,-80,88
C,-26,21,-26,-13,-26


In [None]:
# add a row
df.loc['F'] = df.loc['A'] + df.loc['B']
df


Unnamed: 0,W,X,Y,Z,new
A,88,-29,88,-80,88
B,88,-29,88,-80,88
D,3,-1,3,51,3
C,88,-29,88,-80,88
E,88,-29,88,-80,88


In [None]:
# remove a row
# we have to reassign to change original DataFrame
df = df.drop('F')
df

Unnamed: 0,W,X,Y,Z,new
A,88,-29,88,-80,88
B,88,-29,88,-80,88
D,3,-1,3,51,3
C,88,-29,88,-80,88


In [None]:
# get an element based on row and col
df.loc['A', 'W']

88

In [None]:
# getting multiple rows and cols
df.loc[['A', 'B'], ['W','Y']]

Unnamed: 0,W,Y
A,2,-8
B,6,88


In [None]:
# comparison of values in the pandas DataFrame
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,False,False
B,True,False,True,False
C,True,True,False,False
D,True,False,True,True
E,True,True,False,False


In [None]:
# pandas keeps the shape of the DataFrame and replaces the values that are not greater than 0 with NaN
df[df > 0]


Unnamed: 0,W,X,Y,Z
A,2,79.0,,
B,6,,88.0,
C,2,21.0,,
D,16,,3.0,51.0
E,30,49.0,,


In [None]:
# compare a column only
df['X'] > 0


A     True
B    False
C     True
D    False
E     True
Name: X, dtype: bool

In [None]:
# get the dataframe results where x > 0
df[df['X'] > 0]


Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
C,2,21,-26,-13
E,30,49,-48,-99


In [None]:
# we can also pick a single column out of the filtered rows;
# .loc takes the boolean row mask and the column label in one step
df.loc[df['X'] > 0, 'W']


A     2
C     2
E    30
Name: W, dtype: int64

In [None]:
df

Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
B,6,-29,88,-80
C,2,21,-26,-13
D,16,-1,3,51
E,30,49,-48,-99


In [None]:
# select the rows where both conditions are True (AND)
# each comparison needs its own parentheses because & binds tighter than >
df[(df['W']>0) & (df['Y'] > 1)]

Unnamed: 0,W,X,Y,Z
B,6,-29,88,-80
D,16,-1,3,51


In [None]:
# select the rows where at least one of the conditions is True (OR)
df[(df['W']>0) | (df['Y'] > 1)]

Unnamed: 0,W,X,Y,Z
A,2,79,-8,-86
B,6,-29,88,-80
C,2,21,-26,-13
D,16,-1,3,51
E,30,49,-48,-99


In [None]:
# replaces the index with a numbered one and moves the old labels into a column named "index"
# (returns a new DataFrame; df itself is unchanged)
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2,79,-8,-86
1,B,6,-29,88,-80
2,C,2,21,-26,-13
3,D,16,-1,3,51
4,E,30,49,-48,-99


In [None]:
new_ind = ['CA', 'NY', 'WY', 'OR', 'CO']

In [None]:
df['States'] = new_ind
df

Unnamed: 0,W,X,Y,Z,States
A,2,79,-8,-86,CA
B,6,-29,88,-80,NY
C,2,21,-26,-13,WY
D,16,-1,3,51,OR
E,30,49,-48,-99,CO


In [None]:
# removes the old index and sets the States as the new index
# States is the name of the index. It is not a column
df = df.set_index('States')

In [None]:
df

States,W,X,Y,Z
CA,2,79,-8,-86
NY,6,-29,88,-80
WY,2,21,-26,-13
OR,16,-1,3,51
CO,30,49,-48,-99


In [None]:
# proof that States is not a column
df.columns

Index(['W', 'X', 'Y', 'Z'], dtype='object')

In [None]:
df.describe()

Unnamed: 0,W,X,Y,Z
count,5.0,5.0,5.0,5.0
mean,11.2,23.8,1.8,-45.4
std,11.96662,42.109381,51.915316,63.366395
min,2.0,-29.0,-48.0,-99.0
25%,2.0,-1.0,-26.0,-86.0
50%,6.0,21.0,-8.0,-80.0
75%,16.0,49.0,3.0,-13.0
max,30.0,79.0,88.0,51.0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, CA to CO
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   W       5 non-null      int64
 1   X       5 non-null      int64
 2   Y       5 non-null      int64
 3   Z       5 non-null      int64
dtypes: int64(4)
memory usage: 372.0+ bytes
