<h1> Introduction to Pandas -  DataFrames</h1>

<h3> FYI - The DataFrame is the main tool for working with pandas</h3>

In [6]:
import numpy as np
import pandas as pd
from numpy.random import randn
np.random.seed(101)                                       # Seed NumPy's global random generator so randn() produces the same numbers as the teacher's notebook

<h2> DataFrames - Part 1 </h2>

<h3> Creating a new DataFrame</h3>

In [7]:
# Build a 5x4 DataFrame of standard-normal values.
# A DataFrame takes the data, the index (row labels) and the columns (column labels).
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [8]:
df    # A DataFrame lists columns (w,x,y,z) and corresponding rows(a,b,c,d,e) with randn data

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


<h3> Referencing values within a DataFrame</h3>

In [9]:
df['W']             # Grabbing values by referencing column headings

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [10]:
df['Z']

A    0.503826
B    0.605965
C   -0.589001
D    0.955057
E    0.683509
Name: Z, dtype: float64

In [11]:
df['X']

A    0.628133
B   -0.319318
C    0.740122
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [12]:
type(df['X'])               # Each column is really a pandas Series (data, index),
                            # as confirmed by type(): *pandas.core.series.Series*

pandas.core.series.Series

In [13]:
type(df)                    # A DataFrame is a bunch of Series which share the same indexes
                            # *pandas.core.frame.DataFrame*

pandas.core.frame.DataFrame

In [14]:
df[['W','Z','X']]               # To make a DataFrame of more than one column, pass in a list of columns
                            # Note: Double brackets when accessing more than one column [[]] in a DataFrame

Unnamed: 0,W,Z,X
A,2.70685,0.503826,0.628133
B,0.651118,0.605965,-0.319318
C,-2.018168,-0.589001,0.740122
D,0.188695,0.955057,-0.758872
E,0.190794,0.683509,1.978757


<h3> Adding and/or removing a COLUMN into/from an existing DataFrame</h3>

In [15]:
df['new'] = df['W'] + df['Y']

In [16]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [17]:
df.drop('new', axis=1,inplace=True)            # When dropping a column, include:
                                               # the label of the column that will be dropped,
                                               # axis = 1 (look the label up among the columns instead of the rows),
                                               # inplace = True (modify df itself instead of returning a new DataFrame)
                                               # Note: re-running this cell raises a KeyError, because 'new' is already gone

In [18]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [29]:
df.drop('E', axis = 0)                        # When dropping a row, pass the row label; axis = 0 (rows/index) is the default, so it is optional
                                              # Without inplace = True the result is only displayed: df itself still contains row E

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


<h3> With regards to axis, why are rows 0 and columns 1?</h3>

In [20]:
df.shape

(5, 4)

In [21]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


<h3> <code>df.shape</code> is the tuple (5, 4). Position 0 of that tuple is the number of rows (<u>5</u>, A - E), which is why rows are axis 0</h3>
<h3> Position 1 of that tuple is the number of columns (<u>4</u>, W - Z), which is why columns are axis 1</h3>

<h3> Selecting values from a ROW </h3>


In [22]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [23]:
df.loc['C']                        # To select a row in DataFrame, use .loc[] and pass in the row name

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [24]:
df.iloc[2]                        # Or, use the index position of the row and pass in that value using .iloc[]
                                  # For example, passing in the index position of 2 will produce the values of row C

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

<h2> Selecting subsets of rows and columns</h2>

In [25]:
df.loc['B','Y']                # To get a specific row and column, use .loc[row,column]

-0.84807698340363147

In [26]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [27]:
df.loc[['A','B'],['W','Y']]    # To get a subset of a DataFrame, pass in a list of rows and columns using .loc[[row1,row2],[column1,column2]]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


<h2> DataFrames - Part 2</h2>

In [31]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


<h3> Conditional Selection Using Bracket Notation</h3>

In [38]:
booldf = df > 0                 # Compare every value in df with 0 and store the result in booldf:
                                # a DataFrame of booleans showing which cells meet the condition

In [42]:
booldf

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [50]:
df[booldf]                      # The DataFrame returns values or NaN,
                                # based on the conditional df > 0

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [51]:
df[df>0]                        # Instead of assigning a value to the conditional statement,
                                # can use a df conditional statement inside of a df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [45]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [52]:
df['W'] > 0                     # Check to see if a column inside the df is greater 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [53]:
df['W']                         # Row C has a negative value in column W, which is why it returned False in the previous example

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [60]:
df[df['W']>0]                   # Return only the rows where the condition on the column is True (whole rows are dropped, nothing is replaced by NaN).
                                # For example, Row C was omitted b/c it has a negative value under column W
                                # Note the syntax: df[df['some column'] > 0]  The df is indexed with a boolean Series built from one of its own columns

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [61]:
df[df['Z'] <0]                   # Grab all rows in a df whose value under column Z is < 0
                                 # Note the syntax: df[df['some column'] < 0]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


<h4> Assign output to a variable to use for data slicing</h4>

In [62]:
resultdf = df[df['W'] > 0]

In [63]:
resultdf

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [95]:
resultdf[['Y', 'X']]         # Grab all values from columns Y and X in the resultdf variable

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


In [71]:
df.loc[df['W'] > 0, ['Y', 'X']]    # Combine the previous steps into one lookup:
                                   # the boolean Series df['W'] > 0 picks the rows (conditional statement),
                                   # and the list ['Y', 'X'] picks the columns, in that order.
                                   # A single .loc[rows, columns] call avoids chained indexing (df[mask][[cols]]),
                                   # which builds an intermediate DataFrame before selecting the columns.
                                   # Note the syntax: df.loc[df['some column'] > 0, ['column1', 'column2']]
                                

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


<h3> Using Multiple Conditions</h3>

<h4> Using AND &</h4>

In [78]:
df[(df['W']> 0) & (df['Y']>1)]      # To combine conditional statements using *AND*:
                                    # Create the first df conditional statement using ()
                                    # then use the '&' (shift + 7)
                                    # then create the next df conditional statement using ()
                                    # For example: keep only the rows where df['W'] > 0 and df['Y'] > 1 are both True
                                    # Note syntax: df[(df['some column1'] > 0) & (df['some column2'] > 1)]
                                    # The parentheses are required because & binds more tightly than >

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


<h4> Using OR  |</h4>

In [79]:
df[(df['W']> 0) | (df['Y']>1)]      # To combine conditional statements using *OR*:
                                    # Create the first df conditional statement using ()
                                    # then use the '|' (pipe)
                                    # then create the next df conditional statement using ()
                                    # For example: keep the rows where df['W'] > 0 or df['Y'] > 1 (at least one is True)
                                    # Note syntax: df[(df['some column1'] > 0) | (df['some column2'] > 1)]
                                    # The parentheses are required because | binds more tightly than >

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


<h3> Resetting the Index</h3>

In [80]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [93]:
df.reset_index() # Reset the index back to the default integer index
                 # Notice the first column now has 0 - 4 (the new index column),
                 # and the previous index (A - E) has become a regular column named 'index'
                 # Note: To reset the index permanently in the df, use df.reset_index(inplace=True)
                 # NOTE(review): the saved output below also shows a 'States' column, so this cell appears
                 # to have been last run after 'States' was added further down - re-run the cells in order

Unnamed: 0,index,W,X,Y,Z,States
0,A,2.70685,0.628133,0.907969,0.503826,CA
1,B,0.651118,-0.319318,-0.848077,0.605965,NY
2,C,-2.018168,0.740122,0.528813,-0.589001,WY
3,D,0.188695,-0.758872,-0.933237,0.955057,OR
4,E,0.190794,1.978757,2.605967,0.683509,CO


In [84]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


<h3>Setting the index</h3>

In [85]:
newind = 'CA NY WY OR CO'.split()       # Split a string of state abbreviations on whitespace
                                        # and assign the resulting list to newind

In [86]:
newind

['CA', 'NY', 'WY', 'OR', 'CO']

In [89]:
df['States'] = newind                  # Add a new df column called 'States',
                                       # filled with the values of newind (one per row, in row order)

In [90]:
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [91]:
df.set_index('States')      # Return a DataFrame that uses the 'States' column as its index
                            # (the old A - E index is replaced, not kept as a column)
                            # To set it permanently in the df, df.set_index('States', inplace=True)

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509


In [92]:
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


<h2> DataFrames - Part 3</h2>

<h3> Multi-Index DataFrame</h3>

In [99]:
# Index levels: 'outside' holds the group label of each row,
# 'inside' holds the row number within its group
outside = ['G1', 'G1', 'G1', 'G2', 'G2', 'G2']
inside = [1, 2, 3, 1, 2, 3]

# Pair the two lists element by element into (outside, inside) tuples
# and turn those tuples into a two-level MultiIndex
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [100]:
outside

['G1', 'G1', 'G1', 'G2', 'G2', 'G2']

In [101]:
inside

[1, 2, 3, 1, 2, 3]

In [104]:
list(zip(outside,inside))

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [105]:
hier_index

MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

<h4>Create a DataFrame</h4>

In [110]:
# Create a DataFrame and assign it to df (this replaces the df used in Parts 1 and 2):
#   data    = randn(6, 2)  (6 rows, 2 columns)
#   index   = hier_index   (the MultiIndex defined in the previous steps)
#   columns = ['A', 'B']
df = pd.DataFrame(
    data=randn(6, 2),
    index=hier_index,
    columns=['A', 'B'],
)

In [111]:
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,1.025984,-0.156598
G1,2,-0.031579,0.649826
G1,3,2.154846,-0.610259
G2,1,-0.755325,-0.346419
G2,2,0.147027,-0.479448
G2,3,0.558769,1.02481


<h4>Calling data from a Multi-level index</h4>

In [114]:
df.loc['G1']    # Call the outside index first
                # Example: G1

Unnamed: 0,A,B
1,1.025984,-0.156598
2,-0.031579,0.649826
3,2.154846,-0.610259


In [115]:
df.loc['G1'].loc[1]     # Dig deeper into the G1 level,
                        # get the values with row 1

A    1.025984
B   -0.156598
Name: 1, dtype: float64

In [116]:
df.loc['G1'].loc[2]     # Dig deeper into the G1 level,
                        # get the values with row 2

A   -0.031579
B    0.649826
Name: 2, dtype: float64

In [117]:
df.loc['G1'].loc[3]     # Dig deeper into the G1 level,
                        # get the values with row 3

A    2.154846
B   -0.610259
Name: 3, dtype: float64

<h4>Naming Indexes</h4>

In [119]:
df.index.names         # Pandas showing that the indexes don't have names [None,None]

FrozenList([None, None])

In [120]:
df.index.names = ['Groups','Num']   # To name indexes: 
                                    # Usage: df.index.names = ['outside/first index name','inside/second index name'] 

In [122]:
df.index.names        # Pandas showing the newly created index names

FrozenList(['Groups', 'Num'])

In [123]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,1.025984,-0.156598
G1,2,-0.031579,0.649826
G1,3,2.154846,-0.610259
G2,1,-0.755325,-0.346419
G2,2,0.147027,-0.479448
G2,3,0.558769,1.02481


<h4>Grabbing Information from the newly named multi-index DF Examples</h4>

In [128]:
df.loc['G2'].loc[2]['B']     # Grab value -0.479448: group G2 -> row 2 -> column B
                             # (the same lookup in a single call: df.loc[('G2', 2), 'B'])

-0.47944803904109595

In [131]:
df.loc['G2'].loc[3]['A']    # Grab value 0.558769

0.55876940644306705

In [132]:
df.loc['G1'].loc[3]['B']    # Grab value -0.610259

-0.61025885582274142

In [133]:
df.loc['G1'].loc[1]['A']    # Grab value 1.025984

1.025984152081572

<h4>Cross Section</h4>
<h4>Grabs all rows whose label at a specified index level matches a given key</h4>

In [134]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,1.025984,-0.156598
G1,2,-0.031579,0.649826
G1,3,2.154846,-0.610259
G2,1,-0.755325,-0.346419
G2,2,0.147027,-0.479448
G2,3,0.558769,1.02481


In [136]:
df.xs(1,level='Num')        # Cross Section
                            # Grab all data from rows whose 'Num' level equals 1,
                            # across both groups of the outside 'Groups' level (G1 and G2)
                            # Usage: df.xs(key, level = 'Name of the index level to match the key against')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,1.025984,-0.156598
G2,-0.755325,-0.346419


In [137]:
df.xs(3,level = 'Num')      # Grab all data from rows whose 'Num' level equals 3,
                            # across both groups of the outside 'Groups' level (G1 and G2)

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,2.154846,-0.610259
G2,0.558769,1.02481


In [143]:
df.xs(2, level='Num')       # Grab all data from rows whose 'Num' level equals 2,
                            # across both groups of the outside 'Groups' level (G1 and G2)

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-0.031579,0.649826
G2,0.147027,-0.479448
