<h1> Introduction to Pandas -  DataFrames</h1>

<h3> The DataFrame is the main tool for working with pandas</h3>

In [3]:
import numpy as np
import pandas as pd
from numpy.random import randn
np.random.seed(101)                                       # Set a seed so the random numbers are reproducible and match the teacher's output

<h3> Creating a new DataFrame</h3>

In [4]:
# A DataFrame is built from three pieces: the data, the row labels (index), and the column labels
df = pd.DataFrame(
    data=randn(5, 4),                      # 5 rows x 4 columns of random values
    index=['A', 'B', 'C', 'D', 'E'],       # row labels
    columns=['W', 'X', 'Y', 'Z'],          # column labels
)

In [9]:
df    # Display the DataFrame: column labels (W, X, Y, Z) across the top, row labels (A, B, C, D, E) down the side, filled with the randn data

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


<h3> Referencing values within a DataFrame</h3>

In [10]:
df['W']             # Grab a single column by passing its column label in square brackets

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [11]:
df['Z']             # The same bracket lookup works for any column label

A    0.503826
B    0.605965
C   -0.589001
D    0.955057
E    0.683509
Name: Z, dtype: float64

In [12]:
df['X']             # Each lookup keeps the row labels (A-E) as the index of the result

A    0.628133
B   -0.319318
C    0.740122
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [17]:
type(df['X'])               # Each column is really a pandas Series (data, index)
                            # The built-in type() function reports it as *pandas.core.series.Series*

pandas.core.series.Series

In [16]:
type(df)                    # A DataFrame is a collection of Series that share the same index
                            # type() reports it as *pandas.core.frame.DataFrame*

pandas.core.frame.DataFrame

In [20]:
df[['W','Z']]               # To select more than one column, pass in a list of column labels; the result is itself a DataFrame
                            # Note: the outer brackets index the DataFrame and the inner brackets build the list, hence [[ ]]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


<h3> Adding and/or removing a COLUMN into/from an existing DataFrame</h3>

In [21]:
df['new'] = df['W'] + df['Y']     # Assigning to a column label that does not exist yet adds that column; here it is the row-by-row sum of W and Y

In [22]:
df     # Confirm that the 'new' column now appears alongside W, X, Y, Z

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [26]:
df = df.drop('new', axis=1)                    # When dropping a column, include:
                                               # the column label that will be dropped,
                                               # axis = 1 (which refers to the column headings).
                                               # drop() returns a NEW DataFrame and leaves the original untouched,
                                               # so assign the result back to df to keep the change.
                                               # (inplace=True would also modify df, but reassignment is the preferred pandas style.)

In [27]:
df     # Confirm that the 'new' column has been removed

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [28]:
df.drop('E', axis = 0)                        # When dropping a row, pass the row label; axis = 0 (rows/index) may be included. The result is only displayed, not stored, so df itself still contains row E

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


<h3> With regard to axis, why are rows 0 and columns 1?</h3>

In [29]:
df.shape     # Returns a tuple: (number of rows, number of columns)

(5, 4)

In [32]:
df     # All five rows are still here: the earlier df.drop('E', axis = 0) was displayed but never stored

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


<h3> df.shape is (5, 4). The <u>5</u> rows (A - E) are counted at position 0 of that tuple, so rows are axis 0</h3>
<h3> The <u>4</u> columns (W - Z) are counted at position 1 of that tuple, so columns are axis 1</h3>

<h3> Selecting values from a ROW </h3>


In [33]:
df     # Redisplay the DataFrame for reference before selecting rows

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [37]:
df.loc['C']                        # To select a row, use .loc[] and pass in the row label; the row comes back as a Series indexed by the column labels

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [38]:
df.iloc[2]                        # Or, select by integer position with .iloc[] (positions are counted from 0)
                                  # Position 2 is the third row, so this produces the same values as row C

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

<h3> Selecting subsets of rows and columns</h3>

In [41]:
df.loc['B','Y']                # To get the single value at a specific row and column, use .loc[row_label, column_label]

-0.84807698340363147

In [40]:
df     # Redisplay the DataFrame for reference; row B, column Y holds the value returned by df.loc['B','Y']

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [43]:
df.loc[['A','B'],['W','Y']]    # To get a subset of a DataFrame, pass a list of row labels and a list of column labels: .loc[[row1,row2],[column1,column2]]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077
