## Load Libraries and Create Data Frame

In [None]:
#import Pandas and Numpy
import pandas as pd
import numpy as np

#enable viewing of graphical output in notebook
%matplotlib inline

In [19]:
#Build a toy data frame of test scores.
#Each dictionary key is a student's name (it becomes a column label)
#and each value is that student's list of five test scores.
scores = {
    "Pam": [90, 85, 100, 75, 100],
    "Bob": [60, 70, 75, 100, 80],
    "Sally": [80, 85, 80, 90, 95],
    #None marks Mike's missing third score
    "Mike": [100, 95, None, 100, 90],
}

#label the rows 1 through 5 (test number) instead of the default 0 through 4
df = pd.DataFrame(scores, index=range(1, 6))

#display the toy data frame
df

Unnamed: 0,Pam,Bob,Sally,Mike
1,90,60,80,100.0
2,85,70,85,95.0
3,100,75,80,
4,75,100,90,100.0
5,100,80,95,90.0


## Investigate Data Frame

In [20]:
#use the shape property of the data frame (an attribute, not a method call)
#to view a tuple of the number of rows and the number of columns
df.shape

(5, 4)

In [21]:
#use the method info() to print a summary of df:
#its class, the row index, each column's name,
#non-null count and dtype, and the memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 1 to 5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pam     5 non-null      int64  
 1   Bob     5 non-null      int64  
 2   Sally   5 non-null      int64  
 3   Mike    4 non-null      float64
dtypes: float64(1), int64(3)
memory usage: 292.0 bytes


In [22]:
#use the method describe() to look at summary statistics
#for each column: count, mean, standard deviation,
#minimum, quartiles, and maximum
df.describe()

Unnamed: 0,Pam,Bob,Sally,Mike
count,5.0,5.0,5.0,4.0
mean,90.0,77.0,86.0,96.25
std,10.606602,14.832397,6.519202,4.787136
min,75.0,60.0,80.0,90.0
25%,85.0,70.0,80.0,93.75
50%,90.0,75.0,85.0,97.5
75%,100.0,80.0,90.0,100.0
max,100.0,100.0,95.0,100.0


In [23]:
#find the mean of each student's scores (one value per column)
#missing values are skipped by default, so Mike's mean
#is taken over his four recorded scores
df.mean()

Pam      90.00
Bob      77.00
Sally    86.00
Mike     96.25
dtype: float64

In [24]:
#find the maximum of each student's scores (one value per column)
df.max()

Pam      100.0
Bob      100.0
Sally     95.0
Mike     100.0
dtype: float64

In [25]:
#determine Sally's minimum score:
#select her column by label, then take the minimum of that series
df["Sally"].min()

80

## Use Chained Indexing To Drill Into Data Frame

In [26]:
#Find Sally's first test score.
#df["Sally"] selects Sally's column by its label,
#which yields a series.
#The second [] then looks up a row of that series
#by its index label.
#Even though Python has 0-based indexing,
#the row index was set to start at 1
#when the data frame was created.
#So Sally's first score has the index label 1.
df["Sally"][1]

80

In [27]:
#Find Pam's second, third, and fourth scores.
#Indexing the series with a list of index labels
#returns all of those rows at once.
df["Pam"][[2,3,4]]

2     85
3    100
4     75
Name: Pam, dtype: int64

In [None]:
#Select Mike and Sally's third, fourth, and fifth scores.
#This time a slice selects all the rows after the 2nd row.
#Note that an integer slice inside [] works by position (0-based),
#not by index label: df[2:] skips the first two rows and keeps the rest.
#The resulting data frame is then indexed with a list of column names.
df[2:][["Mike", "Sally"]]

## Indexing into Data Frame Using loc and iloc

In [34]:
#loc indexes by label rather than by integer location.
#The row labels here happen to be integers, so Sally's first score
#is found at row label 1 and column label "Sally".
df.loc[1,"Sally"]

80

In [35]:
#Now select Sally's first score using iloc,
#which indexes purely by 0-based integer position.
#Row position 0 is the first test and column position 2 is Sally
#(the columns are ordered Pam, Bob, Sally, Mike),
#so Sally's first score is located at (0,2).
df.iloc[0,2]

80

In [37]:
#Now we will create a new data frame from the first
#and re-index the rows with string labels instead of integers.
#copy() is required: a plain `new_df = df` only binds a second name
#to the same object, so assigning new_df.index would also relabel df
#and earlier cells such as df.loc[1,"Sally"] would fail if re-run.
new_df = df.copy()
new_df.index = ["First", "Second", "Third", "Fourth", "Fifth"]

#view new_df
new_df

Unnamed: 0,Pam,Bob,Sally,Mike
First,90,60,80,100.0
Second,85,70,85,95.0
Third,100,75,80,
Fourth,75,100,90,100.0
Fifth,100,80,95,90.0


In [40]:
#Now find Mike's second score using loc and iloc.
#First use loc (label-based): row label "Second", column label "Mike".
new_df.loc["Second", "Mike"]

95.0

In [41]:
#Now use iloc (integer-based indexing):
#row position 1 is the second test, column position 3 is Mike.
new_df.iloc[1,3]

95.0

In [42]:
#Use loc with a list of row labels to find
#all of Mike's scores except the third
new_df.loc[["First","Second","Fourth","Fifth"], "Mike"]

First     100.0
Second     95.0
Fourth    100.0
Fifth      90.0
Name: Mike, dtype: float64

## Filtering with Booleans

In [50]:
#Determine which students scored above an 80 on the first test.
#Approach 1: transpose so that each student becomes a row
#and each test becomes a column, then filter the rows with a boolean mask.
new_df_t = new_df.transpose()

#one True/False per student: did they score above 80 on the first test?
first_above_80 = new_df_t["First"] > 80

#keep only the students (rows) where the mask is True
new_df_t[first_above_80]

Unnamed: 0,First,Second,Third,Fourth,Fifth
Pam,90.0,85.0,100.0,75.0,100.0
Mike,100.0,95.0,,100.0,90.0


In [51]:
#Now use the original (untransposed) data frame and loc.
#new_df.loc["First"] is the first test's row as a series keyed by student name,
#so comparing it with 80 gives one boolean per column.
#loc[:, mask] keeps every row and only the columns where the mask is True.
new_df.loc[:,new_df.loc["First"] > 80]

Unnamed: 0,Pam,Mike
First,90,100.0
Second,85,95.0
Third,100,
Fourth,75,100.0
Fifth,100,90.0


In [52]:
#Which students scored above an 80 on the fourth and fifth test?
#Build one boolean mask per test (one True/False per student) ...
fourth_above_80 = new_df.loc["Fourth"] > 80
fifth_above_80 = new_df.loc["Fifth"] > 80

#... then keep every row but only the columns where both masks are True
new_df.loc[:, fourth_above_80 & fifth_above_80]

Unnamed: 0,Sally,Mike
First,80,100.0
Second,85,95.0
Third,80,
Fourth,90,100.0
Fifth,95,90.0
