Creating a DataFrame from a Dictionary

In [1]:
import pandas as pd

# Map each student's name to that student's three test grades.
grades_dict = {
    'Wally': [87, 96, 70],
    'Eva': [100, 87, 90],
    'Sam': [94, 77, 90],
    'Katie': [100, 81, 82],
    'Bob': [83, 65, 85],
}

# The dictionary keys become the column labels; the rows receive the default
# integer index.
grades = pd.DataFrame(grades_dict)

grades

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
0,87,100,94,100,83
1,96,87,77,81,65
2,70,90,90,82,85


In [2]:
# Customizing a DataFrame's indices with the index attribute
#
# Assigning a list to grades.index relabels the rows of the existing
# DataFrame, so the cells that follow address the rows as 'Test1', 'Test2'
# and 'Test3' instead of 0, 1 and 2.

grades.index = ['Test1', 'Test2', 'Test3']

grades

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,87,100,94,100,83
Test2,96,87,77,81,65
Test3,70,90,90,82,85


In [3]:
# Display Eva's grades in a column

grades['Eva']

Test1    100
Test2     87
Test3     90
Name: Eva, dtype: int64

In [4]:
# Display Sam's grades in a column using the Sam attribute
# (attribute-style access to the same column that grades['Sam'] would select)

grades.Sam

Test1    94
Test2    77
Test3    90
Name: Sam, dtype: int64

In [5]:
# Selecting rows via the loc and iloc Attributes

# Listing all the grades in the row 'Test1'

grades.loc['Test1']

Wally     87
Eva      100
Sam       94
Katie    100
Bob       83
Name: Test1, dtype: int64

In [6]:
# Access rows by integer zero-based indices using iloc

# Listing all the grades in the second row:

grades.iloc[1]

Wally    96
Eva      87
Sam      77
Katie    81
Bob      65
Name: Test2, dtype: int64

In [7]:
# Selecting rows via Slices and Lists with the loc and iloc Attributes

# When using slices containing labels with loc, the range specified INCLUDES the high index:

grades.loc['Test1':'Test3']

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,87,100,94,100,83
Test2,96,87,77,81,65
Test3,70,90,90,82,85


In [8]:
# When using slices containing integer indices with iloc, the range EXCLUDES the high index:

grades.iloc[0:2]

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,87,100,94,100,83
Test2,96,87,77,81,65


In [9]:
# Selecting specific rows

grades.loc[['Test1', 'Test3']]

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,87,100,94,100,83
Test3,70,90,90,82,85


In [10]:
# Selecting the same two rows ('Test1' and 'Test3') by zero-based integer position
grades.iloc[[0, 2]]

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,87,100,94,100,83
Test3,70,90,90,82,85


In [11]:
# Selecting Subsets of the Rows and Columns

# Combinations of slices and lists

grades.loc['Test1':'Test2', ['Eva', 'Katie']] #slice and list

Unnamed: 0,Eva,Katie
Test1,100,100
Test2,87,81


In [12]:
# Rows at positions 0 and 2 (a list) combined with the first three columns
# (the slice 0:3, whose upper bound is excluded)
grades.iloc[[0, 2], 0:3] #list and slice

Unnamed: 0,Wally,Eva,Sam
Test1,87,100,94
Test3,70,90,90


In [13]:
# Boolean Indexing

# Selecting all A grades (90 or over)
# The comparison is applied to every cell; grades that do not satisfy it are
# left blank (missing) in the result, and the grades that remain are shown
# as floats (e.g. 100.0).

grades[grades >= 90]

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,,100.0,94.0,100.0,
Test2,96.0,,,,
Test3,,90.0,90.0,,


In [14]:
# Selecting all B grades (80-89)
# Each comparison is parenthesized and the two Boolean results are combined
# element by element with &, so a grade is kept only when both conditions hold.

grades[(grades >= 80) & (grades <90)]

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,87.0,,,,83.0
Test2,,87.0,,81.0,
Test3,,,,82.0,85.0


In [15]:
# Accessing a Specific DataFrame Cell by Row and Column

grades.at['Test2', 'Eva'] # Eva's Test2 grade

87

In [16]:
grades.iat[2, 0] # Wally's Test3 grade

70

In [17]:
# Assigning new values:

grades.at['Test2', 'Eva'] = 100

grades.at['Test2', 'Eva']

100

In [18]:
# Restoring Eva's Test2 grade to its original value of 87, this time by
# integer position (row 1 is 'Test2', column 1 is 'Eva')

grades.iat[1, 1] = 87

grades.iat[1, 1]

87

In [19]:
# Displaying the whole DataFrame after the two assignments: Eva's Test2 grade is 87 again
grades

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test1,87,100,94,100,83
Test2,96,87,77,81,65
Test3,70,90,90,82,85


In [20]:
# Descriptive Statistics

grades.describe()

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
count,3.0,3.0,3.0,3.0,3.0
mean,84.333333,92.333333,87.0,87.666667,77.666667
std,13.203535,6.806859,8.888194,10.692677,11.015141
min,70.0,87.0,77.0,81.0,65.0
25%,78.5,88.5,83.5,81.5,74.0
50%,87.0,90.0,90.0,82.0,83.0
75%,91.5,95.0,92.0,91.0,84.0
max,96.0,100.0,94.0,100.0,85.0


In [21]:
# Limiting the displayed floating-point precision to two digits, then showing
# the same statistics again. The option is set on pandas itself, so the
# outputs of the following cells are rounded for display as well.

pd.set_option('display.precision', 2)

grades.describe()

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
count,3.0,3.0,3.0,3.0,3.0
mean,84.33,92.33,87.0,87.67,77.67
std,13.2,6.81,8.89,10.69,11.02
min,70.0,87.0,77.0,81.0,65.0
25%,78.5,88.5,83.5,81.5,74.0
50%,87.0,90.0,90.0,82.0,83.0
75%,91.5,95.0,92.0,91.0,84.0
max,96.0,100.0,94.0,100.0,85.0


In [22]:
# mean for each student

grades.mean()

Wally    84.33
Eva      92.33
Sam      87.00
Katie    87.67
Bob      77.67
dtype: float64

In [23]:
# Transpose (rows become columns and columns become rows)

grades.T

Unnamed: 0,Test1,Test2,Test3
Wally,87,96,70
Eva,100,87,90
Sam,94,77,90
Katie,100,81,82
Bob,83,65,85


In [24]:
# Summary statistics by tests instead of by student

grades.T.describe()

Unnamed: 0,Test1,Test2,Test3
count,5.0,5.0,5.0
mean,92.8,81.2,83.4
std,7.66,11.54,8.23
min,83.0,65.0,70.0
25%,87.0,77.0,82.0
50%,94.0,81.0,85.0
75%,100.0,87.0,90.0
max,100.0,96.0,90.0


In [25]:
# average of all the students' grades on each test

grades.T.mean()

Test1    92.8
Test2    81.2
Test3    83.4
dtype: float64

In [26]:
# Sorting the rows by their indices in descending order

grades.sort_index(ascending=False)

Unnamed: 0,Wally,Eva,Sam,Katie,Bob
Test3,70,90,90,82,85
Test2,96,87,77,81,65
Test1,87,100,94,100,83


In [27]:
# Sorting columns in ascending order by their column names:

grades.sort_index(axis=1)

Unnamed: 0,Bob,Eva,Katie,Sam,Wally
Test1,83,100,100,94,87
Test2,65,87,81,77,96
Test3,85,90,82,90,70


In [28]:
# Sorting Test1's grades in descending order to see the students' names in highest-to-lowest
# grade order (axis=1 reorders the columns according to the values in the 'Test1' row):

grades.sort_values(by='Test1',axis=1, ascending=False)

Unnamed: 0,Eva,Katie,Sam,Wally,Bob
Test1,100,100,94,87,83
Test2,87,81,77,96,65
Test3,90,82,90,70,85


In [29]:
# Transposed and sorted by highest grade

grades.T.sort_values(by='Test1', ascending=False)

Unnamed: 0,Test1,Test2,Test3
Eva,100,87,90
Katie,100,81,82
Sam,94,77,90
Wally,87,96,70
Bob,83,65,85


In [30]:
# Combining selection with sorting to remove unwanted data:

grades.loc['Test1'].sort_values(ascending=False)

Eva      100
Katie    100
Sam       94
Wally     87
Bob       83
Name: Test1, dtype: int64

7.14.2 Self Check

In [31]:
# 1) Given the following dictionary:
#
#     temps = {'Mon': [68, 89], 'Tue': [71, 93], 'Wed': [66, 82], 'Thu': [75, 97], 'Fri': [62, 79]}
#
# perform the following tasks:
#
#     a) Convert the dictionary into the DataFrame named temperatures with 'Low' and 'High' as the indices,
#        then display the DataFrame.
#     b) Use the column names to select only the columns for 'Mon' through 'Wed'.
#     c) Use the row index 'Low' to select only the low temperatures for each day.
#     d) Set the floating-point precision to 2, then calculate the average temperature for each day.
#     e) Calculate the average low and high temperatures.

import pandas as pd

# One [low, high] pair of readings per weekday.
temps = {
    'Mon': [68, 89],
    'Tue': [71, 93],
    'Wed': [66, 82],
    'Thu': [75, 97],
    'Fri': [62, 79],
}

# a) The first entry of every list lands in the row labelled 'Low' and the
#    second entry in the row labelled 'High'.
temperatures = pd.DataFrame(temps, index=['Low', 'High'])

temperatures

Unnamed: 0,Mon,Tue,Wed,Thu,Fri
Low,68,71,66,75,62
High,89,93,82,97,79


In [33]:
# b) Keep every row (:) and the columns 'Mon' through 'Wed'; the label slice
#    includes its upper bound, so 'Wed' is part of the result
temperatures.loc[:, 'Mon':'Wed'] #b

Unnamed: 0,Mon,Tue,Wed
Low,68,71,66
High,89,93,82


In [34]:
temperatures.loc['Low'] #c

Mon    68
Tue    71
Wed    66
Thu    75
Fri    62
Name: Low, dtype: int64

In [36]:
# d) Set the floating-point precision to 2, then average the temperatures for
#    each day. DataFrame.mean() works column by column, so it yields one
#    value per day.
pd.set_option('display.precision', 2)

daily_averages = temperatures.mean()

# e) Average low and average high temperature: axis=1 takes the mean along
#    each row instead, yielding one value for 'Low' and one for 'High'.
#    (This task was previously missing; the per-day mean above had been
#    labelled as its answer.)
low_high_averages = temperatures.mean(axis=1)

# A cell only auto-displays its last expression, so show both answers explicitly.
display(daily_averages, low_high_averages)

Mon    78.5
Tue    82.0
Wed    74.0
Thu    86.0
Fri    70.5
dtype: float64

In [None]:
# brendi kargel