In [1]:
## Library imports
import pandas as pd
import numpy as np
from pydataset import data

In [2]:
# Seed NumPy's global random number generator so that the randomly
# generated grades are the same on every run
SEED = 123
np.random.seed(SEED)

In [3]:
# Names of the students; each one becomes a row of the dataframe
students = [
    'Sally', 'Jane', 'Suzie', 'Billy',
    'Ada', 'John', 'Thomas', 'Marie',
    'Albert', 'Richard', 'Isaac', 'Alan',
]

In [4]:
# Draw one random integer score per student for each subject.
# Every column of a dataframe needs the same length, so each array is sized
# to match the list of students.
# np.random.randint is low-inclusive and high-exclusive, so scores run from
# 60 to 99 and no one can get 100.
n_students = len(students)
math_grades = np.random.randint(60, 100, size=n_students)
english_grades = np.random.randint(60, 100, size=n_students)
reading_grades = np.random.randint(60, 100, size=n_students)

In [5]:
# Build the dataframe from a dictionary: each key becomes a column name and
# each value supplies that column's data
grade_columns = {
    'name': students,
    'math': math_grades,
    'english': english_grades,
    'reading': reading_grades,
}
df = pd.DataFrame(grade_columns)

# Confirm the type of the object we just created
type(df)

pandas.core.frame.DataFrame

In [6]:
# print() shows the dataframe as plain text
print(df)

       name  math  english  reading
0     Sally    62       85       80
1      Jane    88       79       67
2     Suzie    94       74       95
3     Billy    98       96       88
4       Ada    77       92       98
5      John    79       76       93
6    Thomas    82       64       81
7     Marie    93       63       90
8    Albert    92       62       87
9   Richard    69       80       94
10    Isaac    92       99       93
11     Alan    92       62       72


In [7]:
# Ending a cell with the dataframe itself displays it without needing print()
df

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [8]:
# View info about the dataframe: index range, column names, non-null
# counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     12 non-null     object
 1   math     12 non-null     int64 
 2   english  12 non-null     int64 
 3   reading  12 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 512.0+ bytes


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numerical columns
df.describe()

Unnamed: 0,math,english,reading
count,12.0,12.0,12.0
mean,84.833333,77.666667,86.5
std,11.134168,13.371158,9.643651
min,62.0,62.0,67.0
25%,78.5,63.75,80.75
50%,90.0,77.5,89.0
75%,92.25,86.75,93.25
max,98.0,99.0,98.0


# Dataframe Attributes

- dtypes - data types (the data type of each column in the dataframe)
- shape - a tuple of (number of rows, number of columns); a series has a shape too, but with a single value
- columns - gives the column names
- index - the label for each row (autogenerated as a RangeIndex when none is supplied)

In [10]:
# View the data type of each column using dtypes
df.dtypes

name       object
math        int64
english     int64
reading     int64
dtype: object

In [11]:
# View the shape of the data as a (rows, columns) tuple
df.shape

(12, 4)

In [12]:
# View the column names
df.columns

Index(['name', 'math', 'english', 'reading'], dtype='object')

In [13]:
# View the index (the row labels)
df.index

RangeIndex(start=0, stop=12, step=1)

In [14]:
# Rename every column at once by assigning new labels to df.columns;
# here each column name is capitalized
df.columns = df.columns.str.capitalize()
df

Unnamed: 0,Name,Math,English,Reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [15]:
# View multiple columns by passing a list of column names.
# Each string needs to match the column name exactly, so because we
# capitalized the column names we need to use caps here.
df[['Name', 'Math']]

Unnamed: 0,Name,Math
0,Sally,62
1,Jane,88
2,Suzie,94
3,Billy,98
4,Ada,77
5,John,79
6,Thomas,82
7,Marie,93
8,Albert,92
9,Richard,69


In [16]:
# Select a single column with single brackets; the result is a series
math_scores = df['Math']
math_scores

0     62
1     88
2     94
3     98
4     77
5     79
6     82
7     93
8     92
9     69
10    92
11    92
Name: Math, dtype: int64

In [17]:
# Assign the rest of the score columns to their own variables
reading_scores = df['Reading']
english_scores = df['English']

In [18]:
# Create a variable that holds a list of the column names to select

columns = ['Name', 'Math']

In [19]:
# Use the new variable in place of a literal list to specify the columns

df[columns]

Unnamed: 0,Name,Math
0,Sally,62
1,Jane,88
2,Suzie,94
3,Billy,98
4,Ada,77
5,John,79
6,Thomas,82
7,Marie,93
8,Albert,92
9,Richard,69


In [20]:
# Access an individual column by name using attribute (dot) syntax.
# This returns a series; using [''] will also return a series.
df.Math

0     62
1     88
2     94
3     98
4     77
5     79
6     82
7     93
8     92
9     69
10    92
11    92
Name: Math, dtype: int64

In [21]:
# To return a single column as a dataframe instead, use double brackets
df[['Math']]

Unnamed: 0,Math
0,62
1,88
2,94
3,98
4,77
5,79
6,82
7,93
8,92
9,69


In [22]:
# Accessing subsets of the rows
# head() returns the first rows; the number of rows defaults to 5
df.head()
# Rows are organized by index number by default

Unnamed: 0,Name,Math,English,Reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98


In [23]:
# tail(3) returns the last 3 rows
df.tail(3)

Unnamed: 0,Name,Math,English,Reading
9,Richard,69,80,94
10,Isaac,92,99,93
11,Alan,92,62,72


In [24]:
# Random sample of 4 rows of our data
df.sample(4)

Unnamed: 0,Name,Math,English,Reading
7,Marie,93,63,90
11,Alan,92,62,72
1,Jane,88,79,67
4,Ada,77,92,98


# Using Boolean Values

In [25]:
# The comparison is applied to the entire column and returns a series of
# Boolean values, one per row
df.Math < 80

0      True
1     False
2     False
3     False
4      True
5      True
6     False
7     False
8     False
9      True
10    False
11    False
Name: Math, dtype: bool

In [26]:
# Indexing with the Boolean series returns the rows of the dataframe
# (with all of their columns) where the value is True
df[df.Math < 80]

Unnamed: 0,Name,Math,English,Reading
0,Sally,62,85,80
4,Ada,77,92,98
5,John,79,76,93
9,Richard,69,80,94


In [27]:
# To select the rows where the condition is False instead, either rewrite
# the condition (df.Math >= 80) or negate the Boolean series with ~
df[~(df.Math < 80)]

Unnamed: 0,Name,Math,English,Reading
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
10,Isaac,92,99,93
11,Alan,92,62,72


In [28]:
# Dropping a set of columns
# Use a list with column names
# The result is not assigned, so df itself keeps all of its columns
df.drop(columns = ['English', 'Reading'])

Unnamed: 0,Name,Math
0,Sally,62
1,Jane,88
2,Suzie,94
3,Billy,98
4,Ada,77
5,John,79
6,Thomas,82
7,Marie,93
8,Albert,92
9,Richard,69


In [29]:
# Renaming a column using an {old name: new name} dictionary
# The result is not assigned, so df itself still has the 'Name' column
df.rename(columns = {'Name': 'Student'})

Unnamed: 0,Student,Math,English,Reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [30]:
# Chaining operations: drop a column, then rename a column of the result
# Here `columns` is the keyword argument of drop() and rename(), not the
# variable named 'columns' that we created earlier
df.drop(columns = ['English']).rename(columns = {'Name': 'Student'})

Unnamed: 0,Student,Math,Reading
0,Sally,62,80
1,Jane,88,67
2,Suzie,94,95
3,Billy,98,88
4,Ada,77,98
5,John,79,93
6,Thomas,82,81
7,Marie,93,90
8,Albert,92,87
9,Richard,69,94


In [31]:
# Boolean series marking which math scores are at least 70
df.Math >= 70

0     False
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9     False
10     True
11     True
Name: Math, dtype: bool

In [32]:
# Creating a new column in the dataframe by assigning to a new column name
# This modifies df itself
df['Passing Math'] = df.Math >= 70

In [34]:
# Display the dataframe to confirm the new 'Passing Math' column was added
df

Unnamed: 0,Name,Math,English,Reading,Passing Math
0,Sally,62,85,80,False
1,Jane,88,79,67,True
2,Suzie,94,74,95,True
3,Billy,98,96,88,True
4,Ada,77,92,98,True
5,John,79,76,93,True
6,Thomas,82,64,81,True
7,Marie,93,63,90,True
8,Albert,92,62,87,True
9,Richard,69,80,94,False


In [44]:
# Use assign to create a new column; the result is not assigned, so df
# itself is left unchanged
# The column name is written as a keyword argument here, so in this form
# it cannot contain spaces
df.assign(Passing_English = df.English >= 70)

Unnamed: 0,Name,Math,English,Reading,Passing Math,Passing_English
0,Sally,62,85,80,False,True
1,Jane,88,79,67,True,True
2,Suzie,94,74,95,True,True
3,Billy,98,96,88,True,True
4,Ada,77,92,98,True,True
5,John,79,76,93,True,True
6,Thomas,82,64,81,True,False
7,Marie,93,63,90,True,False
8,Albert,92,62,87,True,False
9,Richard,69,80,94,False,True


In [45]:
# Sort the rows by English score, highest first
df.sort_values(by='English', ascending=False)

Unnamed: 0,Name,Math,English,Reading,Passing Math
10,Isaac,92,99,93,True
3,Billy,98,96,88,True
4,Ada,77,92,98,True
0,Sally,62,85,80,False
9,Richard,69,80,94,False
1,Jane,88,79,67,True
5,John,79,76,93,True
2,Suzie,94,74,95,True
6,Thomas,82,64,81,True
7,Marie,93,63,90,True


In [48]:
# Try and read what this chain of operations is doing, one step per line
(
    df[df.English >= 90]
    .sort_values(by='English')
    .head(1)
    .Name
)

4    Ada
Name: Name, dtype: object