# Let's create a Pandas dataframe and practice with it.

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Fix the global NumPy seed so the randomly generated grades are the same on every run.
np.random.seed(123)

# Roster of students; each name becomes one row of the gradebook.
students = [
    'Christopher',
    'Ariel',
    'Samantha',
    'Robert',
    'Gabriel',
    'Thomas',
    'Louie Lob',
    'Tiana',
    'Karina',
    'Danielle',
    'Ebbany',
    'Nayzeth',
    'Dreython',
    'Andrea Champion',
    'Andrea Castillo',
    'Dora',
    'Ruben',
    'Lilliana',
    'Eric',
    'Jacob',
    'Jose',
]

In [5]:
# Draw one random integer score per student for each of the five subjects.
# np.random.randint excludes `high`, so every score falls between 50 and 99.
score_params = dict(low=50, high=100, size=len(students))

# The draw order matters: with a fixed seed, each call consumes the next
# values from the random stream, so keep the subjects in this order.
physics_grade = np.random.randint(**score_params)
anatomy_grade = np.random.randint(**score_params)
physiology_grade = np.random.randint(**score_params)
statistics_grade = np.random.randint(**score_params)
datascience_grade = np.random.randint(**score_params)

In [10]:
# Assemble the gradebook: one row per student, with a 'Name' column
# followed by one column of scores for each subject.
gradebook_df = pd.DataFrame(
    {
        'Name': students,
        'Physics': physics_grade,
        'Anatomy': anatomy_grade,
        'Physiology': physiology_grade,
        'Statistics': statistics_grade,
        'Data Science': datascience_grade,
    }
)

#### Display the newly created Pandas dataframe and admire your work.

In [11]:
# A bare expression on the last line of a cell is rendered as the cell's output.
gradebook_df

Unnamed: 0,Name,Physics,Anatomy,Physiology,Statistics,Data Science
0,Christopher,95,86,77,56,98
1,Ariel,52,82,84,95,77
2,Samantha,78,66,83,76,99
3,Robert,84,54,62,66,72
4,Gabriel,88,99,90,56,53
5,Thomas,67,53,53,64,53
6,Louie Lob,69,52,92,89,61
7,Tiana,92,70,55,61,71
8,Karina,72,89,50,57,75
9,Danielle,83,52,61,51,89


#### It's always a good idea to get some info on the dataframe you are working with.

In [12]:
# Summarize the frame: index range, column names, non-null counts, dtypes and memory usage.
gradebook_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          21 non-null     object
 1   Physics       21 non-null     int64 
 2   Anatomy       21 non-null     int64 
 3   Physiology    21 non-null     int64 
 4   Statistics    21 non-null     int64 
 5   Data Science  21 non-null     int64 
dtypes: int64(5), object(1)
memory usage: 1.1+ KB


#### Always get a description of your dataframe, too. The `describe()` method returns a statistical summary of all the numerical columns.

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
gradebook_df.describe()

Unnamed: 0,Physics,Anatomy,Physiology,Statistics,Data Science
count,21.0,21.0,21.0,21.0,21.0
mean,80.095238,76.238095,73.761905,72.761905,71.904762
std,13.508904,15.811087,14.049572,15.899386,15.378247
min,52.0,52.0,50.0,51.0,53.0
25%,69.0,66.0,62.0,61.0,59.0
50%,82.0,80.0,77.0,68.0,72.0
75%,92.0,88.0,84.0,89.0,84.0
max,99.0,99.0,94.0,99.0,99.0


#### Let's check out the attributes of our dataframe.

In [14]:
# The data type of each column.
gradebook_df.dtypes

Name            object
Physics          int64
Anatomy          int64
Physiology       int64
Statistics       int64
Data Science     int64
dtype: object

In [15]:
# The dimensions of the frame as (number of rows, number of columns).
gradebook_df.shape

(21, 6)

In [16]:
# The column labels, returned as a pandas Index.
gradebook_df.columns

Index(['Name', 'Physics', 'Anatomy', 'Physiology', 'Statistics',
       'Data Science'],
      dtype='object')

In [21]:
# The row labels; here the default RangeIndex that numbers the rows from 0.
gradebook_df.index

RangeIndex(start=0, stop=21, step=1)

### Let's subset our dataframe.

In [27]:
# Subset the dataframe by passing a list of column names inside the indexing
# brackets; the result contains only those columns, in the order listed.
gradebook_df[['Name','Physiology']]

Unnamed: 0,Name,Physiology
0,Christopher,77
1,Ariel,84
2,Samantha,83
3,Robert,62
4,Gabriel,90
5,Thomas,53
6,Louie Lob,92
7,Tiana,55
8,Karina,50
9,Danielle,61


### Create a list of column names and store it in a variable with a name that is meaningful to the project. This makes the columns easier to reference later.

In [40]:
# Keep the column names of interest in a plain Python list so they can be reused by name.
easy_stuff = [
    'Physics',
    'Physiology',
]

# Confirm the stored object is an ordinary list.
type(easy_stuff)

list

In [29]:
# Pass the stored list of column names into the dataframe and check the result;
# this selects columns the same way as writing the list inline.
gradebook_df[easy_stuff]

Unnamed: 0,Physics,Physiology
0,95,77
1,52,84
2,78,83
3,84,62
4,88,90
5,67,53
6,69,92
7,92,55
8,72,50
9,83,61


### Let's check the difference between what df['column_name'] and df[['column_name']] return.

In [34]:
# Select a single column with single brackets and a column-name string.
whats_this = gradebook_df['Physics']

In [37]:
# Inspect the type produced by the single-bracket selection.
type(whats_this)

pandas.core.series.Series

In [38]:
# Select the same column, but with double brackets (a one-element list of names).
how_about_this = gradebook_df[['Physics']]

In [39]:
# Inspect the type produced by the double-bracket (list) selection.
type(how_about_this)

pandas.core.frame.DataFrame

#### Dataframes can also be indexed with a boolean series. We can create a boolean series with the same length as our target column by applying a comparison operation to that column.

In [45]:
# Comparing a column with a scalar yields a boolean Series with one True/False value per row.
(gradebook_df.Physics > 95)

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11     True
12     True
13    False
14    False
15     True
16    False
17     True
18    False
19    False
20    False
Name: Physics, dtype: bool

In [49]:
# Use the boolean Series as a mask: only the rows where it is True are kept.
gradebook_df[gradebook_df.Physics > 95]

Unnamed: 0,Name,Physics,Anatomy,Physiology,Statistics,Data Science
11,Nayzeth,99,97,60,87,84
12,Dreython,97,98,72,75,53
15,Dora,96,85,86,62,80
17,Lilliana,97,88,93,67,59


In [54]:
# Rename the 'Name' column to 'Student'.
# rename() returns a new frame, which is bound back to the same variable instead of
# using inplace=True; in-place mutation offers no benefit and prevents method chaining.
gradebook_df = gradebook_df.rename(columns={'Name': 'Student'})

In [55]:
# Display the frame again to confirm the 'Name' column is now labelled 'Student'.
gradebook_df

Unnamed: 0,Student,Physics,Anatomy,Physiology,Statistics,Data Science
0,Christopher,95,86,77,56,98
1,Ariel,52,82,84,95,77
2,Samantha,78,66,83,76,99
3,Robert,84,54,62,66,72
4,Gabriel,88,99,90,56,53
5,Thomas,67,53,53,64,53
6,Louie Lob,69,52,92,89,61
7,Tiana,92,70,55,61,71
8,Karina,72,89,50,57,75
9,Danielle,83,52,61,51,89


In [56]:
# Add a boolean column flagging the students whose Physics score is above 95.
gradebook_df['The Elites'] = gradebook_df['Physics'] > 95

In [57]:
# Display the frame again to confirm the new 'The Elites' column was added.
gradebook_df

Unnamed: 0,Student,Physics,Anatomy,Physiology,Statistics,Data Science,The Elites
0,Christopher,95,86,77,56,98,False
1,Ariel,52,82,84,95,77,False
2,Samantha,78,66,83,76,99,False
3,Robert,84,54,62,66,72,False
4,Gabriel,88,99,90,56,53,False
5,Thomas,67,53,53,64,53,False
6,Louie Lob,69,52,92,89,61,False
7,Tiana,92,70,55,61,71,False
8,Karina,72,89,50,57,75,False
9,Danielle,83,52,61,51,89,False
