# DataFrame

The object DataFrame of the package pandas represents a table of data. Each column is a Series; the columns share a common index.

In [2]:
# print all the outputs in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np

In [4]:
%autosave 0

Autosave disabled


## Create a DataFrame

### From cmd

In [8]:
pd.DataFrame({'Austin':[30], 'Nicole':[50]})

Unnamed: 0,Austin,Nicole
0,30,50


### Write to a file

In [12]:
df_fruit = pd.DataFrame(
    {'Austin':[30,40], 'Nicole':[50,60]},
    index = ['2017', '2018']
)
df_fruit

Unnamed: 0,Austin,Nicole
2017,30,50
2018,40,60


### From a file

Place the data file in the same folder as the ipynb file. Then, read it as follows:

In [14]:
df = pd.read_csv('students.csv')

In [16]:
df

Unnamed: 0,Name,hw1,hw2,program
0,Demetria,2.0,4.0,MSIS
1,Dorian,10.0,10.0,MSIS
2,Garland,9.0,1.0,MSIS
3,Iluminada,2.0,,MBA
4,Jeannine,6.0,7.0,MSIS
5,Jenny,8.0,,
6,John,,10.0,MSIS
7,Lucy,7.0,7.0,MSIS
8,Mercy,5.0,6.0,MSIS
9,Michael,6.0,10.0,MBA


By the default, the index is 0, 1, ... 

Let us set the index as the column "Name".

In [20]:
df.set_index('Name')

Unnamed: 0_level_0,hw1,hw2,program
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Demetria,2.0,4.0,MSIS
Dorian,10.0,10.0,MSIS
Garland,9.0,1.0,MSIS
Iluminada,2.0,,MBA
Jeannine,6.0,7.0,MSIS
Jenny,8.0,,
John,,10.0,MSIS
Lucy,7.0,7.0,MSIS
Mercy,5.0,6.0,MSIS
Michael,6.0,10.0,MBA


## index, columns, values

<b>index</b> returns the index labels

In [22]:
df.index

RangeIndex(start=0, stop=11, step=1)

<b>columns</b> returns the list of column names (as an index object)

In [24]:
df.columns

Index(['Name', 'hw1', 'hw2', 'program'], dtype='object')

<b>values</b> returns a (2-dimensional) ndarray of values

In [26]:
df.values

array([['Demetria', 2.0, 4.0, 'MSIS'],
       ['Dorian', 10.0, 10.0, 'MSIS'],
       ['Garland', 9.0, 1.0, 'MSIS'],
       ['Iluminada', 2.0, nan, 'MBA'],
       ['Jeannine', 6.0, 7.0, 'MSIS'],
       ['Jenny', 8.0, nan, nan],
       ['John', nan, 10.0, 'MSIS'],
       ['Lucy', 7.0, 7.0, 'MSIS'],
       ['Mercy', 5.0, 6.0, 'MSIS'],
       ['Michael', 6.0, 10.0, 'MBA'],
       ['Shelby', 1.0, 10.0, 'MSIS']], dtype=object)

## df.iloc[x, y]

Access using the positional index. 
<ul>
<li><b>x</b> is the information needed to select the rows: positional index or range of integers</li>
<li><b>y (optional)</b> is the information needed to select the columns: positional index or range of integers</li>
</ul>

Access one row by specifying a positional index

In [32]:
df.iloc[0]

Name       Demetria
hw1             2.0
hw2             4.0
program        MSIS
Name: 0, dtype: object

Or, more simply:

In [36]:
df.iloc[:, 1]

0      2.0
1     10.0
2      9.0
3      2.0
4      6.0
5      8.0
6      NaN
7      7.0
8      5.0
9      6.0
10     1.0
Name: hw1, dtype: float64

Access one column by specifying positional index of the column

In [50]:
df.iloc[3,3]

'MBA'

Access one specific value

Access a subset of rows and of columns

In [56]:
df.iloc[0:5, [1,2]]

Unnamed: 0,hw1,hw2
0,2.0,4.0
1,10.0,10.0
2,9.0,1.0
3,2.0,
4,6.0,7.0


## df.loc[x, y]

Access using the index labels. 
<ul>
<li><b>x</b> is the information needed to select the rows: label index, range of index labels, or boolean masks</li>
<li><b>y (optional)</b> is the information needed to select the columns: label index, range of index labels, or boolean masks</li>
</ul>

Acccess one specific value by specifying index label and column name

In [58]:
df.loc[2, 'hw2']

1.0

Access one row by specifying index label

In [60]:
df.loc[2]

Name       Garland
hw1            9.0
hw2            1.0
program       MSIS
Name: 2, dtype: object

or, more simply:

Access one column by specifying index label

In [62]:
df.loc[1]

Name       Dorian
hw1          10.0
hw2          10.0
program      MSIS
Name: 1, dtype: object

Or, more simply:

In [90]:
df=pd.read_csv('students.csv', index_col='Name')
df

Unnamed: 0_level_0,hw1,hw2,program
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Demetria,2.0,4.0,MSIS
Dorian,10.0,10.0,MSIS
Garland,9.0,1.0,MSIS
Iluminada,2.0,,MBA
Jeannine,6.0,7.0,MSIS
Jenny,8.0,,
John,,10.0,MSIS
Lucy,7.0,7.0,MSIS
Mercy,5.0,6.0,MSIS
Michael,6.0,10.0,MBA


Common mistake: get the whole row about Lucy

In [96]:
df.loc['Lucy', :]

hw1         7.0
hw2         7.0
program    MSIS
Name: Lucy, dtype: object

Select those students whose name starts with 'J'

In [102]:
df.loc[df.index.str.startswith('J')]

Unnamed: 0_level_0,hw1,hw2,program
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jeannine,6.0,7.0,MSIS
Jenny,8.0,,
John,,10.0,MSIS


## Problems

In [104]:
df

Unnamed: 0_level_0,hw1,hw2,program
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Demetria,2.0,4.0,MSIS
Dorian,10.0,10.0,MSIS
Garland,9.0,1.0,MSIS
Iluminada,2.0,,MBA
Jeannine,6.0,7.0,MSIS
Jenny,8.0,,
John,,10.0,MSIS
Lucy,7.0,7.0,MSIS
Mercy,5.0,6.0,MSIS
Michael,6.0,10.0,MBA


#### Retrieve Shelby's hw1 grade

In [106]:
df.loc['Shelby', 'hw1']

1.0

#### Retrieve Shelby's information

In [108]:
df.loc['Shelby', :]

hw1         1.0
hw2        10.0
program    MSIS
Name: Shelby, dtype: object

#### Find all information about those students that obtained the highest grade in hw2. Note that there are ties

In [128]:
df.hw2.nlargest()

Name
Dorian      10.0
John        10.0
Michael     10.0
Shelby      10.0
Jeannine     7.0
Name: hw2, dtype: float64

#### Find those students who obtained the same score in hw1 and in hw2.

In [130]:
df[df.hw1 == df.hw2]

Unnamed: 0_level_0,hw1,hw2,program
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dorian,10.0,10.0,MSIS
Lucy,7.0,7.0,MSIS


#### Find the average hw1 score of those students who got a hw2 score greater than 5.

In [136]:
df[df.hw2 >= 5].hw1.mean()

5.833333333333333

## sort_values()

Sort the table based on the values of a set of columns (parameter <b>by</b>). 

Sorting by one column

In [148]:
df.sort_values(by=['hw1', 'hw2'], ascending=[True, False])

Unnamed: 0_level_0,hw1,hw2,program
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Shelby,1.0,10.0,MSIS
Demetria,2.0,4.0,MSIS
Iluminada,2.0,,MBA
Mercy,5.0,6.0,MSIS
Michael,6.0,10.0,MBA
Jeannine,6.0,7.0,MSIS
Lucy,7.0,7.0,MSIS
Jenny,8.0,,
Garland,9.0,1.0,MSIS
Dorian,10.0,10.0,MSIS


Sorting by more columns. For example, by hw1 descending and, in case of ties, by hw2 ascending

In [150]:
df.sort_values(by=['hw1', 'hw2'], ascending=[False, True])

Unnamed: 0_level_0,hw1,hw2,program
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dorian,10.0,10.0,MSIS
Garland,9.0,1.0,MSIS
Jenny,8.0,,
Lucy,7.0,7.0,MSIS
Jeannine,6.0,7.0,MSIS
Michael,6.0,10.0,MBA
Mercy,5.0,6.0,MSIS
Demetria,2.0,4.0,MSIS
Iluminada,2.0,,MBA
Shelby,1.0,10.0,MSIS


## sort_index

## head and tail

Returns the first (or last) n rows

In [156]:
df.head(1)
df.tail(1)
df

Unnamed: 0_level_0,hw1,hw2,program
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Demetria,2.0,4.0,MSIS


Unnamed: 0_level_0,hw1,hw2,program
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Shelby,1.0,10.0,MSIS


Unnamed: 0_level_0,hw1,hw2,program
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Demetria,2.0,4.0,MSIS
Dorian,10.0,10.0,MSIS
Garland,9.0,1.0,MSIS
Iluminada,2.0,,MBA
Jeannine,6.0,7.0,MSIS
Jenny,8.0,,
John,,10.0,MSIS
Lucy,7.0,7.0,MSIS
Mercy,5.0,6.0,MSIS
Michael,6.0,10.0,MBA


## Problems

#### Sort the MSIS students by hw2 descending.

In [162]:
df[df.program=='MSIS'].sort_values(by='hw2', ascending=False)

Unnamed: 0_level_0,hw1,hw2,program
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dorian,10.0,10.0,MSIS
John,,10.0,MSIS
Shelby,1.0,10.0,MSIS
Jeannine,6.0,7.0,MSIS
Lucy,7.0,7.0,MSIS
Mercy,5.0,6.0,MSIS
Demetria,2.0,4.0,MSIS
Garland,9.0,1.0,MSIS


####  Show <b>only</b> the field <i>hw1</i> of the students with the largest hw2 grade

In [190]:
df[df.hw2==df.hw2.max()].loc[:,'hw1']

Name
Dorian     10.0
John        NaN
Michael     6.0
Shelby      1.0
Name: hw1, dtype: float64

## mean, min, max, etc

Aggregate functions are broadcasted to all columns (axis = 0, which is the default) or rows (axis = 1). Numeric aggregators will be executed only on numeric data.

The average for each hw

In [194]:
df['hw1'].mean()
df.hw1.mean()

5.6

5.6

The average for each student

In [202]:
df.mean(axis=1,numeric_only=True)

Name
Demetria      3.0
Dorian       10.0
Garland       5.0
Iluminada     2.0
Jeannine      6.5
Jenny         8.0
John         10.0
Lucy          7.0
Mercy         5.5
Michael       8.0
Shelby        5.5
dtype: float64

## Problems

#### Compute the spread (i.e., highest minus lowest hw grade) of each student

In [208]:
df

Unnamed: 0_level_0,hw1,hw2,program
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Demetria,2.0,4.0,MSIS
Dorian,10.0,10.0,MSIS
Garland,9.0,1.0,MSIS
Iluminada,2.0,,MBA
Jeannine,6.0,7.0,MSIS
Jenny,8.0,,
John,,10.0,MSIS
Lucy,7.0,7.0,MSIS
Mercy,5.0,6.0,MSIS
Michael,6.0,10.0,MBA


let's try to use .max(axis=1) , find each row/student's max. Use .min(axis=1) to find the min.

In [212]:
max = df.max(axis=1, numeric_only=True)
min = df.min(axis=1, numeric_only=True)
print(max-min)

Name
Demetria     2.0
Dorian       0.0
Garland      8.0
Iluminada    0.0
Jeannine     1.0
Jenny        0.0
John         0.0
Lucy         0.0
Mercy        1.0
Michael      4.0
Shelby       9.0
dtype: float64


Or 

#### Who has the largest spread?

In [216]:
(max-min).nlargest()

Name
Shelby      9.0
Garland     8.0
Michael     4.0
Demetria    2.0
Jeannine    1.0
dtype: float64

## Modifying DataFrames

Make a copy of the data frame

In [218]:
df

Unnamed: 0_level_0,hw1,hw2,program
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Demetria,2.0,4.0,MSIS
Dorian,10.0,10.0,MSIS
Garland,9.0,1.0,MSIS
Iluminada,2.0,,MBA
Jeannine,6.0,7.0,MSIS
Jenny,8.0,,
John,,10.0,MSIS
Lucy,7.0,7.0,MSIS
Mercy,5.0,6.0,MSIS
Michael,6.0,10.0,MBA


### Add rows

A new student has joined. His name is Oliver and he is the MSIS program; his hw1 is missing and his hw2 score is 8.

In [226]:
import pandas as pd
import numpy as np

# 假设 df 是原有的 DataFrame
new_student = pd.DataFrame({'hw1': [np.nan], 'hw2': [8], 'program': ['MSIS']}, index=['Oliver'])

# 使用 pd.concat() 追加数据
df = pd.concat([df, new_student])

print(df)


            hw1   hw2 program
Demetria    2.0   4.0    MSIS
Dorian     10.0  10.0    MSIS
Garland     9.0   1.0    MSIS
Iluminada   2.0   NaN     MBA
Jeannine    6.0   7.0    MSIS
Jenny       8.0   NaN     NaN
John        NaN  10.0    MSIS
Lucy        7.0   7.0    MSIS
Mercy       5.0   6.0    MSIS
Michael     6.0  10.0     MBA
Shelby      1.0  10.0    MSIS
Oliver      NaN   8.0    MSIS


A new student has joined. Her name is Caroline and she got 4 in hw2. She is not in any program yet.

In [228]:
new_student = pd.DataFrame({'hw1':[np.nan], 'hw2':4, "program":[np.nan]}, index=['Caroline'])
df = pd.concat([df, new_student])
df

Unnamed: 0,hw1,hw2,program
Demetria,2.0,4.0,MSIS
Dorian,10.0,10.0,MSIS
Garland,9.0,1.0,MSIS
Iluminada,2.0,,MBA
Jeannine,6.0,7.0,MSIS
Jenny,8.0,,
John,,10.0,MSIS
Lucy,7.0,7.0,MSIS
Mercy,5.0,6.0,MSIS
Michael,6.0,10.0,MBA


### Add columns

Add an "empty" column <b>hw3</b>

In [240]:
df['hw3'] = 0
df

Unnamed: 0,hw1,hw2,program,hw3
Demetria,2.0,4.0,MSIS,0
Dorian,10.0,10.0,MSIS,0
Garland,9.0,1.0,MSIS,0
Iluminada,2.0,,MBA,0
Jeannine,6.0,7.0,MSIS,0
Jenny,8.0,,,0
John,,10.0,MSIS,0
Lucy,7.0,7.0,MSIS,0
Mercy,5.0,6.0,MSIS,0
Michael,6.0,10.0,MBA,0


### Add calculated columns

In [244]:
df['finalGrade'] = 0.2 * df['hw1'] + 0.8 * df['hw2']
df

Unnamed: 0,hw1,hw2,program,hw3,finalGrade
Demetria,2.0,4.0,MSIS,0,3.6
Dorian,10.0,10.0,MSIS,0,10.0
Garland,9.0,1.0,MSIS,0,2.6
Iluminada,2.0,,MBA,0,
Jeannine,6.0,7.0,MSIS,0,6.8
Jenny,8.0,,,0,
John,,10.0,MSIS,0,
Lucy,7.0,7.0,MSIS,0,7.0
Mercy,5.0,6.0,MSIS,0,5.8
Michael,6.0,10.0,MBA,0,9.2


Let's add a column with the final grade. It is computed as 0.2\*hw1 + 0.8\*hw2.