In [1]:
import numpy as np
import pandas as pd

In [4]:
# Column-oriented toy employee table: each key is a column name and each list
# holds that column's six values in row order.
data_dict = {
    "Department": ["Software", "Software", "Marketing", "Marketing", "HR", "Law"],
    "Name": ["John", "Gabriele", "Nicolo", "Antonio", "Goytacaz", "Wendel"],
    "Wage": [100, 150, 200, 300, 400, 500],
}

In [5]:
# Build the employee DataFrame from the column-wise dict; the bare name on the
# last line makes the notebook render the table.
df = pd.DataFrame(data=data_dict)
df

Unnamed: 0,Department,Name,Wage
0,Software,John,100
1,Software,Gabriele,150
2,Marketing,Nicolo,200
3,Marketing,Antonio,300
4,HR,Goytacaz,400
5,Law,Wendel,500


In [6]:
# Distinct department labels, returned as an array (see the output below).
df["Department"].unique()

array(['Software', 'Marketing', 'HR', 'Law'], dtype=object)

In [7]:
# Number of distinct department labels.
df["Department"].nunique()

4

In [8]:
# Headcount per department: how many rows carry each department label.
df["Department"].value_counts()

Software     2
Marketing    2
HR           1
Law          1
Name: Department, dtype: int64

### Applying a Function to a DataFrame Column

In [9]:
def calculate_new_wage(wage, rate=0.66):
    """Scale a wage by a multiplier.

    Parameters
    ----------
    wage : number or object supporting ``*``
        The current wage. A scalar works, as does anything that can be
        multiplied by a float (e.g. a pandas Series).
    rate : float, optional
        Multiplier applied to ``wage``. Defaults to 0.66, the factor that was
        previously hard-coded, so existing one-argument calls are unaffected.

    Returns
    -------
    The result of ``wage * rate``.
    """
    return wage * rate

In [11]:
# Call calculate_new_wage on each value of the "Wage" column. The result is
# only displayed, not assigned, so df keeps its original wages.
df["Wage"].apply(calculate_new_wage)

0     66.0
1     99.0
2    132.0
3    198.0
4    264.0
5    330.0
Name: Wage, dtype: float64

### Are there any NaN values?

In [12]:
# Element-wise missing-value mask. Every entry is False in the output below,
# so this table contains no NaN values.
df.isnull()

Unnamed: 0,Department,Name,Wage
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False


#### 

### Pivot Table

In [13]:
# Cartoon characters, stored column-wise: three Simpson rows followed by three
# South Park rows. Note: this rebinds the name data_dict used by the employee
# example earlier in the notebook.
data_dict = {
    "Character class": ["Simpson"] * 3 + ["South Park"] * 3,
    "Character name": ["Homer", "Bart", "Marge", "Cartman", "Kenny", "Kyle"],
    "Character age": [9, 10, 50, 20, 25, 35],
}

In [14]:
# Build the character DataFrame from the column-wise dict and display it.
df = pd.DataFrame(data=data_dict)
df

Unnamed: 0,Character class,Character name,Character age
0,Simpson,Homer,9
1,Simpson,Bart,10
2,Simpson,Marge,50
3,South Park,Cartman,20
4,South Park,Kenny,25
5,South Park,Kyle,35


In [15]:
# Passing a list to `index` gives the pivot table a MultiIndex on its rows
# (character class first, then character name). Each (class, name) pair occurs
# once here, so the displayed ages match the raw ages.
df.pivot_table(
    values="Character age",
    index=["Character class", "Character name"],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Character age
Character class,Character name,Unnamed: 2_level_1
Simpson,Bart,10
Simpson,Homer,9
Simpson,Marge,50
South Park,Cartman,20
South Park,Kenny,25
South Park,Kyle,35


In [16]:
# Same character table with one extra row: a second "Kyle" (age 10), so the
# (South Park, Kyle) pair now appears twice.
data_dict2 = {
    "Character class": ["Simpson"] * 3 + ["South Park"] * 4,
    "Character name": ["Homer", "Bart", "Marge", "Cartman", "Kenny", "Kyle", "Kyle"],
    "Character age": [9, 10, 50, 20, 25, 35, 10],
}

df2 = pd.DataFrame(data=data_dict2)
df2

Unnamed: 0,Character class,Character name,Character age
0,Simpson,Homer,9
1,Simpson,Bart,10
2,Simpson,Marge,50
3,South Park,Cartman,20
4,South Park,Kenny,25
5,South Park,Kyle,35
6,South Park,Kyle,10


In [18]:
# With the extra Kyle row, the (South Park, Kyle) pair has two ages (35 and 10).
# No aggfunc is given here, and the pivot table shows Kyle's age as the average
# of the two values: 22.5.
df2.pivot_table(
    values="Character age",
    index=["Character class", "Character name"],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Character age
Character class,Character name,Unnamed: 2_level_1
Simpson,Bart,10.0
Simpson,Homer,9.0
Simpson,Marge,50.0
South Park,Cartman,20.0
South Park,Kenny,25.0
South Park,Kyle,22.5


In [19]:
# To add up duplicate entries instead of averaging them, request the "sum"
# aggregation. The string alias is used rather than np.sum because recent
# pandas releases emit a FutureWarning when a NumPy reducer is passed as
# aggfunc; the resulting table is the same (Kyle -> 35 + 10 = 45).
df2.pivot_table(
    values="Character age",
    index=["Character class", "Character name"],
    aggfunc="sum",
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Character age
Character class,Character name,Unnamed: 2_level_1
Simpson,Bart,10
Simpson,Homer,9
Simpson,Marge,50
South Park,Cartman,20
South Park,Kenny,25
South Park,Kyle,45
