# Modifying DataFrames

## Setup

In [1]:
import pandas as pd

## Creation

Creation of an example DataFrame (starting from a dictionary of dictionaries):

In [2]:
# Example data as a dictionary of dictionaries: once passed to `pd.DataFrame`,
# every outer key becomes a column and every inner key (a country name)
# becomes a row label.
data = {
    "Capital": {
        "Spain": "Madrid",
        "Belgium": "Brussels",
        "France": "Paris",
        "Italy": "Roma",
        "Germany": "Berlin",
        "Portugal": "Lisbon",
        "Norway": "Oslo",
        "Greece": "Athens",
    },
    # Absolute counts; the notebook rescales this column to millions later on.
    "Population": {
        "Spain": 46733038,
        "Belgium": 11449656,
        "France": 67076000,
        "Italy": 60390560,
        "Germany": 83122889,
        "Portugal": 10295909,
        "Norway": 5391369,
        "Greece": 10718565,
    },
    # Only three countries have an entry here; the remaining rows end up as
    # missing values in the resulting DataFrame.
    "Monarch": {
        "Spain": "Felipe VI",
        "Belgium": "Philippe",
        "Norway": "Harald V",
    },
    # Presumably square kilometres (the column is later renamed to
    # "Area [km*km]").
    "Area": {
        "Spain": 505990,
        "Belgium": 30688,
        "France": 640679,
        "Italy": 301340,
        "Germany": 357022,
        "Portugal": 92212,
        "Norway": 385207,
        "Greece": 131957,
    },
}

In [3]:
# Setup only -- the details of this step are not the topic right now:
# build the frame and store both text columns with the "string" dtype.
df = pd.DataFrame(data).astype({"Capital": "string", "Monarch": "string"})

Apple stock data, taken from the [`matplotlib` sample datasets](https://github.com/matplotlib/sample_data/blob/master/aapl.csv)

In [4]:
# Setup only -- the details of this step are not the topic right now:
# load the CSV, turn "Date" into timestamps, use it as the index and sort
# the rows chronologically.
apple = (
    pd.read_csv("AAPL.csv")
    .astype({"Date": "datetime64[ns]"})
    .set_index("Date")
    .sort_index()
)

## Demo 1: Create or modify a column

In [5]:
df

Unnamed: 0,Capital,Population,Monarch,Area
Spain,Madrid,46733038,Felipe VI,505990
Belgium,Brussels,11449656,Philippe,30688
France,Paris,67076000,,640679
Italy,Roma,60390560,,301340
Germany,Berlin,83122889,,357022
Portugal,Lisbon,10295909,,92212
Norway,Oslo,5391369,Harald V,385207
Greece,Athens,10718565,,131957


Add a new column:

In [6]:
df["Population"]

Spain       46733038
Belgium     11449656
France      67076000
Italy       60390560
Germany     83122889
Portugal    10295909
Norway       5391369
Greece      10718565
Name: Population, dtype: int64

In [7]:
df["Area"]

Spain       505990
Belgium      30688
France      640679
Italy       301340
Germany     357022
Portugal     92212
Norway      385207
Greece      131957
Name: Area, dtype: int64

In [8]:
df["Population"] / df["Area"]

Spain        92.359608
Belgium     373.098801
France      104.695175
Italy       200.406717
Germany     232.822876
Portugal    111.654763
Norway       13.996031
Greece       81.227711
dtype: float64

In [9]:
# Element-wise ratio of the two columns, stored as a brand-new column.
df["Density"] = df["Population"].div(df["Area"])

In [10]:
df

Unnamed: 0,Capital,Population,Monarch,Area,Density
Spain,Madrid,46733038,Felipe VI,505990,92.359608
Belgium,Brussels,11449656,Philippe,30688,373.098801
France,Paris,67076000,,640679,104.695175
Italy,Roma,60390560,,301340,200.406717
Germany,Berlin,83122889,,357022,232.822876
Portugal,Lisbon,10295909,,92212,111.654763
Norway,Oslo,5391369,Harald V,385207,13.996031
Greece,Athens,10718565,,131957,81.227711


Modify a column:

In [15]:
df["Population"]

Spain       46733038
Belgium     11449656
France      67076000
Italy       60390560
Germany     83122889
Portugal    10295909
Norway       5391369
Greece      10718565
Name: Population, dtype: int64

In [16]:
df["Population"] / 1_000_000

Spain       46.733038
Belgium     11.449656
France      67.076000
Italy       60.390560
Germany     83.122889
Portugal    10.295909
Norway       5.391369
Greece      10.718565
Name: Population, dtype: float64

In [13]:
# Overwrite the column so that it is expressed in millions.
# The new values are derived from the raw `data` dictionary instead of from the
# column's current contents: `df["Population"] = df["Population"] / 1_000_000`
# would divide by a million again on every re-run of this cell, whereas this
# form always yields the same result. The Series is aligned with `df` on the
# country labels when it is assigned.
df["Population"] = pd.Series(data["Population"]) / 1_000_000

In [17]:
df

Unnamed: 0,Capital,Population,Monarch,Area,Density
Spain,Madrid,46.733038,Felipe VI,505990,92.359608
Belgium,Brussels,11.449656,Philippe,30688,373.098801
France,Paris,67.076,,640679,104.695175
Italy,Roma,60.39056,,301340,200.406717
Germany,Berlin,83.122889,,357022,232.822876
Portugal,Lisbon,10.295909,,92212,111.654763
Norway,Oslo,5.391369,Harald V,385207,13.996031
Greece,Athens,10.718565,,131957,81.227711


## Exercise 1

In [18]:
apple.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1984-09-07,26.5,26.87,26.25,26.5,2981600,3.02
1984-09-10,26.5,26.62,25.87,26.37,2346400,3.01
1984-09-11,26.62,27.37,26.62,26.87,5444000,3.07
1984-09-12,26.87,27.0,26.12,26.12,4773600,2.98
1984-09-13,27.5,27.62,27.5,27.5,7429600,3.14


Create a new "Spread" column, with the difference between "High" and "Low":

In [19]:
# Daily trading range: highest price minus lowest price, row by row.
apple["Spread"] = apple["High"].sub(apple["Low"])

In [20]:
apple

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Adj Close,Spread
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1984-09-07,26.50,26.87,26.25,26.50,2981600,3.02,0.62
1984-09-10,26.50,26.62,25.87,26.37,2346400,3.01,0.75
1984-09-11,26.62,27.37,26.62,26.87,5444000,3.07,0.75
1984-09-12,26.87,27.00,26.12,26.12,4773600,2.98,0.88
1984-09-13,27.50,27.62,27.50,27.50,7429600,3.14,0.12
...,...,...,...,...,...,...,...
2008-10-08,85.91,96.33,85.68,89.79,78847900,89.79,10.65
2008-10-09,93.35,95.80,86.60,88.74,57763700,88.74,9.20
2008-10-10,85.70,100.00,85.00,96.80,79260700,96.80,15.00
2008-10-13,104.55,110.53,101.02,110.26,54967000,110.26,9.51


## Demo 2: Reorder columns

In [21]:
df

Unnamed: 0,Capital,Population,Monarch,Area,Density
Spain,Madrid,46.733038,Felipe VI,505990,92.359608
Belgium,Brussels,11.449656,Philippe,30688,373.098801
France,Paris,67.076,,640679,104.695175
Italy,Roma,60.39056,,301340,200.406717
Germany,Berlin,83.122889,,357022,232.822876
Portugal,Lisbon,10.295909,,92212,111.654763
Norway,Oslo,5.391369,Harald V,385207,13.996031
Greece,Athens,10.718565,,131957,81.227711


Reorder columns:

In [22]:
df.columns

Index(['Capital', 'Population', 'Monarch', 'Area', 'Density'], dtype='object')

In [23]:
# Desired column order; every existing column is listed, so nothing is dropped.
columns = ["Population", "Area", "Density", "Capital", "Monarch"]

In [24]:
# The reordered result is assigned back to `df` so that the new column order is kept.
df = df.reindex(columns=columns)

In [25]:
df

Unnamed: 0,Population,Area,Density,Capital,Monarch
Spain,46.733038,505990,92.359608,Madrid,Felipe VI
Belgium,11.449656,30688,373.098801,Brussels,Philippe
France,67.076,640679,104.695175,Paris,
Italy,60.39056,301340,200.406717,Roma,
Germany,83.122889,357022,232.822876,Berlin,
Portugal,10.295909,92212,111.654763,Lisbon,
Norway,5.391369,385207,13.996031,Oslo,Harald V
Greece,10.718565,131957,81.227711,Athens,


In [26]:
df.columns

Index(['Population', 'Area', 'Density', 'Capital', 'Monarch'], dtype='object')

## Exercise 2

In [27]:
apple.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Adj Close,Spread
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1984-09-07,26.5,26.87,26.25,26.5,2981600,3.02,0.62
1984-09-10,26.5,26.62,25.87,26.37,2346400,3.01,0.75
1984-09-11,26.62,27.37,26.62,26.87,5444000,3.07,0.75
1984-09-12,26.87,27.0,26.12,26.12,4773600,2.98,0.88
1984-09-13,27.5,27.62,27.5,27.5,7429600,3.14,0.12


Reorder the columns so that "Open" and "Close" come first, then "High", "Low", and "Spread", followed by "Volume" ("Adj Close" is dropped because it is not in the new list):

In [28]:
# Preview only: the result is displayed but not stored, so `apple` still has
# its original columns after this cell.
apple.reindex(
    columns=["Open", "Close", "High", "Low", "Spread", "Volume"],
)

Unnamed: 0_level_0,Open,Close,High,Low,Spread,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1984-09-07,26.50,26.50,26.87,26.25,0.62,2981600
1984-09-10,26.50,26.37,26.62,25.87,0.75,2346400
1984-09-11,26.62,26.87,27.37,26.62,0.75,5444000
1984-09-12,26.87,26.12,27.00,26.12,0.88,4773600
1984-09-13,27.50,27.50,27.62,27.50,0.12,7429600
...,...,...,...,...,...,...
2008-10-08,85.91,89.79,96.33,85.68,10.65,78847900
2008-10-09,93.35,88.74,95.80,86.60,9.20,57763700
2008-10-10,85.70,96.80,100.00,85.00,15.00,79260700
2008-10-13,104.55,110.26,110.53,101.02,9.51,54967000


In [29]:
# Keep the reordered frame ("Adj Close" is not listed, so it is left out).
apple = apple.reindex(
    columns=["Open", "Close", "High", "Low", "Spread", "Volume"],
)

## Demo 3: Rename columns

In [30]:
df

Unnamed: 0,Population,Area,Density,Capital,Monarch
Spain,46.733038,505990,92.359608,Madrid,Felipe VI
Belgium,11.449656,30688,373.098801,Brussels,Philippe
France,67.076,640679,104.695175,Paris,
Italy,60.39056,301340,200.406717,Roma,
Germany,83.122889,357022,232.822876,Berlin,
Portugal,10.295909,92212,111.654763,Lisbon,
Norway,5.391369,385207,13.996031,Oslo,Harald V
Greece,10.718565,131957,81.227711,Athens,


Rename columns:

In [31]:
df.columns

Index(['Population', 'Area', 'Density', 'Capital', 'Monarch'], dtype='object')

In [32]:
# Mapping of current column name -> new column name. Columns that are not
# listed ("Density", "Capital") keep their name.
# NOTE: this rebinds `columns`, which held a list in the reorder demo above.
columns = {
    "Population": "Population [M]",
    "Area": "Area [km*km]",
    "Monarch": "King/Queen",
}

In [33]:
# The renamed result is assigned back to `df` so that the new labels are kept.
df = df.rename(columns=columns)

In [34]:
df

Unnamed: 0,Population [M],Area [km*km],Density,Capital,King/Queen
Spain,46.733038,505990,92.359608,Madrid,Felipe VI
Belgium,11.449656,30688,373.098801,Brussels,Philippe
France,67.076,640679,104.695175,Paris,
Italy,60.39056,301340,200.406717,Roma,
Germany,83.122889,357022,232.822876,Berlin,
Portugal,10.295909,92212,111.654763,Lisbon,
Norway,5.391369,385207,13.996031,Oslo,Harald V
Greece,10.718565,131957,81.227711,Athens,


In [35]:
df.columns

Index(['Population [M]', 'Area [km*km]', 'Density', 'Capital', 'King/Queen'], dtype='object')

## Exercise 3

In [36]:
apple.head()

Unnamed: 0_level_0,Open,Close,High,Low,Spread,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1984-09-07,26.5,26.5,26.87,26.25,0.62,2981600
1984-09-10,26.5,26.37,26.62,25.87,0.75,2346400
1984-09-11,26.62,26.87,27.37,26.62,0.75,5444000
1984-09-12,26.87,26.12,27.0,26.12,0.88,4773600
1984-09-13,27.5,27.5,27.62,27.5,0.12,7429600


Rename the "Spread" column to "Difference", and the "Open" column to "Open Value":

In [37]:
# Current column name -> new column name.
new_cols = {"Spread": "Difference", "Open": "Open Value"}

In [38]:
# The renamed result is assigned back to `apple` so that the new labels are kept.
apple = apple.rename(columns=new_cols)

In [39]:
apple

Unnamed: 0_level_0,Open Value,Close,High,Low,Difference,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1984-09-07,26.50,26.50,26.87,26.25,0.62,2981600
1984-09-10,26.50,26.37,26.62,25.87,0.75,2346400
1984-09-11,26.62,26.87,27.37,26.62,0.75,5444000
1984-09-12,26.87,26.12,27.00,26.12,0.88,4773600
1984-09-13,27.50,27.50,27.62,27.50,0.12,7429600
...,...,...,...,...,...,...
2008-10-08,85.91,89.79,96.33,85.68,10.65,78847900
2008-10-09,93.35,88.74,95.80,86.60,9.20,57763700
2008-10-10,85.70,96.80,100.00,85.00,15.00,79260700
2008-10-13,104.55,110.26,110.53,101.02,9.51,54967000
