# How to create new columns derived from existing columns?

In [1]:
import pandas as pd
import numpy as np

In [6]:
# NO2 measurements hosted in the pandas repository. The repository's default
# branch is `main`; raw.githubusercontent.com URLs are not redirected after a
# branch rename, so the old `master` path is not a reliable address.
url = "https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/air_quality_no2.csv"
# The first CSV column (datetime) becomes the index and is parsed as dates.
air_quality = pd.read_csv(url, index_col=0, parse_dates=True)
air_quality.head()

Unnamed: 0_level_0,station_antwerp,station_paris,station_london
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-05-07 02:00:00,,,23.0
2019-05-07 03:00:00,50.5,25.0,19.0
2019-05-07 04:00:00,45.0,27.7,19.0
2019-05-07 05:00:00,,50.4,16.0
2019-05-07 06:00:00,,61.9,


# I want to express the NO2 concentration of the station in London in mg/m3
# (If we assume temperature of 25 degrees Celsius and pressure of 1013 hPa, the conversion factor is 1.882)

In [7]:
# To create a new column, put the new column name inside [] on the left-hand
# side of the assignment; the scaling below is applied element-wise.
air_quality["london_mg_per_cubic"] = air_quality["station_london"].mul(1.882)
air_quality.head()

Unnamed: 0_level_0,station_antwerp,station_paris,station_london,london_mg_per_cubic
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-05-07 02:00:00,,,23.0,43.286
2019-05-07 03:00:00,50.5,25.0,19.0,35.758
2019-05-07 04:00:00,45.0,27.7,19.0,35.758
2019-05-07 05:00:00,,50.4,16.0,30.112
2019-05-07 06:00:00,,61.9,,


In [8]:
# Same conversion for the Paris station: scale every value by 1.882 and store
# the result under a new column name.
air_quality["paris_mg_per_cubic"] = air_quality["station_paris"].mul(1.882)
air_quality.head()

Unnamed: 0_level_0,station_antwerp,station_paris,station_london,london_mg_per_cubic,paris_mg_per_cubic
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-05-07 02:00:00,,,23.0,43.286,
2019-05-07 03:00:00,50.5,25.0,19.0,35.758,47.05
2019-05-07 04:00:00,45.0,27.7,19.0,35.758,52.1314
2019-05-07 05:00:00,,50.4,16.0,30.112,94.8528
2019-05-07 06:00:00,,61.9,,,116.4958


In [9]:
# Element-wise ratio of the Paris values to the Antwerp values; rows where
# either station has no measurement come out as NaN.
air_quality["ratio_parisVsantwerp"] = air_quality["station_paris"].div(
    air_quality["station_antwerp"]
)
air_quality.head()

Unnamed: 0_level_0,station_antwerp,station_paris,station_london,london_mg_per_cubic,paris_mg_per_cubic,ratio_parisVsantwerp
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-05-07 02:00:00,,,23.0,43.286,,
2019-05-07 03:00:00,50.5,25.0,19.0,35.758,47.05,0.49505
2019-05-07 04:00:00,45.0,27.7,19.0,35.758,52.1314,0.615556
2019-05-07 05:00:00,,50.4,16.0,30.112,94.8528,
2019-05-07 06:00:00,,61.9,,,116.4958,


# I want to rename the data columns to the corresponding station identifiers used by openAQ

In [10]:
# Map each station column to its new label; columns that are not listed in the
# mapping keep their current names. `rename` returns a new DataFrame, so
# `air_quality` itself is left unchanged.
air_quality_renamed = air_quality.rename(
    {
        "station_antwerp": "BETR801",
        "station_paris": "FR04014",
        "station_london": "London Westminster",
    },
    axis="columns",
)
air_quality_renamed.head()

Unnamed: 0_level_0,BETR801,FR04014,London Westminster,london_mg_per_cubic,paris_mg_per_cubic,ratio_parisVsantwerp
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-05-07 02:00:00,,,23.0,43.286,,
2019-05-07 03:00:00,50.5,25.0,19.0,35.758,47.05,0.49505
2019-05-07 04:00:00,45.0,27.7,19.0,35.758,52.1314,0.615556
2019-05-07 05:00:00,,50.4,16.0,30.112,94.8528,
2019-05-07 06:00:00,,61.9,,,116.4958,


In [12]:
# A function can be passed instead of a mapping: it is applied to every column
# label, here upper-casing each one.
air_quality_renamed = air_quality_renamed.rename(str.upper, axis="columns")
air_quality_renamed.head()

Unnamed: 0_level_0,BETR801,FR04014,LONDON WESTMINSTER,LONDON_MG_PER_CUBIC,PARIS_MG_PER_CUBIC,RATIO_PARISVSANTWERP
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-05-07 02:00:00,,,23.0,43.286,,
2019-05-07 03:00:00,50.5,25.0,19.0,35.758,47.05,0.49505
2019-05-07 04:00:00,45.0,27.7,19.0,35.758,52.1314,0.615556
2019-05-07 05:00:00,,50.4,16.0,30.112,94.8528,
2019-05-07 06:00:00,,61.9,,,116.4958,


# `rename` also accepts axis-style arguments
#df.rename({"one": "foo", "two": "bar"}, axis="columns")
#df.rename({"one": "foo", "two": "bar"}, axis="index")

# REMEMBER
Create a new column by assigning the output to the DataFrame with a new column name in between the [].

Operations are element-wise, no need to loop over rows.

Use rename with a dictionary or function to rename row labels or column names.

Assigning to a row inside an `iterrows()` loop may have no effect on the DataFrame, because each row can be a copy rather than a view:


for index, row in df.iterrows():  # illustrative only: `df` is not defined in the cells shown here
    row["a"] = 10  # `row` may be a copy, so this write is not guaranteed to reach `df`; prefer df["a"] = 10