# Using Data Structures Effectively 

## pandas DataFrames

### DataFrame functionality

In [1]:
import pandas as pd

In [2]:
# Yearly values for the USA, keyed by year label; the dict's insertion
# order becomes the order of the Series index.
usa_data = pd.Series(
    {
        "2000": 13.33,
        "2001": 14.02,
        "2002": 14.02,
        "2003": 14.25,
    }
)

In [3]:
# Echo the Series as the cell's last expression so the notebook renders it
# (year labels on the left, float64 values on the right).
usa_data

2000    13.33
2001    14.02
2002    14.02
2003    14.25
dtype: float64

In [7]:
# Yearly values for Nepal, keyed by the same year labels used for the USA
# Series so the two can be aligned on their index.
nepal_data = pd.Series(
    {
        "2000": 9.02,
        "2001": 9.01,
        "2002": 8.84,
        "2003": 8.84,
    }
)

In [8]:
# Combine the two country Series into one table: each dict key becomes a
# column name and the shared year labels become the row index.
df = pd.DataFrame(
    data={
        "USA": usa_data,
        "Nepal": nepal_data,
    }
)

In [9]:
# Render the combined frame: one row per year label, one column per country.
df

Unnamed: 0,USA,Nepal
2000,13.33,9.02
2001,14.02,9.01
2002,14.02,8.84
2003,14.25,8.84


### DataFrame performance

In [10]:
# Load the indicator workbook, discard the descriptive metadata columns, and
# flip the table so that each GeoAreaName becomes a column and each remaining
# (year) column becomes a row.
# NOTE: this rebinds `df`, replacing the small USA/Nepal frame built earlier.
df = (
    pd.read_excel("../data/SG_GEN_PARL.xlsx")
    .drop(
        columns=[
            "Goal",
            "Target",
            "Indicator",
            "SeriesCode",
            "SeriesDescription",
            "GeoAreaCode",
            "Reporting Type",
            "Sex",
            "Units",
        ]
    )
    .set_index("GeoAreaName")
    .T
)

In [11]:
# Show the reshaped frame: years down the rows, one column per GeoAreaName.
# The frame is too wide to display in full, so the rendered output elides the
# middle columns with "...".
df

GeoAreaName,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,United Republic of Tanzania,United States of America,Uruguay,Uzbekistan,Vanuatu,Venezuela (Bolivarian Republic of),Viet Nam,Yemen,Zambia,Zimbabwe
2000,,5.16,3.16,7.14,15.45,5.26,28.02,3.05,22.45,26.78,...,22.18,13.33,12.12,6.8,0.0,12.14,26.0,0.66,10.13,14.0
2001,,5.16,3.42,7.14,15.45,5.26,26.46,3.05,22.97,26.78,...,22.18,14.02,12.12,7.2,0.0,9.7,26.0,0.66,10.13,9.33
2002,,5.71,3.42,14.29,15.45,5.26,30.74,3.05,25.33,26.78,...,22.26,14.02,12.12,7.2,0.0,9.7,26.0,0.67,12.03,10.0
2003,,5.71,6.17,14.29,15.45,5.26,30.74,3.05,25.33,33.88,...,22.26,14.25,12.12,7.2,1.92,9.7,27.31,0.67,12.03,10.0
2004,,5.71,6.17,14.29,15.45,5.26,30.74,4.58,25.33,33.88,...,21.36,14.25,12.12,7.2,1.92,9.7,27.31,0.33,12.03,10.0
2005,,6.43,6.17,14.29,15.0,10.53,33.73,5.34,24.67,33.88,...,21.36,14.94,12.12,17.5,3.85,9.7,27.31,0.33,12.03,10.0
2006,27.31,7.14,6.17,28.57,15.0,10.53,36.19,5.34,24.67,33.88,...,30.41,15.17,11.11,17.5,3.85,17.37,27.31,0.33,12.66,16.0
2007,27.31,7.14,6.17,28.57,15.0,10.53,35.02,5.34,24.67,32.24,...,30.41,16.32,11.11,17.5,3.85,17.96,27.31,0.33,14.65,16.67
2008,27.69,7.14,7.71,25.0,15.0,10.53,40.0,9.16,26.67,32.79,...,30.41,16.78,12.12,17.5,3.85,18.56,25.76,0.33,15.19,16.0
2009,27.69,7.14,7.71,25.0,37.27,10.53,40.0,8.4,26.67,27.32,...,30.41,17.01,12.12,17.5,3.85,18.56,25.76,0.33,15.19,15.24


In [12]:
%%timeit
# Approach 1 - vectorized: divide the whole "Nepal" column by 100 in a single
# Series operation and store the result as a new "Nepal_fraction" column.
df["Nepal_fraction"] = df["Nepal"] / 100

129 µs ± 14.7 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [13]:
%%timeit
# Approach 2 - Series.apply: same result, but the division is performed by a
# Python lambda that is called once per value in the column.
df["Nepal_fraction"] = df["Nepal"].apply(lambda x: x / 100)

122 µs ± 4.06 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [14]:
%%timeit
# Approach 3 - explicit row loop: iterrows() yields an (index, row) pair for
# every row and the value is computed one row at a time in Python. This is the
# slowest of the three timings recorded in this notebook.
df["Nepal_fraction"] = [row["Nepal"] / 100 for index, row in df.iterrows()]

842 µs ± 5.94 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
