In [1]:
import pandas as pd
import numpy as np
import gc
import timeit

## ***Pandorable Data Science***

In [2]:
# idiomatic Pandas code often ivnolves onliners with method chaining
# this relies on the condition that the previous methdo call returns a copy of the original dataframe, which can be mnodified by subsequent 
# method calls on the returnee
# if one method does the modification in-place, method chaining will err!

data = pd.read_csv(r"D:/Introduction-to-Data-Science-in-Python/week-3/datasets/census.csv")
data.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,...,0.002295,-0.193196,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861


In [3]:
data.shape

(3193, 100)

In [4]:
# here's a method chaining example

data.where(data.SUMLEV == 50) \
    .dropna(axis = 0) \
    .set_index(["STNAME", "CTYNAME"]) \
    .drop(["STATE", "COUNTY"], axis = 1) \
    .head(4)

Unnamed: 0_level_0,Unnamed: 1_level_0,SUMLEV,REGION,DIVISION,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alabama,Autauga County,50.0,3.0,6.0,54571.0,54571.0,54660.0,55253.0,55175.0,55038.0,55290.0,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
Alabama,Baldwin County,50.0,3.0,6.0,182265.0,182265.0,183193.0,186659.0,190396.0,195126.0,199713.0,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
Alabama,Barbour County,50.0,3.0,6.0,27457.0,27457.0,27341.0,27226.0,27159.0,26973.0,26815.0,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
Alabama,Bibb County,50.0,3.0,6.0,22915.0,22919.0,22861.0,22733.0,22642.0,22512.0,22549.0,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861


In [6]:
# even though method chaining looks elegant & readable, it is often the case that these type of idioms suffer performance penalties!!
# here's a comparison

In [27]:
def pandorableDataProcessing() -> pd.DataFrame:
    """
    pick state & city names as multiple indices, and rename a column
    finally return a copy of the original dataframe
    """
    global data
    return (data.query("SUMLEV == 50") \
        .dropna(axis = 0) \
        .set_index(["STNAME", "CTYNAME"]) \
        .rename(columns = {"ESTIMATESBASE2010": "ESTIMATE BASE 2010"}))

In [29]:
pandorableDataProcessing().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SUMLEV,REGION,DIVISION,STATE,COUNTY,CENSUS2010POP,ESTIMATE BASE 2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alabama,Autauga County,50,3,6,1,1,54571,54571,54660,55253,55175,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
Alabama,Baldwin County,50,3,6,1,3,182265,182265,183193,186659,190396,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
Alabama,Barbour County,50,3,6,1,5,27457,27457,27341,27226,27159,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
Alabama,Bibb County,50,3,6,1,7,22915,22919,22861,22733,22642,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
Alabama,Blount County,50,3,6,1,9,57322,57322,57373,57711,57776,...,1.807375,-1.177622,-1.748766,-2.062535,-1.36997,1.859511,-0.84858,-1.402476,-1.577232,-0.884411


In [41]:
def notSoPandorableDataProcessing() -> pd.DataFrame:
    global data
    copy = data.loc[data.SUMLEV == 50, :]
    copy.set_index(["STNAME", "CTYNAME"], inplace = True)
    return copy.rename(columns = {"ESTIMATESBASE2010": "ESTIMATE BASE 2010"})

In [42]:
notSoPandorableDataProcessing().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SUMLEV,REGION,DIVISION,STATE,COUNTY,CENSUS2010POP,ESTIMATE BASE 2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alabama,Autauga County,50,3,6,1,1,54571,54571,54660,55253,55175,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
Alabama,Baldwin County,50,3,6,1,3,182265,182265,183193,186659,190396,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
Alabama,Barbour County,50,3,6,1,5,27457,27457,27341,27226,27159,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
Alabama,Bibb County,50,3,6,1,7,22915,22919,22861,22733,22642,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
Alabama,Blount County,50,3,6,1,9,57322,57322,57373,57711,57776,...,1.807375,-1.177622,-1.748766,-2.062535,-1.36997,1.859511,-0.84858,-1.402476,-1.577232,-0.884411


In [43]:
timeit.timeit(pandorableDataProcessing, number = 100)

0.8535013999999137

In [44]:
timeit.timeit(notSoPandorableDataProcessing, number = 100)

0.49057649999986097

### ***Stylistic idioms may suffer from performance penalties***

In [46]:
# a function that maps the provided function to every element of the DataFrame

data.applymap(lambda cell: cell.upper() if type(cell) == str else None).head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
0,,,,,,ALABAMA,ALABAMA,,,,...,,,,,,,,,,
1,,,,,,ALABAMA,AUTAUGA COUNTY,,,,...,,,,,,,,,,
2,,,,,,ALABAMA,BALDWIN COUNTY,,,,...,,,,,,,,,,
3,,,,,,ALABAMA,BARBOUR COUNTY,,,,...,,,,,,,,,,
4,,,,,,ALABAMA,BIBB COUNTY,,,,...,,,,,,,,,,


In [47]:
data.min()

SUMLEV                 40
REGION                  1
DIVISION                1
STATE                   1
COUNTY                  0
                  ...    
RNETMIG2011   -128.205128
RNETMIG2012   -147.727273
RNETMIG2013   -102.988031
RNETMIG2014   -178.947368
RNETMIG2015    -62.322703
Length: 100, dtype: object

In [60]:
data \
    .loc[:, ["POPESTIMATE" + str(year) for year in range(2010, 2016)]] \
    .apply(lambda column: np.mean(column), axis = 0)

POPESTIMATE2010    193765.651738
POPESTIMATE2011    195251.398058
POPESTIMATE2012    196744.518008
POPESTIMATE2013    198200.685875
POPESTIMATE2014    199754.087692
POPESTIMATE2015    201327.165675
dtype: float64

In [63]:
data \
    .loc[:, ["POPESTIMATE" + str(year) for year in range(2010, 2016)]] \
    .apply(lambda row: {"Min": np.min(row), "Max": np.max(row)}, axis = 1)

0       {'Min': 4785161, 'Max': 4858979}
1           {'Min': 54660, 'Max': 55347}
2         {'Min': 183193, 'Max': 203709}
3           {'Min': 26489, 'Max': 27341}
4           {'Min': 22512, 'Max': 22861}
                      ...               
3188        {'Min': 43593, 'Max': 45162}
3189        {'Min': 21297, 'Max': 23125}
3190        {'Min': 20822, 'Max': 21102}
3191          {'Min': 8316, 'Max': 8545}
3192          {'Min': 7065, 'Max': 7234}
Length: 3193, dtype: object

In [65]:
data \
    .loc[:, ["POPESTIMATE" + str(year) for year in range(2010, 2016)]] \
    .apply(lambda row: {"Min Year": range(2010, 2016)[np.argmin(row)], "Max Year": range(2010, 2016)[np.argmax(row)]}, axis = 1)

0       {'Min Year': 2010, 'Max Year': 2015}
1       {'Min Year': 2010, 'Max Year': 2015}
2       {'Min Year': 2010, 'Max Year': 2015}
3       {'Min Year': 2015, 'Max Year': 2010}
4       {'Min Year': 2013, 'Max Year': 2010}
                        ...                 
3188    {'Min Year': 2010, 'Max Year': 2013}
3189    {'Min Year': 2010, 'Max Year': 2015}
3190    {'Min Year': 2015, 'Max Year': 2010}
3191    {'Min Year': 2014, 'Max Year': 2010}
3192    {'Min Year': 2012, 'Max Year': 2015}
Length: 3193, dtype: object