https://machinelearningmastery.com/a-gentle-introduction-to-the-bootstrap-method/

In [2]:
# import libraries using common alias names
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# check what version of packages are installed.
print("NumPy version",np.__version__, "pandas version ",pd.__version__, "seaborn version",sns.__version__  )  # '1.16.2'

# set print options with floating point precision if 4, summarise long arrays using threshold of 5, suppress small results
np.set_printoptions(precision=4, threshold=5, suppress=True)  # set floating point precision to 4
pd.options.display.max_rows=8 # set options to display max number of rows

NumPy version 1.16.2 pandas version  0.24.2 seaborn version 0.9.0


In [7]:
df = pd.read_csv("WorldHappinessData2018.csv")
df.head()
df.columns

Index(['Unnamed: 0', 'Region', 'Year', 'Life Ladder', 'Log GDP per capita',
       'Social support', 'Healthy life expectancy at birth',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Positive affect', 'Negative affect',
       'Confidence in national government', 'Democratic Quality',
       'Delivery Quality', 'Standard deviation of ladder by country-year',
       'Standard deviation/Mean of ladder by country-year',
       'GINI index (World Bank estimate)',
       'GINI index (World Bank estimate), average 2000-16',
       'gini of household income reported in Gallup, by wp5-year',
       'Most people can be trusted, Gallup',
       'Most people can be trusted, WVS round 1981-1984',
       'Most people can be trusted, WVS round 1989-1993',
       'Most people can be trusted, WVS round 1994-1998',
       'Most people can be trusted, WVS round 1999-2004',
       'Most people can be trusted, WVS round 2005-2009',
       'Most people can be tru

In [8]:
# rename the first column as country name
df.rename(columns={'Unnamed: 0':'Country'}, inplace=True)
df.head()

Unnamed: 0,Country,Region,Year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,...,GINI index (World Bank estimate),"GINI index (World Bank estimate), average 2000-16","gini of household income reported in Gallup, by wp5-year","Most people can be trusted, Gallup","Most people can be trusted, WVS round 1981-1984","Most people can be trusted, WVS round 1989-1993","Most people can be trusted, WVS round 1994-1998","Most people can be trusted, WVS round 1999-2004","Most people can be trusted, WVS round 2005-2009","Most people can be trusted, WVS round 2010-2014"
0,Afghanistan,Southern Asia,2008.0,3.72359,7.16869,0.450662,50.799999,0.718114,0.177889,0.881686,...,,,,,,,,,,
1,Afghanistan,Southern Asia,2009.0,4.401778,7.33379,0.552308,51.200001,0.678896,0.200178,0.850035,...,,,0.441906,0.286315,,,,,,
2,Afghanistan,Southern Asia,2010.0,4.758381,7.386629,0.539075,51.599998,0.600127,0.134353,0.706766,...,,,0.327318,0.275833,,,,,,
3,Afghanistan,Southern Asia,2011.0,3.831719,7.415019,0.521104,51.919998,0.495901,0.172137,0.731109,...,,,0.336764,,,,,,,
4,Afghanistan,Southern Asia,2012.0,3.782938,7.517126,0.520637,52.240002,0.530935,0.244273,0.77562,...,,,0.34454,,,,,,,


In [9]:
df.shape

(1669, 27)

In [10]:
df18 = df.loc[df.loc[:,'Year']==2018]

In [11]:
df18.head()

Unnamed: 0,Country,Region,Year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,...,GINI index (World Bank estimate),"GINI index (World Bank estimate), average 2000-16","gini of household income reported in Gallup, by wp5-year","Most people can be trusted, Gallup","Most people can be trusted, WVS round 1981-1984","Most people can be trusted, WVS round 1989-1993","Most people can be trusted, WVS round 1994-1998","Most people can be trusted, WVS round 1999-2004","Most people can be trusted, WVS round 2005-2009","Most people can be trusted, WVS round 2010-2014"
10,Afghanistan,Southern Asia,2018.0,2.694303,7.494588,0.507516,52.599998,0.373536,-0.084888,0.927606,...,,,0.290681,,,,,,,
21,Albania,Central and Eastern Europe,2018.0,5.004403,9.412399,0.683592,68.699997,0.824212,0.005385,0.899129,...,,0.30325,0.456174,,,,0.243243,0.232,,
28,Algeria,Middle East and Northern Africa,2018.0,5.043086,9.557952,0.798651,65.900002,0.583381,-0.172413,0.758704,...,,0.276,0.667872,,,,,0.107644,,0.179286
45,Argentina,Latin America and Caribbean,2018.0,5.792797,9.809972,0.899912,68.800003,0.845895,-0.206937,0.855255,...,,0.460938,0.405356,,0.270073,0.223553,0.170844,0.150154,0.174058,0.193531
58,Armenia,Central and Eastern Europe,2018.0,5.062449,9.119424,0.814449,66.900002,0.807644,-0.149109,0.676826,...,,0.31925,0.406403,,,,0.235,,,0.109136


In [1]:
from sklearn.utils import resample
# data sample
data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
# prepare bootstrap sample
boot = resample(data, replace=True, n_samples=4, random_state=1)
print('Bootstrap Sample: %s' % boot)
# out of bag observations
oob = [x for x in data if x not in boot]
print('OOB Sample: %s' % oob)

Bootstrap Sample: [0.6, 0.4, 0.5, 0.1]
OOB Sample: [0.2, 0.3]


In [13]:
data= df18['Log GDP per capita'].dropna()
boot = resample(data, replace=True, n_samples=4, random_state=1)
print('Bootstrap Sample: %s' % boot)
# out of bag observations
oob = [x for x in data if x not in boot]
print('OOB Sample: %s' % oob)

Bootstrap Sample: 481      7.524517
1427    10.975945
167      8.860531
963      9.956448
Name: Log GDP per capita, dtype: float64
OOB Sample: [7.494587898254394, 9.412399291992188, 9.55795192718506, 9.809971809387207, 9.119423866271973, 10.721020698547363, 10.74189281463623, 9.678013801574707, 8.220746040344237, 9.778738975524902, 10.672445297241213, 7.663907051086426, 8.860530853271484, 9.40272617340088, 9.68022632598877, 9.55793285369873, 9.873218536376953, 7.47052001953125, 6.541032791137695, 8.253352165222168, 8.13347053527832, 10.701248168945312, 7.472574710845947, 10.065919876098633, 9.694375991821287, 9.511734008789062, 7.26014232635498, 8.473278999328612, 9.669425964355467, 10.065751075744627, 10.419473648071287, 10.755594253540039, 9.626997947692873, 9.274455070495604, 9.293959617614746, 8.911958694458008, 10.32410717010498, 7.524516582489014, 10.636059761047363, 10.573351860046387, 9.718316078186037, 9.229100227355957, 10.730944633483887, 10.132058143615724, 8.92342853546142

In [16]:
data.shape

(125,)

In [19]:
type(oob)

list

In [20]:
from statsmodels.distributions.empirical_distribution import ECDF

In [21]:
ecdf=ECDF(data)

In [23]:
ecdf

<statsmodels.distributions.empirical_distribution.ECDF at 0x1a26ca7c50>