# Population & Sample

**Method 1**

Using package random 

In [16]:
from random import sample 
  
# Defining population
list1 = [1, 2, 3, 4, 5]  
  
# Extracting random samples from population    
sample(list1,3)
# sample(list1,5)
# list1.sample(3)...this willnot workbecause sample is not an attribute if list. 


[5, 3, 4]

In [4]:
list2 = [4,4,4,4,4,4,4]
sample(list2, 7)

[4, 4, 4, 4, 4, 4, 4]

**Method 2**

Using pandas

In [7]:
import pandas as pd

In [8]:
# Defining population
data = pd.read_csv("Health.csv")
data

Unnamed: 0,Ethnicity,Height (CM),Weight (Kg),Will survive till 70
0,White,186.0,90.0,Yes
1,African,185.0,98.0,No
2,Asian,175.0,80.0,No
3,White,180.0,88.0,Yes
4,Asian,178.0,,No
5,Asian,172.0,72.0,Yes
6,African,178.0,75.0,No
7,White,,89.0,Yes
8,African,186.0,90.0,Yes


In [12]:
# Extracting random samples from population    
data.sample(3)

Unnamed: 0,Ethnicity,Height (CM),Weight (Kg),Will survive till 70
4,Asian,178.0,,No
5,Asian,172.0,72.0,Yes
8,African,186.0,90.0,Yes


In [15]:
data.sample(4).shape

(4, 4)

In [14]:
data.shape

(9, 4)

Only 9 data points are available

In [17]:
# This will throw error! Uncomment to see.
# Cannot extract more samples (i.e. 10) than the data points available (i.e. 9)
data.sample(10)

ValueError: Cannot take a larger sample than population when 'replace=False'

In [20]:
# Hence using replacement with replace=True parameter. Here more number of samples than available in the population 
# can be extracted 
data.sample(10, replace=True)

Unnamed: 0,Ethnicity,Height (CM),Weight (Kg),Will survive till 70
4,Asian,178.0,,No
0,White,186.0,90.0,Yes
8,African,186.0,90.0,Yes
0,White,186.0,90.0,Yes
6,African,178.0,75.0,No
0,White,186.0,90.0,Yes
7,White,,89.0,Yes
0,White,186.0,90.0,Yes
1,African,185.0,98.0,No
8,African,186.0,90.0,Yes


In [None]:
# Let's learn to read help and explore deeper
help(sample)

## Stratified Random Sampling

In [21]:
# Build X and y matrices.
X = data[["Ethnicity","Height (CM)","Weight (Kg)"]]
y = data["Will survive till 70"]

In [22]:
from sklearn.model_selection import train_test_split
# Stratify based on "Will survive till 70"
# We create two samples, "Train sample" and "Test sample"
# Each sample will have a similar distribution of points as in the population
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y,  # This paramter will be used for stratifying
                                                    test_size=0.25)


In [23]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6, 3), (3, 3), (6,), (3,))

In [24]:
X_train

Unnamed: 0,Ethnicity,Height (CM),Weight (Kg)
2,Asian,175.0,80.0
4,Asian,178.0,
6,African,178.0,75.0
7,White,,89.0
0,White,186.0,90.0
5,Asian,172.0,72.0


In [25]:
y_train

2     No
4     No
6     No
7    Yes
0    Yes
5    Yes
Name: Will survive till 70, dtype: object

As we can see from y_train above, equal number of samples for Yes and No have been selected through random stratification

In [None]:
X_test

In [26]:
y_test

1     No
8    Yes
3    Yes
Name: Will survive till 70, dtype: object

In [27]:
# Let's explore in detail
help(train_test_split)

Help on function train_test_split in module sklearn.model_selection._split:

train_test_split(*arrays, **options)
    Split arrays or matrices into random train and test subsets
    
    Quick utility that wraps input validation and
    ``next(ShuffleSplit().split(X, y))`` and application to input data
    into a single call for splitting (and optionally subsampling) data in a
    oneliner.
    
    Read more in the :ref:`User Guide <cross_validation>`.
    
    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    
    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_s

## Impact on Stratified samples, if the underlying distribution changes

In [28]:
alt_y = y.copy()
print("Original distribution of y")
print(alt_y)

# Let's alter the classes and see the impact on stratified samples (we reduce the number of No)
alt_y[0:7] = 'Yes'
alt_y[7:9] = 'No'
print("-"*40)
print("Modified distribution of y")
print(alt_y)

Original distribution of y
0    Yes
1     No
2     No
3    Yes
4     No
5    Yes
6     No
7    Yes
8    Yes
Name: Will survive till 70, dtype: object
----------------------------------------
Modified distribution of y
0    Yes
1    Yes
2    Yes
3    Yes
4    Yes
5    Yes
6    Yes
7     No
8     No
Name: Will survive till 70, dtype: object


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, alt_y,
                                                    stratify=y,  # This paramter will be used for stratifying
                                                    test_size=0.25)

In [30]:
print("Training sample")
print(y_train)

Training sample
0    Yes
1    Yes
7     No
3    Yes
6    Yes
4    Yes
Name: Will survive till 70, dtype: object


In [31]:
print("Test sample")
print(y_test)

Test sample
2    Yes
5    Yes
8     No
Name: Will survive till 70, dtype: object


In [None]:
# You are encouraged to try and change the distribution and see it's impact on startified samples
# Make number of No > number of Yes
# Make them equal

# Descriptive Statistics

Descriptive Statistics can be calculated in mutliple ways in Python

## Method 1

`df.describe()` gives a concise view of mean, median, min, max, std, IQR, etc.

In [32]:
data.describe()

Unnamed: 0,Height (CM),Weight (Kg)
count,8.0,8.0
mean,180.0,85.25
std,5.264436,8.762746
min,172.0,72.0
25%,177.25,78.75
50%,179.0,88.5
75%,185.25,90.0
max,186.0,98.0


In [35]:
data

Unnamed: 0,Ethnicity,Height (CM),Weight (Kg),Will survive till 70
0,White,186.0,90.0,Yes
1,African,185.0,98.0,No
2,Asian,175.0,80.0,No
3,White,180.0,88.0,Yes
4,Asian,178.0,,No
5,Asian,172.0,72.0,Yes
6,African,178.0,75.0,No
7,White,,89.0,Yes
8,African,186.0,90.0,Yes


In [36]:
data.Ethnicity.describe()

count           9
unique          3
top       African
freq            3
Name: Ethnicity, dtype: object

In [37]:
data.describe(include='object')

Unnamed: 0,Ethnicity,Will survive till 70
count,9,9
unique,3,2
top,African,Yes
freq,3,5


In [38]:
data.describe(include='all')

Unnamed: 0,Ethnicity,Height (CM),Weight (Kg),Will survive till 70
count,9,8.0,8.0,9
unique,3,,,2
top,African,,,Yes
freq,3,,,5
mean,,180.0,85.25,
std,,5.264436,8.762746,
min,,172.0,72.0,
25%,,177.25,78.75,
50%,,179.0,88.5,
75%,,185.25,90.0,


## Method 2

Using `scipy` package

In [39]:
import scipy

In [40]:
scipy.mean(data)

  """Entry point for launching an IPython kernel.


Height (CM)    180.00
Weight (Kg)     85.25
dtype: float64

In [41]:
scipy.std(data)

  """Entry point for launching an IPython kernel.


Height (CM)    4.924429
Weight (Kg)    8.196798
dtype: float64

In [42]:
from scipy import stats

In [44]:
stats.mode(data.Ethnicity)

ModeResult(mode=array(['African'], dtype=object), count=array([3]))

In [46]:
x=[62,65,68,70,72,74,76,78,80,82,96,101]

In [47]:
y=pd.Series(x)

In [49]:
y

0      62
1      65
2      68
3      70
4      72
5      74
6      76
7      78
8      80
9      82
10     96
11    101
dtype: int64

In [51]:
y.describe()

count     12.000000
mean      77.000000
std       11.700816
min       62.000000
25%       69.500000
50%       75.000000
75%       80.500000
max      101.000000
dtype: float64