In [20]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

In [21]:
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)

In [22]:
df.shape

(150, 4)

Random Sampling

In [23]:
# Using predefined function

# Get precentage of data as a sample
df.sample(frac=0.50)

# Get n number of rows in sample
df.sample(n=80)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
43,5.0,3.5,1.6,0.6
11,4.8,3.4,1.6,0.2
26,5.0,3.4,1.6,0.4
48,5.3,3.7,1.5,0.2
63,6.1,2.9,4.7,1.4
...,...,...,...,...
118,7.7,2.6,6.9,2.3
16,5.4,3.9,1.3,0.4
13,4.3,3.0,1.1,0.1
75,6.6,3.0,4.4,1.4


In [24]:
# conditional sampling
conditional_df = df[df['sepal width (cm)'] < 3]

Systematic Sampling

In [25]:
sys_sample = df[::5]

In [26]:
sys_sample.shape

(30, 4)

In [27]:
# Another way of systematic sampling is to get indexes of those rows and create another df with rows from only those indexes
nth_indexes = np.arange(0, len(df),step = 5)
systematic_sample = df.iloc[nth_indexes]

In [28]:
systematic_sample.shape

(30, 4)

Stratified Sampling

In [29]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [45]:
#df.groupby('petal width (cm)').count()
df['petal width (cm)'].value_counts()

0.2    29
1.3    13
1.8    12
1.5    12
1.4     8
2.3     8
1.0     7
0.4     7
0.3     7
2.1     6
2.0     6
0.1     5
1.2     5
1.9     5
1.6     4
2.5     3
2.2     3
2.4     3
1.1     3
1.7     2
0.6     1
0.5     1
Name: petal width (cm), dtype: int64

In [51]:
# To get proportions of categories in your data
df['petal width (cm)'].value_counts() / len(df) * 100

0.2    19.333333
1.3     8.666667
1.8     8.000000
1.5     8.000000
1.4     5.333333
2.3     5.333333
1.0     4.666667
0.4     4.666667
0.3     4.666667
2.1     4.000000
2.0     4.000000
0.1     3.333333
1.2     3.333333
1.9     3.333333
1.6     2.666667
2.5     2.000000
2.2     2.000000
2.4     2.000000
1.1     2.000000
1.7     1.333333
0.6     0.666667
0.5     0.666667
Name: petal width (cm), dtype: float64

In [38]:
#Proportionate Stratified Sampling
prop_strat_sample = df.groupby('petal width (cm)', group_keys=False).apply(lambda x: x.sample(frac=0.6))

In [39]:
prop_strat_sample.shape

(91, 4)

In [42]:
df.shape

(150, 4)

In [50]:
# Disproportionate Stratified Sampling
# In this case we can pass value as only 1 cause 0.6 and 0.5 petal width have only 1 record so choosing more than 1 won't be appropriate as tehre is not second record present for those petal_width
disprop_strat_sample = df.groupby('petal width (cm)', group_keys =False).apply(lambda x:x.sample(1))

Cluster Sampling

In [85]:
def get_cluster_samples(df, cluster_size,num_of_clusters_to_select):
  n = len(df)
  data = None
  k = n//cluster_size #k no. of possible clusters
  print(k)
  for i in range(k): # will make k clusters
    sample_k = df.sample(cluster_size)
    sample_k['cluster'] = np.repeat(i,len(sample_k))
    df = df.drop(index = sample_k.index)
    data = pd.concat([data, sample_k],axis=0)
  #print(data['cluster'].unique())
  random_chosen_clusters = np.random.randint(0, k, size = num_of_clusters_to_select)
  samples = data[data.cluster.isin(random_chosen_clusters)]
  return (samples)

In [86]:
cluster_sample = get_cluster_samples(df, 50, 2)
cluster_sample

3
[0 1 2]


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),cluster
29,4.7,3.2,1.6,0.2,1
117,7.7,3.8,6.7,2.2,1
86,6.7,3.1,4.7,1.5,1
56,6.3,3.3,4.7,1.6,1
0,5.1,3.5,1.4,0.2,1
102,7.1,3.0,5.9,2.1,1
59,5.2,2.7,3.9,1.4,1
139,6.9,3.1,5.4,2.1,1
71,6.1,2.8,4.0,1.3,1
1,4.9,3.0,1.4,0.2,1


In [72]:
cluster_sample

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),cluster
