# Additional Experiment : Clustering

In [None]:
import numpy as np
import pandas as pd

In [2]:
# Define total number of groups
number_of_groups = 10
# Create data dictionary
data = {'Id': np.arange(1, number_of_groups+1).tolist(),
        'Group Number': np.arange(1,number_of_groups+1).tolist(),
        'Value': [19, 71, 58, 62, 12, 91, 60, 75, 38, 51]
        }
# Transform dictionary into a data frame
df1 = pd.DataFrame(data)
 
display(df1)

Unnamed: 0,Id,Group Number,Value
0,1,1,19
1,2,2,71
2,3,3,58
3,4,4,62
4,5,5,12
5,6,6,91
6,7,7,60
7,8,8,75
8,9,9,38
9,10,10,51


In [3]:
# Define systematic sampling function
def systematic_sampling(df, step):
 
    indexes = np.arange(0, len(df), step=step)
    systematic_sample = df.iloc[indexes]
    return systematic_sample

In [4]:
# Obtain a systematic sample and save it in a new variable
systematic_sample = systematic_sampling(df1, 4)
# View sampled data frame
display(systematic_sample)

Unnamed: 0,Id,Group Number,Value
0,1,1,19
4,5,5,12
8,9,9,38


In [5]:
# Create a dictionary of students
students = {
    'Name': ['Lisa', 'Kate', 'Ben', 'Kim', 'Josh',
             'Alex', 'Evan', 'Greg', 'Sam', 'Ella'],
    'ID': ['001', '002', '003', '004', '005', '006', 
           '007', '008', '009', '010'],
    'Grade': ['A', 'A', 'C', 'B', 'B', 'B', 'C', 
              'A', 'A', 'A'],
    
    'Category': [2, 3, 1, 3, 2, 3, 3, 1, 2, 1]
}
  
# Create dataframe from students dictionary
df = pd.DataFrame(students)
  
# View the dataframe
df

Unnamed: 0,Name,ID,Grade,Category
0,Lisa,1,A,2
1,Kate,2,A,3
2,Ben,3,C,1
3,Kim,4,B,3
4,Josh,5,B,2
5,Alex,6,B,3
6,Evan,7,C,3
7,Greg,8,A,1
8,Sam,9,A,2
9,Ella,10,A,1


In [7]:
df.groupby('Category', group_keys=False).apply(lambda x: x.sample(2))

Unnamed: 0,Name,ID,Grade,Category
2,Ben,3,C,1
9,Ella,10,A,1
0,Lisa,1,A,2
4,Josh,5,B,2
1,Kate,2,A,3
3,Kim,4,B,3


In [None]:
df.groupby('Grade', group_keys=False).apply(lambda x: x.sample(frac=0.6))

Unnamed: 0,Name,ID,Grade,Category
7,Greg,8,A,1
1,Kate,2,A,3
0,Lisa,1,A,2
3,Kim,4,B,3
4,Josh,5,B,2
2,Ben,3,C,1


In [None]:
#Make this example reproducible
np.random.seed(0)

#Create DataFrame
df = pd.DataFrame({'tour': np.repeat(np.arange(1,11), 20),
                   'experience': np.random.normal(loc=7, scale=1, size=200)})
df.head(21)

Unnamed: 0,tour,experience
0,1,8.764052
1,1,7.400157
2,1,7.978738
3,1,9.240893
4,1,8.867558
5,1,6.022722
6,1,7.950088
7,1,6.848643
8,1,6.896781
9,1,7.410599


In [None]:
#Randomly choose 4 tour groups out of the 10
clusters = np.random.choice(np.arange(1,11), size=3, replace=False)
print(clusters)
#Define sample as all members who belong to one of the 4 tour groups
cluster_sample = df[df['tour'].isin(clusters)]

#View first six rows of sample
#cluster_sample.head(60)

[ 6  7 10]


In [None]:
#Find how many observations came from each tour group
cluster_sample['tour'].value_counts()

3     20
7     20
10    20
Name: tour, dtype: int64

# Additional Experiment: Sampling

In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('/Titanic fda.csv')
df

Unnamed: 0,Name,PClass,Age,Sex,Survived
0,"Allen, Miss Elisabeth Walton",1st,29.00,female,1
1,"Allison, Miss Helen Loraine",1st,2.00,female,0
2,"Allison, Mr Hudson Joshua Creighton",1st,30.00,male,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.00,female,0
4,"Allison, Master Hudson Trevor",1st,0.92,male,1
...,...,...,...,...,...
1308,"Zakarian, Mr Artun",3rd,27.00,male,0
1309,"Zakarian, Mr Maprieder",3rd,26.00,male,0
1310,"Zenni, Mr Philip",3rd,22.00,male,0
1311,"Lievens, Mr Rene",3rd,24.00,male,0


In [None]:
df.isnull().sum()

Name          0
PClass        1
Age         557
Sex           0
Survived      0
dtype: int64

In [None]:
df['Age'].round(decimals = 0)

0       29.0
1        2.0
2       30.0
3       25.0
4        1.0
        ... 
1308    27.0
1309    26.0
1310    22.0
1311    24.0
1312    29.0
Name: Age, Length: 1313, dtype: float64

In [None]:
df['Age'].fillna(df.Age.mean(), inplace=True)
df

Unnamed: 0,Name,PClass,Age,Sex,Survived
0,"Allen, Miss Elisabeth Walton",1st,29.00,female,1
1,"Allison, Miss Helen Loraine",1st,2.00,female,0
2,"Allison, Mr Hudson Joshua Creighton",1st,30.00,male,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.00,female,0
4,"Allison, Master Hudson Trevor",1st,0.92,male,1
...,...,...,...,...,...
1308,"Zakarian, Mr Artun",3rd,27.00,male,0
1309,"Zakarian, Mr Maprieder",3rd,26.00,male,0
1310,"Zenni, Mr Philip",3rd,22.00,male,0
1311,"Lievens, Mr Rene",3rd,24.00,male,0


In [None]:
df.dropna() #Drop NaN values

Unnamed: 0,Name,PClass,Age,Sex,Survived
0,"Allen, Miss Elisabeth Walton",1st,29.00,female,1
1,"Allison, Miss Helen Loraine",1st,2.00,female,0
2,"Allison, Mr Hudson Joshua Creighton",1st,30.00,male,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.00,female,0
4,"Allison, Master Hudson Trevor",1st,0.92,male,1
...,...,...,...,...,...
1308,"Zakarian, Mr Artun",3rd,27.00,male,0
1309,"Zakarian, Mr Maprieder",3rd,26.00,male,0
1310,"Zenni, Mr Philip",3rd,22.00,male,0
1311,"Lievens, Mr Rene",3rd,24.00,male,0


In [None]:
df.duplicated().sum() #Check for duplicate values

0

In [None]:
# A discontinuous class column 'Age_group' is made

age_group = np.arange(0,100,10)
age_group_labels = [f"{i} - {i+10}" for i in range(0,90,10)]
df['Age_group'] = pd.cut(df['Age'], bins=age_group, labels=age_group_labels)

# **Stratified Sampling On PClass**

In [None]:
df['PClass'].value_counts()

3rd    711
1st    322
2nd    279
Name: PClass, dtype: int64

In [None]:
'''There are 54.2% 3rd class passengers, 24.54% 1st class passengers, and 21.3% 
1st class passengers. Create a sample of 787 passengers disproportionately 
(equal number of passengers from each PClass stratum)

Dispropotionate Sampling: Using pandas groupby, seperate the passengers into groups
based on their grade i.e. A, B and C and randomly sample 262 passengers from each 
class group using the sample function
'''
df1 = df.groupby('PClass', group_keys=False).apply(lambda x: x.sample(262))
df1

Unnamed: 0,Name,PClass,Age,Sex,Survived,Age_group
290,"Bazzani, Ms Albi",1st,30.397989,female,1,30 - 40
233,"Smart, Mr John Montgomery",1st,56.000000,male,0,50 - 60
49,"Carter, Miss Lucile Polk",1st,14.000000,female,1,10 - 20
275,"White, Mr Richard Frasar",1st,21.000000,male,0,20 - 30
70,"Cornell, Mrs Robert Clifford (Malvi Helen Lamson)",1st,55.000000,female,1,50 - 60
...,...,...,...,...,...,...
1133,"Petroff, Mr Pentcho",3rd,30.397989,male,0,30 - 40
812,"Franklin, Mr Charles",3rd,30.397989,male,0,30 - 40
886,"Johanson, Mr Jakob Alfred",3rd,34.000000,male,0,30 - 40
1309,"Zakarian, Mr Maprieder",3rd,26.000000,male,0,20 - 30


In [None]:
df1['PClass'].value_counts()

1st    262
3rd    262
2nd    262
Name: PClass, dtype: int64

In [None]:
'''
Sample out 60% of passengers proportionately (create population samples from each stratum
based on its propotion in the sample)

Proportionate Sampling: Using pandas groupby, seperate the passengers in groups based on their PClass
i.e. 1st, 2nd, 3rd, and random sample from each group based on population proportion. 
The total sample size is 60% (0.6) of the population.
'''
df2 = df.groupby('PClass', group_keys=False).apply(lambda x: x.sample(frac=0.6))
df2

Unnamed: 0,Name,PClass,Age,Sex,Survived,Age_group
240,"Spedden, Mr Frederick Oakley",1st,45.000000,male,1,40 - 50
155,"Kenyon, Mrs Frederick R (Marion)",1st,30.397989,female,1,30 - 40
256,"Taussig, Mr Emil",1st,52.000000,male,0,50 - 60
275,"White, Mr Richard Frasar",1st,21.000000,male,0,20 - 30
199,"Pears, Mr Thomas",1st,30.397989,male,0,30 - 40
...,...,...,...,...,...,...
691,"Brocklebank, Mr William Alfred",3rd,35.000000,male,0,30 - 40
685,"Bradley, Miss Bridget Delia",3rd,18.000000,female,1,10 - 20
982,"Madigan, Miss Margaret",3rd,30.397989,female,1,30 - 40
834,"Guest, Mr Robert",3rd,30.397989,male,0,30 - 40


In [None]:
df2['PClass'].value_counts()

3rd    427
1st    193
2nd    167
Name: PClass, dtype: int64

In [None]:
'''
Notice that even in the sample, there are 54.2% 3rd class passengers,
24.54% 1st class passengers, and 21.3% 1st class passengers.
'''

'\nNotice that even in the sample, there are 54.2% 3rd class passengers,\n24.54% 1st class passengers, and 21.3% 1st class passengers.\n'

# **Cluster Sampling on Age**

In [None]:
df.isnull().sum()

Name         0
PClass       1
Age          0
Sex          0
Survived     0
Age_group    0
dtype: int64

In [None]:
df.dropna()

Unnamed: 0,Name,PClass,Age,Sex,Survived,Age_group
0,"Allen, Miss Elisabeth Walton",1st,29.00,female,1,20 - 30
1,"Allison, Miss Helen Loraine",1st,2.00,female,0,0 - 10
2,"Allison, Mr Hudson Joshua Creighton",1st,30.00,male,0,20 - 30
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.00,female,0,20 - 30
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0 - 10
...,...,...,...,...,...,...
1308,"Zakarian, Mr Artun",3rd,27.00,male,0,20 - 30
1309,"Zakarian, Mr Maprieder",3rd,26.00,male,0,20 - 30
1310,"Zenni, Mr Philip",3rd,22.00,male,0,20 - 30
1311,"Lievens, Mr Rene",3rd,24.00,male,0,20 - 30


In [None]:
df['Age_group'].value_counts()

30 - 40    707
20 - 30    260
10 - 20    117
40 - 50    104
0 - 10      55
50 - 60     48
60 - 70     19
70 - 80      3
80 - 90      0
Name: Age_group, dtype: int64

In [None]:
# Randomly choose 525 age groups out of 1312
clusters = np.random.choice(list(df.Age_group.unique()), size=4, replace=False)

# Define sample as all passenegrs belong to one of the 21 age groups
cluster_sample = df[df['Age_group'].isin(clusters)]

# View first 6 rows of sample
cluster_sample

Unnamed: 0,Name,PClass,Age,Sex,Survived,Age_group
1,"Allison, Miss Helen Loraine",1st,2.000000,female,0,0 - 10
4,"Allison, Master Hudson Trevor",1st,0.920000,male,1,0 - 10
6,"Andrews, Miss Kornelia Theodosia",1st,63.000000,female,1,60 - 70
7,"Andrews, Mr Thomas, jr",1st,39.000000,male,0,30 - 40
11,"Astor, Mrs John Jacob (Madeleine Talmadge Force)",1st,19.000000,female,1,10 - 20
...,...,...,...,...,...,...
1302,"Yalsevac, Mr Ivan",3rd,30.397989,male,1,30 - 40
1304,"Yasbeck, Mrs Antoni",3rd,15.000000,female,1,10 - 20
1305,"Youssef, Mr Gerios",3rd,30.397989,male,0,30 - 40
1306,"Zabour, Miss Hileni",3rd,30.397989,female,0,30 - 40


In [None]:
# find how many observations came from each age group
cluster_sample['Age_group'].value_counts()

30 - 40    707
10 - 20    117
0 - 10      55
60 - 70     19
80 - 90      0
70 - 80      0
50 - 60      0
40 - 50      0
20 - 30      0
Name: Age_group, dtype: int64