# Fixing sampling bias
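This notebook shows a simple way of dealing with sampling bias: each respondent is given a weight so that the sample's class frequencies match the known population frequencies, and those weights are then used when computing estimates.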

In [14]:
import pandas as pd

In [15]:
# Build a small, deliberately unbalanced survey sample (7 men and 3 women per
# 10 respondents) and replicate it so there are enough rows to resample from.
N_REPEATS = 1000  # number of copies of the 10-row base sample

base_sample = pd.DataFrame({'gender': ['M', 'M', 'M', 'M', 'M', 'M', 'F', 'F', 'F', 'M'],
                            'age_range': ['20-25', '20-25', '25-30', '30-35', '20-25', '20-25', '30-35', '25-30', '20-25', '30-35'],
                            'answer1': [5, 7, 7, 8, 5, 7, 6, 9, 8, 9],
                            'answer2': ['A', 'A', 'B', 'A', 'C', 'B', 'C', 'A', 'C', 'C']})

# ignore_index=True gives every replicated row its own label; without it the
# labels 0-9 would each be repeated N_REPEATS times.
df = pd.concat(N_REPEATS * [base_sample], ignore_index=True)
df

Unnamed: 0,gender,age_range,answer1,answer2
0,M,20-25,5,A
1,M,20-25,7,A
2,M,25-30,7,B
3,M,30-35,8,A
4,M,20-25,5,C
...,...,...,...,...
5,M,20-25,7,B
6,F,30-35,6,C
7,F,25-30,9,A
8,F,20-25,8,C


In [16]:
# Count how many respondents fall in each (gender, age_range) class of the
# sample, then attach that count to every row as `prev_weight`.

prev_weights = (
    df
    .groupby(['gender', 'age_range'])
    .size()
    .reset_index(name='prev_weight')
)
df = df.merge(prev_weights, on=['gender', 'age_range'])

df

Unnamed: 0,gender,age_range,answer1,answer2,prev_weight
0,M,20-25,5,A,4000
1,M,20-25,7,A,4000
2,M,20-25,5,C,4000
3,M,20-25,7,B,4000
4,M,20-25,5,A,4000
...,...,...,...,...,...
9995,F,20-25,8,C,1000
9996,F,20-25,8,C,1000
9997,F,20-25,8,C,1000
9998,F,20-25,8,C,1000


In [17]:
# Attach the population share of each (gender, age_range) class, taken from
# census data, to every respondent as `weight`.

props = pd.DataFrame({'gender': ['M', 'M', 'M', 'F', 'F', 'F'],
                      'age_range': ['20-25', '25-30', '30-35', '20-25','25-30', '30-35'],
                      'weight': [0.15, 0.16, 0.17, 0.17, 0.16, 0.19]
                     })

respondents_before = len(df)
# validate='many_to_one' makes pandas raise if a class is listed more than once
# in `props`, which would otherwise duplicate respondents in the merge.
df = df.merge(props, on=['gender', 'age_range'], validate='many_to_one')
# The merge is an inner join, so a class missing from `props` would drop its
# respondents without warning; fail loudly instead.
assert len(df) == respondents_before, "some respondents belong to a class missing from props"
df

Unnamed: 0,gender,age_range,answer1,answer2,prev_weight,weight
0,M,20-25,5,A,4000,0.15
1,M,20-25,7,A,4000,0.15
2,M,20-25,5,C,4000,0.15
3,M,20-25,7,B,4000,0.15
4,M,20-25,5,A,4000,0.15
...,...,...,...,...,...,...
9995,F,20-25,8,C,1000,0.17
9996,F,20-25,8,C,1000,0.17
9997,F,20-25,8,C,1000,0.17
9998,F,20-25,8,C,1000,0.17


In [18]:
# Final weight of a respondent = population share of their class divided by
# the number of sampled respondents in that class. The more over-represented a
# class is in the sample, the smaller the weight each of its rows receives.
#
# The population share is looked up from `props` again instead of being read
# back from df['weight'], so re-running this cell does not divide by the class
# counts a second time.
population_share = df[['gender', 'age_range']].merge(
    props, on=['gender', 'age_range'], how='left', validate='many_to_one'
)['weight']

# A left merge keeps the rows of `df` in their original order, so the two
# arrays line up position by position.
df['weight'] = population_share.values / df['prev_weight'].values

### Original data distribution

In [25]:
# Share of the sample that falls in each (gender, age_range) class.
# The denominator is taken from the frame itself rather than hard-coded, so
# the shares stay correct if the sample size changes.
df.groupby(['gender', 'age_range'])['answer1'].count() / len(df)

gender  age_range
F       20-25        0.1
        25-30        0.1
        30-35        0.1
M       20-25        0.4
        25-30        0.1
        30-35        0.2
Name: answer1, dtype: float64

### Population distribution

In [21]:
# Census share of each (gender, age_range) class, indexed by class.
(
    props
    .groupby(by=['gender', 'age_range'])
    .agg('sum')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,weight
gender,age_range,Unnamed: 2_level_1
F,20-25,0.17
F,25-30,0.16
F,30-35,0.19
M,20-25,0.15
M,25-30,0.16
M,30-35,0.17


### Weighted data distribution

In [24]:
# Draw a weighted resample and report the percentage of draws that land in
# each (gender, age_range) class. Note the unit: percent, not a 0-1 fraction.
N_DRAWS = 1000  # resample size; the percentage below is derived from it

(
    df
    .sample(n=N_DRAWS, weights='weight', random_state=1)
    .groupby(['gender', 'age_range'])['answer1']
    .count()
    * 100 / N_DRAWS
)

gender  age_range
F       20-25        16.7
        25-30        16.5
        30-35        18.2
M       20-25        16.6
        25-30        14.5
        30-35        17.5
Name: answer1, dtype: float64

### Calculating average answers

In [53]:
# Unweighted mean of answer1: the estimate the raw, biased sample would give.
df.loc[:, 'answer1'].mean()

7.1

In [54]:
# Weighted mean of answer1. Dividing by the total weight keeps this a proper
# mean even when the weights do not sum to exactly 1 (for example if the census
# shares are rounded or a class has no respondents in the sample).
(df['answer1'] * df['weight']).sum() / df['weight'].sum()

7.405000000000001

In [72]:
# Unweighted share of each answer2 option: what the raw sample would report.
df.groupby('answer2')['answer2'].count() / len(df)

answer2
A    0.4
B    0.2
C    0.4
Name: answer2, dtype: float64

In [74]:
# Weighted share of each answer2 option. Normalising by the total weight makes
# the shares sum to 1 even when the weights themselves do not.
df.groupby(['answer2'])['weight'].sum() / df['weight'].sum()

answer2
A    0.3200
B    0.1975
C    0.4825
Name: weight, dtype: float64