# Case 1: The Sample with the Built-in Bias
The crook (C) and the reporter (R) each want to estimate the proportion of red and blue marbles in a jar by drawing samples from it.

# Libraries and functions

In [1]:
# Importing objects for data structuring
from numpy import array, concatenate
from pandas import DataFrame as df

# Library to query data with SQL commands
from pandasql import sqldf

# Taking random samples
from random import seed, randint

# Set seed for code reproducibility.
# This seeds Python's built-in `random` generator, i.e. the one behind the
# `randint` imported above.
# NOTE(review): NumPy/pandas use their own random generator, so any sampling
# done through them is presumably not covered by this seed — confirm.
seed(101)

# Data

In [2]:
# One array per color: 634 red marbles and 870 blue marbles
red_marbles = array(['red' for _ in range(634)])
blue_marbles = array(['blue' for _ in range(870)])

# The jar (full population): every red marble first, followed by every blue one
jar_of_marbles = concatenate((red_marbles, blue_marbles))

# Statistical Analyses

## > The CROOK's approach
C analyzes small, biased samples. From the jar of marbles, C takes 3 small samples (5 marbles each): two are taken on purpose from where most of the red marbles are, and one from a spot where some blue marbles are.

In [3]:
# ! Notice that the first 634 marbles in the "jar" are red (indices 0-633),
# so a short slice that starts low enough contains red marbles only.
SLICE_SIZE = 5            # marbles grabbed at each chosen position
LAST_ALL_RED_START = 629  # 629 + 5 = 634, so the slice still ends inside the red block

# The crook defines the marbles positions (mp) for analysis as follows:
# two random starts inside the all-red region, plus one hand-picked start (632)
# that straddles the red/blue boundary so that a few blue marbles show up.
mp = [randint(0, LAST_ALL_RED_START), randint(0, LAST_ALL_RED_START), 632]

# One slice of SLICE_SIZE consecutive marbles per chosen position, stacked into
# a single-column frame that the SQL query below reads by name.
biased_sample = df({"Samples": concatenate([jar_of_marbles[start:start + SLICE_SIZE]
                                            for start in mp])})

# Share of each color in the biased sample. The divisor is the actual number of
# sampled marbles instead of a hard-coded 15, so it cannot drift out of sync
# with the slicing above.
sqldf(f"""Select Samples, cast(count(*) as float) / {len(biased_sample)} * 100 as percentage
      from biased_sample 
      group by 1""")

Unnamed: 0,Samples,percentage
0,blue,20.0
1,red,80.0


## > The REPORTER's approach
From the jar of marbles, R takes a single random sample of 200 marbles, drawn without replacement, to estimate the proportion of each color.

In [4]:
# Number of marbles the reporter draws from the jar
REPORTER_SAMPLE_SIZE = 200

# Simple random sample over the whole jar, without replacement.
# `random_state` is passed explicitly because pandas' sampling relies on NumPy's
# random generator, which `random.seed(...)` does not seed; without it this
# cell would return a different sample on every run.
mixed_sample = df({"Samples": jar_of_marbles}).sample(
    n=REPORTER_SAMPLE_SIZE, replace=False, random_state=101
)

# Share of each color in the random sample. The divisor is the actual sample
# size rather than a hard-coded 200, so it always matches the draw above.
sqldf(f"""Select Samples, cast(count(*) as float) / {len(mixed_sample)} * 100 as percentage
      from mixed_sample 
      group by 1""")

Unnamed: 0,Samples,percentage
0,blue,57.0
1,red,43.0


# Source:
"How to Lie with Statistics" by Darrell Huff