# Chapter 1: Probability

In [1]:
import pandas as pd
import numpy as np

In [2]:
gss = pd.read_csv('data/gss_bayes.csv', index_col=0)

In [3]:
gss.head()

Unnamed: 0_level_0,year,age,sex,polviews,partyid,indus10
caseid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1974,21.0,1,4.0,2.0,4970.0
2,1974,41.0,1,5.0,0.0,9160.0
5,1974,58.0,2,6.0,1.0,2670.0
6,1974,30.0,1,5.0,4.0,6870.0
7,1974,48.0,1,5.0,4.0,7860.0


The columns are

    caseid: Respondent id (which is the index of the table).

    year: Year when the respondent was surveyed.

    age: Respondent’s age when surveyed.

    sex: Male or female (coded 1 = male, 2 = female).

    polviews: Political views on a range from liberal to conservative.

    partyid: Political party affiliation, Democrat, Independent, or Republican.

    indus10: Code for the industry the respondent works in.


In [4]:
gss.shape

(49290, 6)

In [5]:
# The code for "Banking and related activities" is 6870
banker = gss.indus10 == 6870
banker.head()

caseid
1    False
2    False
5    False
6     True
7    False
Name: indus10, dtype: bool

In [6]:
banker.value_counts()

False    48562
True       728
Name: indus10, dtype: int64

In [7]:
banker.value_counts(normalize=True) * 100

False    98.523027
True      1.476973
Name: indus10, dtype: float64

### If we use the sum function on this Series, it treats True as 1 and False as 0, so the total is the number of bankers:

In [8]:
banker.sum()

728

In [9]:
# 728 of the 49,290 respondents are bankers; the mean of a Boolean Series
# is the fraction of True values, i.e. the fraction of bankers.
banker.mean()

# Equivalent: banker.sum() / 49290

0.014769730168391155

In [10]:
banker.sum() / banker.size

0.014769730168391155

In [11]:
# Format the fraction directly as a percentage instead of round(p, 3) * 100:
# multiplying an already-rounded float by 100 can expose binary floating-point
# artifacts (e.g. 0.014 * 100 == 1.4000000000000001). The `.1%` format spec
# scales by 100, rounds to one decimal place and appends the '%' sign.
print(f'If we choose a random person from the dataset, the probability they are a banker is about: {banker.mean():.1%}')

If we choose a random person from the dataset, the probability they are a banker is about: 1.5%


## The probability function

In [12]:
def prob(A):
    """Returns the probability of a proposition, A, as the fraction of True values.
       A: pandas Series with dtype bool
       returns: the mean of A
       raises: AssertionError if A is not a Series or its dtype is not bool
    """
    # `and` short-circuits, so .dtype is only read once A is known to be a Series.
    assert isinstance(A, pd.Series) and A.dtype == 'bool'
    fraction_true = A.mean()
    return fraction_true

In [13]:
prob(banker)

0.014769730168391155

In [14]:
prob(banker) * 100

1.4769730168391155

In [15]:
female = gss.sex == 2
female.head()

caseid
1    False
2    False
5     True
6    False
7    False
Name: sex, dtype: bool

In [16]:
prob(female)

0.5378575776019476

In [41]:
male = gss.sex == 1
male.head()

caseid
1     True
2     True
5    False
6     True
7     True
Name: sex, dtype: bool

In [42]:
prob(male)

0.46214242239805237

In [43]:
1 - prob(female)

0.46214242239805237

In [17]:
round(prob(female) * 100, 3)

53.786

### Political views and parties
I'll define `liberal` to be True for anyone whose response is "Extremely liberal", "Liberal", or "Slightly liberal".

In [18]:
liberal = gss.polviews <= 3
liberal.head()

caseid
1    False
2    False
5    False
6    False
7    False
Name: polviews, dtype: bool

In [19]:
prob(liberal)

0.27374721038750255

In [20]:
round((prob(liberal) * 100),3)

27.375

In [21]:
# Respondents with a partyid code of 1 or lower are counted as Democrats.
# NOTE(review): presumably these codes are the "Strong democrat" / "Not strong
# democrat" responses — confirm against the GSS codebook.
democrat = gss.partyid <= 1
democrat.head()

caseid
1    False
2     True
5     True
6    False
7    False
Name: partyid, dtype: bool

In [22]:
prob(democrat)

0.3662609048488537

In [23]:
round(prob(democrat) * 100, 3)

36.626

## Conjunction
If you have two <b>propositions</b>, A and B, the conjunction A and B is True if BOTH A and B are True, and False otherwise.

In [24]:
prob(banker & democrat)

0.004686548995739501

In [25]:
print(f'The probability of being a banker and a democrat is: {round(prob(banker & democrat) * 100, 3)} %')

The probability of being a banker and a democrat is: 0.469 %


#### Conjunction table

In [46]:
# Truth table for the conjunction: enumerate every combination of A and B.
A = pd.Series([True, True, False, False])
B = pd.Series([True, False, True, False])

# Build the whole table in one constructor call; the columns keep the
# insertion order of the dict.
table = pd.DataFrame({'A': A, 'B': B, 'A & B': A & B})
table

Unnamed: 0,A,B,A & B
0,True,True,True
1,True,False,False
2,False,True,False
3,False,False,False


### Conditional probability
A conditional probability is a probability that depends on a condition. For example: <br>
. What is the probability that a respondent is a Democrat, given that they are liberal? <br>
. What is the probability that a respondent is female, given that they are a banker? <br>
. What is the probability that a respondent is liberal, given that they are female?

In [26]:
print('To select liberal respondents, we can use the bracket operator, [], like this:')
selected = democrat[liberal]

To select liberal respondents, we can use the bracket operator, [], like this:


In [27]:
len(selected)

13493

In [28]:
prob(selected)

0.5206403320240125

In [29]:
def prob_100(A):
    '''Prints the mean of A as a percentage.
       A: Series whose mean is the probability (Boolean in this notebook's calls)
       prints: the mean times 100, rounded to 3 decimals, followed by ' %'
       returns: None
    '''
    percentage = round(A.mean() * 100, 3)
    print(f'{percentage} %')

In [30]:
prob_100(selected)

52.064 %


In [31]:
selected = female[banker]
prob(selected)

0.7706043956043956

In [32]:
prob_100(selected)

77.06 %


In [33]:
def conditional(proposition, given):
    """Computes the probability of a proposition restricted to a condition.
       proposition: Boolean series
       given: Boolean series used as a mask to select entries of proposition
       returns: prob() of the selected entries
    """
    # Keep only the entries where the condition holds, then take their probability.
    restricted = proposition[given]
    return prob(restricted)

In [34]:
conditional(liberal, given=female)

0.27581004111500884

About 27.58% of female respondents are liberal.

In [35]:
conditional(democrat, given=liberal)

0.5206403320240125

## Conditional probability is NOT commutative
Conjunction is commutative; that is, prob(A & B) is always equal to prob(B & A). <br>
But conditional probability is NOT commutative; that is, conditional(A, B) is generally not the same as conditional(B, A).

In [36]:
conditional(female, given=liberal)

0.5419106203216483

In [37]:
conditional(liberal, given=female)

0.27581004111500884

### Condition and conjunction
We can combine conditional probability and conjunction.

In [38]:
conditional(female, given=liberal & democrat)

0.576085409252669

About 57.61% of liberal democrats are female.

In [39]:
conditional(liberal & female, given=banker)

0.17307692307692307

About 17.31% of bankers are liberal women.