#  <font color='blue'> Review of probability theory using python </font> 
 
 
Install tabulate package: pip install tabulate on mac

We will use pandas to read a CSV file and to store data

Documentation of pandas https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html

In [39]:
import pandas as pd
import numpy as np


## Download student-mat.csv from ICON

The csv file is downloaded from Kaggle 
https://www.kaggle.com/uciml/student-alcohol-consumption/data?select=student-mat.csv


In [40]:

df = pd.read_csv('student-mat.csv')
df.head(10)  # Select the first 3 rows of data


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
5,GP,M,16,U,LE3,T,4,3,services,other,...,5,4,2,1,2,5,10,15,15,15
6,GP,M,16,U,LE3,T,2,2,other,other,...,4,4,4,1,1,3,0,12,12,11
7,GP,F,17,U,GT3,A,4,4,other,teacher,...,4,1,4,1,1,1,6,6,5,6
8,GP,M,15,U,LE3,A,3,2,services,other,...,4,2,2,1,1,1,0,16,18,19
9,GP,M,15,U,GT3,T,3,4,other,other,...,5,5,1,1,1,5,0,14,15,15


## Create a smaller data frame with only two columns

Grade A: G3  > 80%

Absences: high absences, if a student missed 10 or more classes.


In [41]:
newlist = pd.DataFrame()
newlist['Grade'] = np.where(df['G3']*5 >= 80, 1, 0)     # df short notation for data frame
newlist['Absences'] = np.where(df['absences'] >= 10, 1, 0)
newlist['count'] = 1
newlist.head(10)

Unnamed: 0,Grade,Absences,count
0,0,0,1
1,0,0,1
2,0,1,1
3,0,0,1
4,0,0,1
5,0,1,1
6,0,0,1
7,0,0,1
8,1,0,1
9,0,0,1


## Compute joint probabilities


In [42]:
Joint_table = pd.pivot_table(
    newlist, 
    values='count', 
    index=['Grade'], 
    columns=['Absences'], 
    aggfunc=np.size, 
    fill_value=0
)

print("Joint Histogram")
print("-------------------")
print(Joint_table)
print("-------------------")

Joint_table = Joint_table.to_numpy()
Joint_Probabilities = Joint_table/len(newlist)

print("Joint Probabilities")
print("-------------------")
print(Joint_Probabilities)
print("-------------------")


Joint Histogram
-------------------
Absences    0   1
Grade            
0         277  78
1          35   5
-------------------
Joint Probabilities
-------------------
[[0.70126582 0.19746835]
 [0.08860759 0.01265823]]
-------------------


## Compute marginal, and conditional probabilities

Marginal probability
$$P(X) = -\sum_j p(x_i,y_j)$$

Conditional probability P(X|Y=y): probability of x, if Y is fixed to a specific value
$$P(X|Y) = p(x,y)/p(y)$$


In [43]:
# P(A) Marginal probability of Absences: Sum along the grades axis (rows; axis = 0) 

PA = np.sum(Joint_Probabilities,axis=0)

print("Marginal probability of Absences")
print("-------------------")
print('P(A)', PA)
print("-------------------\n")


# Conditional probabilities of Grades, given A
# Broadcasting along the rows

PGgivenA = Joint_Probabilities/PA[None,:]

print("Conditional probability of Grades given Absences P(G|A)")
print("---------------------------------")
print(PGgivenA)
print("---------------------------------")


Marginal probability of Absences
-------------------
P(A) [0.78987342 0.21012658]
-------------------

Conditional probability of Grades given Absences P(G|A)
---------------------------------
[[0.88782051 0.93975904]
 [0.11217949 0.06024096]]
---------------------------------


## <font color=red> To do: Evaluate the probability of getting a grade A, if the student has been absent for more than ten times P(Grade=A|Absenses>=10)</font>

1. Compute 'P(Grade=A|Absenses>=10')

2. Compute 'P(Grade=A|Absenses<10')

3. Add the two; what do you expect to get if you add the two? i.e. P(Grade=A|Absenses>=10) + P(Grade=A|Absenses<10)

4. What will you get if you add P(Grade=A|Absenses>=10') and P(Grade<A|Absenses>=10')

In [44]:
# YOUR CODE HERE
newlist = pd.DataFrame()
newlist['Grade'] = np.where(df['G3']*5 == 80, 1, 0)     # df short notation for data frame
newlist['Absences'] = np.where(df['absences'] >= 10, 1, 0)
newlist['count'] = 1
newlist.head(10)

Joint_table = pd.pivot_table(
    newlist, 
    values='count', 
    index=['Grade'], 
    columns=['Absences'], 
    aggfunc=np.size, 
    fill_value=0
)

Joint_table = Joint_table.to_numpy()
Joint_Probabilities = Joint_table/len(newlist)

PA = np.sum(Joint_Probabilities,axis=0)

PGgivenA1 = Joint_Probabilities/PA[None,:]

print("P(Grade=A|Absenses>=10)=",PGgivenA1)

# YOUR CODE HERE
newlist['Absences'] = np.where(df['absences'] < 10, 1, 0)

# Conditional probabilities of Grades, given A
# Broadcasting along the rows

PGgivenA2 = Joint_Probabilities/PA[None,:]

print("P(Grade=A|Absenses<10)=",PGgivenA2)
# Was expecting to get 1.0 for the sum of the two probabilities
print("P(Grade=A|Absenses>=10) + P(Grade=A|Absenses<10)=",PGgivenA1+PGgivenA2)

# YOUR CODE HERE
newlist['Grade'] = np.where(df['G3']*5 < 80, 1, 0)     # df short notation for data frame
newlist['Absences'] = np.where(df['absences'] >= 10, 1, 0)

PGgivenA3 = Joint_Probabilities/PA[None,:]

print("P(Grade=A|Absenses>=10)=",PGgivenA3)

# Was expecting to get 1.0 for the sum of the two probabilities
print("P(Grade=A|Absenses>=10) + P(Grade<A|Absenses>=10)=",PGgivenA1+PGgivenA3)


P(Grade=A|Absenses>=10)= [[0.95192308 0.98795181]
 [0.04807692 0.01204819]]
P(Grade=A|Absenses<10)= [[0.93975904 0.88782051]
 [0.06024096 0.11217949]]
P(Grade=A|Absenses>=10) + P(Grade=A|Absenses<10)= [[1.89168211 1.87577232]
 [0.10831789 0.12422768]]
P(Grade=A|Absenses>=10)= [[0.11217949 0.06024096]
 [0.88782051 0.93975904]]
P(Grade=A|Absenses>=10) + P(Grade<A|Absenses>=10)= [[1.06410256 1.04819277]
 [0.93589744 0.95180723]]


## <font color=red> To do: compute the marginal probability P(g) and conditional probability P(A|G) </font>


In [46]:
# YOUR CODE HERE
# YOUR CODE HERE
newlist = pd.DataFrame()
newlist['Grade'] = np.where(df['G3']*5 == 80, 1, 0)     # df short notation for data frame
newlist['Absences'] = np.where(df['absences'] >= 10, 1, 0)
newlist['count'] = 1
newlist.head(10)

Joint_table = pd.pivot_table(
    newlist, 
    values='count', 
    index=['Grade'], 
    columns=['Absences'], 
    aggfunc=np.size, 
    fill_value=0
)

Joint_table = Joint_table.to_numpy()
Joint_Probabilities = Joint_table/len(newlist)

PG = np.sum(Joint_Probabilities,axis=1)
print("Margina probability of Grades P(G)=",PG)
PAgivenG = Joint_Probabilities/PG[None,:]

print("P(A|G)=",PAgivenG)


Margina probability of Grades P(G)= [0.95949367 0.04050633]
P(A|G)= [[0.78364116 5.125     ]
 [0.03957784 0.0625    ]]
