In [1]:
import pandas as pd

In [2]:
ds = pd.read_csv('cmc.data.csv')
ds.head()

Unnamed: 0,24,2,3,3.1,1,1.1,2.1,3.2,0,1.2
0,45,1,3,10,1,1,3,4,0,1
1,43,2,3,7,1,1,3,4,0,1
2,42,3,2,9,1,1,3,3,0,1
3,36,3,3,8,1,1,3,2,0,1
4,19,4,4,0,1,1,3,3,0,1
...,...,...,...,...,...,...,...,...,...,...
1467,33,4,4,2,1,0,2,4,0,3
1468,33,4,4,3,1,1,1,4,0,3
1469,39,3,3,8,1,0,1,4,0,3
1470,33,3,3,4,1,0,2,2,0,3


In [3]:
ds.columns = ["wife_age", "wife_education", "husband_education", "number_of_children", "wife_religion", "wife_work", "husband_occupation", "standard_of_living", "media_exposure", "contraceptive_method_used"]
ds.head()

Unnamed: 0,wife_age,wife_education,husband_education,number_of_children,wife_religion,wife_work,husband_occupation,standard_of_living,media_exposure,contraceptive_method_used
0,45,1,3,10,1,1,3,4,0,1
1,43,2,3,7,1,1,3,4,0,1
2,42,3,2,9,1,1,3,3,0,1
3,36,3,3,8,1,1,3,2,0,1
4,19,4,4,0,1,1,3,3,0,1


In [4]:
ds.shape

(1472, 10)

In [5]:
# print max and min no of children
print("Max no of children: ", ds['number_of_children'].max())
print("Min no of children: ", ds['number_of_children'].min())

Max no of children:  16
Min no of children:  0


In [6]:
# split age 0: young 1:middle 2:old

ds['wife_age'] = pd.cut(ds['wife_age'], bins=[0, 25, 40, 200], labels=[0, 1, 2])


In [7]:
# group children as 0-3, 4-6, 7-10, 10+
ds['number_of_children'] = pd.cut(ds['number_of_children'], bins=[-1, 3, 6, 10, 200], labels=[0, 1, 2, 3])


In [8]:
ds.head()

Unnamed: 0,wife_age,wife_education,husband_education,number_of_children,wife_religion,wife_work,husband_occupation,standard_of_living,media_exposure,contraceptive_method_used
0,2,1,3,2,1,1,3,4,0,1
1,2,2,3,2,1,1,3,4,0,1
2,2,3,2,2,1,1,3,3,0,1
3,1,3,3,2,1,1,3,2,0,1
4,0,4,4,0,1,1,3,3,0,1


In [9]:
# check if nan values is present

ds.isnull().values.any()

False

In [10]:
# probability of wife age
age_rows = ds['wife_age'].cat.categories
age_dict = {}

for i in age_rows:
    age_dict[i] = len(ds[ds['wife_age'] == i]) / len(ds)

age_dict

{0: 0.24116847826086957, 1: 0.5509510869565217, 2: 0.2078804347826087}

In [11]:
# print datatypes of all columns
ds.dtypes

wife_age                     category
wife_education                  int64
husband_education               int64
number_of_children           category
wife_religion                   int64
wife_work                       int64
husband_occupation              int64
standard_of_living              int64
media_exposure                  int64
contraceptive_method_used       int64
dtype: object

In [12]:
# probability of wife education
education_rows = ds['wife_education'].unique()
education_dict = {}
education_rows.sort()

for i in education_rows:
    education_dict[i] = len(ds[ds['wife_education'] == i]) / len(ds)

education_dict

{1: 0.10326086956521739,
 2: 0.2262228260869565,
 3: 0.27853260869565216,
 4: 0.3919836956521739}

In [13]:
# probability of husband education
husband_education_rows = ds['husband_education'].unique()
husband_education_dict = {}
husband_education_rows.sort()

for i in husband_education_rows:
    husband_education_dict[i] = len(ds[ds['husband_education'] == i]) / len(ds)

husband_education_dict

{1: 0.029891304347826088,
 2: 0.12092391304347826,
 3: 0.23845108695652173,
 4: 0.610733695652174}

In [14]:
# probability of wife religion

religion_rows = ds['wife_religion'].unique()
religion_dict = {}
religion_rows.sort()

for i in religion_rows:
    religion_dict[i] = len(ds[ds['wife_religion'] == i]) / len(ds)

religion_dict

{0: 0.14945652173913043, 1: 0.8505434782608695}

In [15]:
# probability of wife work depends on wife education

work_rows = ds['wife_work'].unique()
work_dict = {}
prob_work = {}

for i in work_rows:
    ls = {}
    temp = 0
    for j in education_rows:
        ls[j] = len(ds[(ds['wife_work'] == i) & (ds['wife_education'] == j)]) / len(ds[ds['wife_education'] == j])
        temp += ls[j] * education_dict[j]
    work_dict[i] = ls
    prob_work[i] = temp

print(work_dict)
print(prob_work)


{1: {1: 0.7697368421052632, 2: 0.7747747747747747, 3: 0.7829268292682927, 4: 0.7053726169844021}, 0: {1: 0.23026315789473684, 2: 0.22522522522522523, 3: 0.21707317073170732, 4: 0.29462738301559793}}
{1: 0.7493206521739131, 0: 0.250679347826087}


In [16]:
# probability of husband occupation depends on husband education

occupation_rows = ds['husband_occupation'].unique()
occupation_dict = {}
prob_occupation = {}

for i in occupation_rows:
    ls = {}
    temp = 0
    for j in husband_education_rows:
        ls[j] = len(ds[(ds['husband_occupation'] == i) & (ds['husband_education'] == j)]) / len(ds[ds['husband_education'] == j])
        temp += ls[j] * husband_education_dict[j]
    occupation_dict[i] = ls
    prob_occupation[i] = temp

print(prob_occupation)
# pretty print occupation_dict
import pprint
pprint.pprint(occupation_dict)


{3: {1: 0.5, 2: 0.550561797752809, 3: 0.5498575498575499, 4: 0.3025583982202447}, 2: {1: 0.4090909090909091, 2: 0.37640449438202245, 3: 0.3504273504273504, 4: 0.24026696329254726}, 1: {1: 0.022727272727272728, 2: 0.033707865168539325, 3: 0.08547008547008547, 4: 0.44382647385984425}, 4: {1: 0.06818181818181818, 2: 0.03932584269662921, 3: 0.014245014245014245, 4: 0.013348164627363738}}
{3: 0.39741847826086957, 2: 0.28804347826086957, 1: 0.2961956521739131, 4: 0.018342391304347824}
{1: {1: 0.022727272727272728,
     2: 0.033707865168539325,
     3: 0.08547008547008547,
     4: 0.44382647385984425},
 2: {1: 0.4090909090909091,
     2: 0.37640449438202245,
     3: 0.3504273504273504,
     4: 0.24026696329254726},
 3: {1: 0.5,
     2: 0.550561797752809,
     3: 0.5498575498575499,
     4: 0.3025583982202447},
 4: {1: 0.06818181818181818,
     2: 0.03932584269662921,
     3: 0.014245014245014245,
     4: 0.013348164627363738}}


In [32]:
# probability of standard of living depends on wife work and husband work

living_rows = ds['standard_of_living'].unique()
living_dict = {}
prob_living = {}
# sort living rows
living_rows.sort()

for i in living_rows:
    ls = {}
    temp = 0
    for j in work_rows:
        ls2 = {}
        for k in occupation_rows:
            ls2[k] = len(ds[(ds['standard_of_living'] == i) & (ds['wife_work'] == j) & (ds['husband_occupation'] == k)]) / len(ds[(ds['wife_work'] == j) & (ds['husband_occupation'] == k)])
            temp += ls2[k] * prob_work[j] * prob_occupation[k]
        ls[j] = ls2
    living_dict[i] = ls
    prob_living[i] = temp

# print(living_dict)

# pretty print living_dict
for i in living_dict:
    print(i, ":")
    for j in living_dict[i]:
        print("\t", j, ":")
        for k in living_dict[i][j]:
            print("\t\t", k, ": ", living_dict[i][j][k])



1 :
	 1 :
		 3 :  0.1434878587196468
		 2 :  0.11326860841423948
		 1 :  0.018461538461538463
		 4 :  0.1875
	 0 :
		 3 :  0.09090909090909091
		 2 :  0.043478260869565216
		 1 :  0.009009009009009009
		 4 :  0.18181818181818182
2 :
	 1 :
		 3 :  0.1986754966887417
		 2 :  0.1715210355987055
		 1 :  0.08
		 4 :  0.1875
	 0 :
		 3 :  0.23484848484848486
		 2 :  0.19130434782608696
		 1 :  0.02702702702702703
		 4 :  0.09090909090909091
3 :
	 1 :
		 3 :  0.33774834437086093
		 2 :  0.313915857605178
		 1 :  0.24
		 4 :  0.25
	 0 :
		 3 :  0.3409090909090909
		 2 :  0.2
		 1 :  0.26126126126126126
		 4 :  0.09090909090909091
4 :
	 1 :
		 3 :  0.3200883002207506
		 2 :  0.40129449838187703
		 1 :  0.6615384615384615
		 4 :  0.375
	 0 :
		 3 :  0.3333333333333333
		 2 :  0.5652173913043478
		 1 :  0.7027027027027027
		 4 :  0.6363636363636364


In [18]:
# find probability of no of children

children_rows = ds['number_of_children'].unique()
children_dict = {}
prob_children = {}

for i in children_rows:
    ls = {}
    temp = 0
    for j in living_rows:
        ls[j] = len(ds[(ds['number_of_children'] == i) & (ds['standard_of_living'] == j)]) / len(ds[ds['standard_of_living'] == j])
        temp += ls[j] * prob_living[j]
    children_dict[i] = ls
    prob_children[i] = temp

print(children_dict)

{2: {1: 0.11627906976744186, 2: 0.08296943231441048, 3: 0.09534883720930233, 4: 0.07017543859649122}, 0: {1: 0.6356589147286822, 2: 0.5851528384279476, 3: 0.6116279069767442, 4: 0.6257309941520468}, 1: {1: 0.23255813953488372, 2: 0.31877729257641924, 3: 0.27906976744186046, 4: 0.29385964912280704}, 3: {1: 0.015503875968992248, 2: 0.013100436681222707, 3: 0.013953488372093023, 4: 0.01023391812865497}}


In [19]:
# find probability of media exposure, does not depend on anything

media_rows = ds['media_exposure'].unique()
media_dict = {}
prob_media = {}

for i in media_rows:
    media_dict[i] = len(ds[ds['media_exposure'] == i]) / len(ds)
    prob_media[i] = media_dict[i]

print(media_dict)

{0: 0.9259510869565217, 1: 0.07404891304347826}


In [21]:
ds.head()

Unnamed: 0,wife_age,wife_education,husband_education,number_of_children,wife_religion,wife_work,husband_occupation,standard_of_living,media_exposure,contraceptive_method_used
0,2,1,3,2,1,1,3,4,0,1
1,2,2,3,2,1,1,3,4,0,1
2,2,3,2,2,1,1,3,3,0,1
3,1,3,3,2,1,1,3,2,0,1
4,0,4,4,0,1,1,3,3,0,1


In [27]:
# probability of contraceptive method
# depends on media_exposure and no_of_children


method_rows = ds['contraceptive_method_used'].unique()
method_dict = {}
prob_method = {}

for i in method_rows:
    ls = {}
    temp = 0
    for j in media_rows:
        ls2 = {}
        for k in children_rows:
            ls2[k] = len(ds[(ds['contraceptive_method_used'] == i) & (ds['media_exposure'] == j) & (ds['number_of_children'] == k)]) / len(ds[(ds['media_exposure'] == j) & (ds['number_of_children'] == k)])
            temp += ls2[k] * prob_media[j] * prob_children[k]
        ls[j] = ls2
    method_dict[i] = ls
    prob_method[i] = temp

print(prob_method)

{1: 0.42422090100141174, 2: 0.2270201346784538, 3: 0.3487589643201346}


In [30]:
# print percentage of each contraceptive method

for i in prob_method:
    print(i, ": ", prob_method[i] * 100, "%")
    

1 :  42.422090100141176 %
2 :  22.70201346784538 %
3 :  34.87589643201346 %
