In [1]:
import numpy as np
import pandas as pd

# Category vocabularies for each attribute. They are not referenced by the
# cells visible in this notebook (the probability tables derive their
# categories from the data itself), so they serve as a reference listing.
# 'systems' is spelled the same way as the department values in the data.
dept = ['sales', 'systems', 'marketing', 'secretary']
age = ['21-25', '26-30', '31-35', '36-40', '41-45', '46-50']
salary = ['26k-30k', '31k-35k', '36k-40k', '41k-45k', '46k-50k', '51k-55k', '56k-60k', '61k-65k', '66k-70k']
status = ['senior', 'junior']

# Load the training table; later cells read the 'department', 'status',
# 'age' and 'salary' columns from it.
data = pd.read_csv("data.csv")
data

Unnamed: 0,department,status,age,salary
0,sales,senior,31-35,46k-50k
1,sales,junior,26-30,26k-30k
2,sales,junior,31-35,31k-35k
3,systems,junior,21-25,46k-50k
4,systems,senior,31-35,66k-70k
5,systems,junior,26-30,46k-50k
6,systems,senior,41-45,66k-70k
7,marketing,senior,36-40,46k-50k
8,marketing,junior,31-35,41k-45k
9,secretary,senior,46-50,36k-40k


In [6]:
# Training
# Additive (Laplace) smoothing constant used by default in every probability
# estimate of this notebook.
laplace = 1


def prior_prob(data, smoothing=None):
    """Estimate the smoothed prior probability of each status class.

    Parameters
    ----------
    data : pandas.DataFrame
        Training table; only its 'status' column is read.
    smoothing : number, optional
        Additive smoothing constant. When omitted, the module-level
        ``laplace`` value is used.

    Returns
    -------
    numpy.ndarray
        One probability per distinct status, in the order produced by
        ``data['status'].unique()``. Each entry is
        ``(class_count + smoothing) / (n_rows + n_classes * smoothing)``.
    """
    if smoothing is None:
        smoothing = laplace

    status_column = data['status']
    status_unique = status_column.unique()

    # Count the rows of every class with a vectorised boolean sum.
    class_counts = np.array(
        [(status_column == label).sum() for label in status_unique],
        dtype=float,
    )

    # The denominator adds one pseudo-count per *observed* class; a fixed
    # factor of 2 would only normalise correctly for exactly two classes.
    total_sum = len(status_column) + len(status_unique) * smoothing
    return (class_counts + smoothing) / total_sum
        
# Compute the class priors from the loaded data and display them.
# NOTE: the variable name is misspelled, but `prediction` reads the array
# under exactly this name, so it must stay as is.
prior_probabality = prior_prob(data)
prior_probabality

array([0.46153846, 0.53846154])

In [7]:
def dept_probs(data, smoothing=None):
    """Build the smoothed table of P(department | status).

    Parameters
    ----------
    data : pandas.DataFrame
        Training table; its 'status' and 'department' columns are read.
    smoothing : number, optional
        Additive smoothing constant. When omitted, the module-level
        ``laplace`` value is used.

    Returns
    -------
    pandas.DataFrame
        Rows are the distinct statuses and columns the distinct departments,
        both in order of first appearance in ``data``. Each cell is
        ``(joint_count + smoothing) / (class_count + n_departments * smoothing)``.
        Only departments that occur in ``data`` get a column.
    """
    if smoothing is None:
        smoothing = laplace

    status_unique = data['status'].unique()
    dept_unique = data['department'].unique()
    cond_prob = np.zeros((len(status_unique), len(dept_unique)))

    for a, status_label in enumerate(status_unique):
        in_class = data['status'] == status_label
        # The smoothed class size does not depend on the department, so it is
        # computed once per class instead of once per cell.
        total = in_class.sum() + len(dept_unique) * smoothing
        for b, dept_label in enumerate(dept_unique):
            count = (in_class & (data['department'] == dept_label)).sum() + smoothing
            cond_prob[a, b] = count / total

    return pd.DataFrame(cond_prob, columns=dept_unique, index=status_unique)
# Build the department-given-status table from the loaded data and display
# it; `prediction` looks this frame up by name.
dept_prob_df = dept_probs(data)
dept_prob_df

Unnamed: 0,sales,systems,marketing,secretary
senior,0.222222,0.333333,0.222222,0.222222
junior,0.3,0.3,0.2,0.2


In [8]:
def age_probs(data, smoothing=None):
    """Build the smoothed table of P(age bracket | status).

    Parameters
    ----------
    data : pandas.DataFrame
        Training table; its 'status' and 'age' columns are read.
    smoothing : number, optional
        Additive smoothing constant. When omitted, the module-level
        ``laplace`` value is used.

    Returns
    -------
    pandas.DataFrame
        Rows are the distinct statuses and columns the distinct age values,
        both in order of first appearance in ``data``. Each cell is
        ``(joint_count + smoothing) / (class_count + n_ages * smoothing)``.
        Only age values that occur in ``data`` get a column.
    """
    if smoothing is None:
        smoothing = laplace

    status_unique = data['status'].unique()
    age_unique = data['age'].unique()
    cond_prob = np.zeros((len(status_unique), len(age_unique)))

    for a, status_label in enumerate(status_unique):
        in_class = data['status'] == status_label
        # The smoothed class size does not depend on the age value, so it is
        # computed once per class instead of once per cell.
        total = in_class.sum() + len(age_unique) * smoothing
        for b, age_label in enumerate(age_unique):
            count = (in_class & (data['age'] == age_label)).sum() + smoothing
            cond_prob[a, b] = count / total

    return pd.DataFrame(cond_prob, columns=age_unique, index=status_unique)
# Build the age-given-status table from the loaded data and display it;
# `prediction` looks this frame up by name.
age_prob_df = age_probs(data)
age_prob_df

Unnamed: 0,31-35,26-30,21-25,41-45,36-40,46-50
senior,0.272727,0.090909,0.090909,0.181818,0.181818,0.181818
junior,0.25,0.333333,0.166667,0.083333,0.083333,0.083333


In [9]:
def salary_probs(data, smoothing=None):
    """Build the smoothed table of P(salary bracket | status).

    Parameters
    ----------
    data : pandas.DataFrame
        Training table; its 'status' and 'salary' columns are read.
    smoothing : number, optional
        Additive smoothing constant. When omitted, the module-level
        ``laplace`` value is used.

    Returns
    -------
    pandas.DataFrame
        Rows are the distinct statuses and columns the distinct salary
        values, both in order of first appearance in ``data``. Each cell is
        ``(joint_count + smoothing) / (class_count + n_salaries * smoothing)``.
        Only salary values that occur in ``data`` get a column.
    """
    if smoothing is None:
        smoothing = laplace

    status_unique = data['status'].unique()
    salary_unique = data['salary'].unique()
    cond_prob = np.zeros((len(status_unique), len(salary_unique)))

    for a, status_label in enumerate(status_unique):
        in_class = data['status'] == status_label
        # The smoothed class size does not depend on the salary value, so it
        # is computed once per class instead of once per cell.
        total = in_class.sum() + len(salary_unique) * smoothing
        for b, salary_label in enumerate(salary_unique):
            count = (in_class & (data['salary'] == salary_label)).sum() + smoothing
            cond_prob[a, b] = count / total

    return pd.DataFrame(cond_prob, columns=salary_unique, index=status_unique)
# Build the salary-given-status table from the loaded data and display it;
# `prediction` looks this frame up by name.
salary_prob_df = salary_probs(data)
salary_prob_df

Unnamed: 0,46k-50k,26k-30k,31k-35k,66k-70k,41k-45k,36k-40k
senior,0.272727,0.090909,0.090909,0.272727,0.090909,0.181818
junior,0.25,0.25,0.166667,0.083333,0.166667,0.083333


In [12]:
def prediction(dept, age, salary):
    """Classify an employee as 'senior' or 'junior' with Naive Bayes.

    Reads the module-level ``prior_probabality`` array and the
    ``dept_prob_df``, ``age_prob_df`` and ``salary_prob_df`` tables.

    Parameters
    ----------
    dept, age, salary : str
        Attribute values; each must be a column label of the corresponding
        probability table, otherwise the lookup raises ``KeyError``.

    Returns
    -------
    str
        'senior' when its unnormalised posterior is strictly larger than
        the one for 'junior', otherwise 'junior'.
    """
    # The prior array and the table rows are both ordered by
    # data.status.unique(), so the row position of a label in the table is
    # also its position in the prior array. Looking the position up avoids
    # assuming that 'senior' happens to be the first status in the data.
    class_labels = list(dept_prob_df.index)

    def posterior(label):
        # Unnormalised posterior: prior times the three class-conditional
        # likelihoods, each read with a single label-based lookup.
        return (
            prior_probabality[class_labels.index(label)]
            * dept_prob_df.loc[label, dept]
            * age_prob_df.loc[label, age]
            * salary_prob_df.loc[label, salary]
        )

    # Return a plain string in both branches (a trailing comma after the
    # value would turn it into a one-element tuple). Ties go to 'junior'.
    if posterior('senior') > posterior('junior'):
        return 'senior'
    return 'junior'

In [13]:
# Query: department 'marketing', age '31-35', salary '46k-50k'.
prediction('marketing', '31-35', '46k-50k')

'senior'

In [14]:
# Query: department 'sales', age '31-35', salary '66k-70k'.
prediction('sales', '31-35', '66k-70k')

'senior'

In [15]:
# Query: department 'systems', age '26-30', salary '46k-50k'.
prediction('systems', '26-30', '46k-50k')

'junior'