# Tarea 02: 1R, 0R y Naive Bayes

## Peguntas a responder
    •    Cuál es la respuesta del 0R y el 1R al "query":  <(Outlook, Rainy), (Temperature, 76.3), (Humidity, 81.6), (Windy, False)>?
    •    Si el query tuviera un "missing value" en el atributo de "Humidity": <(Outlook, Rainy), (Temperature, 76.3), (Windy, False)>. Cuál sería la respuesta del 0R y cuál del 1R?
    •    Si quitamos las últimas 4 instancias de la tabla de la página 9, cuál sería la nueva tabla de errores del 1R y cuál sería la nueva regla ganadora?
    •    Con esta nueva tabla, cuál sería la respuesta del 1R al mismo query del primer punto?
    •    Si quitamos las últimas 4 instancias de la tabla de la página 9, cuál sería la nueva tabla de probabilidades del Naive Bayes?
    •    Con esta nueva tabla, cuál sería la respuesta del Naive Bayes al mismo query del primer punto?

# Codigo

## Importar Modulos

In [234]:
from collections import defaultdict
from pathlib import Path

import pandas as pd
from pprint import pprint
from scipy.stats import norm

## Importar Datos

In [4]:
csv_file = Path('..') / 'data' / 'weather_baseball.csv'
weather = pd.read_csv(csv_file,header=0)
weather

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,sunny,85,85,False,no
1,sunny,80,90,True,no
2,overcast,83,86,False,yes
3,rainy,70,96,False,yes
4,rainy,68,80,False,yes
5,rainy,65,70,True,no
6,overcast,64,65,True,yes
7,sunny,72,95,False,no
8,sunny,69,70,False,yes
9,rainy,75,80,False,yes


## Implementacion del 1R y del 0R

### Funciones de ayuda

In [168]:
def get_rule(df, attribute, target):
    rule = {'_err': 0}
    is_numerical = df[attribute].dtype in (pd.np.int64, pd.np.float64)
    if is_numerical:
        rule = numerical_rule(df, attribute, target)
    else:
        rule = categorical_rule(df, attribute, target)
    return is_numerical, rule

def categorical_rule(df, attribute, target):
    values = df[attribute].unique()
    target_values = df[target].unique()
    rule = {'_err': 0}
    for value in values:
        df2 = df[df[attribute] == value]
        best_choice = max(target_values, key=lambda x: len(df2[df2[target] == x]))
        rule[value] = best_choice
        rule['_err'] += len(df2[df2[target] != best_choice])
    return rule

def numerical_rule(df, attribute, target):
    attr_df = df[[attribute, target]].sort_values(attribute)
    counter = defaultdict(int)
    rule = []
    prev_attr = prev_target = None
    for idx, attr, target in attr_df.itertuples():
        counter[target] += 1
        if target != prev_target and counter[prev_target] >= 3:
            rule.append(((attr+prev_attr)/2, prev_target))
            counter.clear()
            counter[target] +=1
        prev_target = target
        prev_attr = attr
    rule.append((pd.np.inf, prev_target))
    rule = simplify_numerical(rule)
    return {'rule': rule}

def simplify_numerical(rule):
    prev_clause = None
    final_rule = []
    for clause in reversed(rule):
        if clause[1] == prev_clause:
            continue
        else:
            final_rule.insert(0, clause)
        prev_clause = clause[1]
    return final_rule

def predict_r(attribute, rule, data, is_numerical=False):
    if attribute not in data:
        return 'Error: Cannot predict if the data has no {} value.'.format(attribute)
    if is_numerical:
        return predict_numerical(attribute, rule, data)
    else:
        return predict_categorical(attribute, rule, data)

def predict_numerical(attribute, rule, data):
    value = data[attribute]
    for limit, prediction in rule['rule']:
        if value <= limit:
            return prediction

def predict_categorical(attribute, rule, data):
    value = data[attribute]
    return rule[value]

def num_errors(df, all_rules, attribute, target):
    rule = all_rules[attribute]
    if '_err' in rule:
        return (rule['_err'], len(rule))
    else:
        error = 0
        for row in df.itertuples():
            if row._asdict()[target] != predict_numerical(attribute, rule, row._asdict()):
                error += 1
        rule['_err'] = error
        return (rule['_err'], len(rule['rule']))

### Algoritmo principal

In [151]:
def one_r(df, target, print_rules=False):
    """Aprende de los datos y regresa la mejor regla a aplicar."""
    rules = {}
    for column in df.columns:
        if column == target:
            continue
        numerical, rule = get_rule(df, column, target)
        rules[column] = rule
    best_rule = min(rules, key=lambda x: num_errors(df, rules, x, target))
    if print_rules:
        pprint(rules, width=20)
    return best_rule, numerical, rules[best_rule]

def zero_r(df, target):
    """Regresa el valor objetivo que mas se repite."""
    return df[target].value_counts().index[0]

## Aplicacion a los datos

In [167]:
learned_rule = one_r(weather, 'Play')
learned_rule

Sin simplificar [(70.5, 'yes'), (77.5, 'yes'), (inf, 'no')]
Sin simplificar [(82.5, 'yes'), (95.5, 'no'), (inf, 'yes')]


('Humidity',
 False,
 {'rule': [(82.5, 'yes'), (95.5, 'no'), (inf, 'yes')], '_err': 3})

In [147]:
attribute, is_numerical, rule = learned_rule

#### Cuál es la respuesta del 0R y el 1R al "query": <(Outlook, Rainy), (Temperature, 76.3), (Humidity, 81.6), (Windy, False)>?

In [148]:
query = {'Outlook': 'rainy', 'Temperature': 76.3, 'Humidity': 81.6, 'Windy': False}
zero_r_result = zero_r(weather, 'Play')
one_r_result = predict_r(attribute, rule, query, is_numerical=True)
print('Resultado 0R: {}\nResultado 1R: {}'.format(zero_r_result, one_r_result))

Resultado 0R: yes
Resultado 1R: yes


#### Si el query tuviera un "missing value" en el atributo de "Humidity": <(Outlook, Rainy), (Temperature, 76.3), (Windy, False)>. Cuál sería la respuesta del 0R y cuál del 1R?

In [149]:
query = {'Outlook': 'rainy', 'Temperature': 76.3, 'Windy': False}
zero_r_result = zero_r(weather, 'Play')
one_r_result = predict_r(attribute, rule, query, is_numerical=True)
print('Resultado 0R: {}\nResultado 1R: {}'.format(zero_r_result, one_r_result))

Resultado 0R: yes
Resultado 1R: Error: Cannot predict if the data has no Humidity value.


#### Si quitamos las últimas 4 instancias de la tabla de la página 9, cuál sería la nueva tabla de errores del 1R y cuál sería la nueva regla ganadora?

In [153]:
weather[:-4]

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,sunny,85,85,False,no
1,sunny,80,90,True,no
2,overcast,83,86,False,yes
3,rainy,70,96,False,yes
4,rainy,68,80,False,yes
5,rainy,65,70,True,no
6,overcast,64,65,True,yes
7,sunny,72,95,False,no
8,sunny,69,70,False,yes
9,rainy,75,80,False,yes


In [156]:
weather[:-4].sort_values('Temperature')

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
6,overcast,64,65,True,yes
5,rainy,65,70,True,no
4,rainy,68,80,False,yes
8,sunny,69,70,False,yes
3,rainy,70,96,False,yes
7,sunny,72,95,False,no
9,rainy,75,80,False,yes
1,sunny,80,90,True,no
2,overcast,83,86,False,yes
0,sunny,85,85,False,no


In [169]:
learned_rule = one_r(weather[:-4], 'Play', print_rules=True)
learned_rule

{'Humidity': {'_err': 2,
              'rule': [(82.5,
                        'yes'),
                       (95.5,
                        'no'),
                       (inf,
                        'yes')]},
 'Outlook': {'_err': 2,
             'overcast': 'yes',
             'rainy': 'yes',
             'sunny': 'no'},
 'Temperature': {'_err': 3,
                 'rule': [(71.0,
                           'yes'),
                          (inf,
                           'no')]},
 'Windy': {False: 'yes',
           True: 'no',
           '_err': 3}}


('Humidity',
 False,
 {'rule': [(82.5, 'yes'), (95.5, 'no'), (inf, 'yes')], '_err': 2})

#### Con esta nueva tabla, cuál sería la respuesta del 1R al mismo query del primer punto?

In [170]:
attribute, is_numerical, rule = learned_rule
query = {'Outlook': 'rainy', 'Temperature': 76.3, 'Humidity': 81.6, 'Windy': False}
zero_r_result = zero_r(weather, 'Play')
one_r_result = predict_r(attribute, rule, query, is_numerical=True)
print('Resultado 0R: {}\nResultado 1R: {}'.format(zero_r_result, one_r_result))

Resultado 0R: yes
Resultado 1R: yes


## Naive Bayes

### Funciones de Ayuda 

In [321]:
def freq(df, attr, target, laplace_fix=True):
    df = df.dropna(subset=[attr])
    attr_freqs = {}
    for attr_value in df[attr].unique():
        attr_freqs[attr_value] = {}
        for target_value in df[target].unique():
            df1 = df[(df[attr] == attr_value) & (df[target] == target_value)]
            total = len(df[df[target] == target_value])
            if laplace_fix:
                total += len(df[attr].unique())
                freq = (len(df1) + 1) / total
            else:
                freq = len(df1) / total
            attr_freqs[attr_value][target_value] = freq
    return attr_freqs

def numeric_stats(df, attr, target):
    numeric_stats = {}
    for target_value in df[target].unique():
        values = df[df[target] == target_value][attr]
        mean = float(values.values.mean())
        stdev = float(values.values.std(ddof=1))
        numeric_stats[target_value] = mean, stdev
    return numeric_stats

def gaussian_probability(x, mean, stdev):
    return norm.pdf(x, loc=mean, scale=stdev)
    
def predict_bayes(bayes_table, query, target, print_likelihoods=False):
    likelihoods = {}
    for tgt_val in bayes_table[target]:
        likelihood = 1.0
        for attr in query:
            if type(query[attr]) in (int, float):
                evidence = gaussian_probability(query[attr], *bayes_table[attr][tgt_val])
                print(evidence)
                likelihood *= evidence
            else:
                evidence =  bayes_table[attr][query[attr]][tgt_val]
                print(evidence)
                likelihood *= evidence
        apriori = bayes_table[target][tgt_val]
        print(apriori)
        likelihood *= apriori
        likelihoods[tgt_val] = likelihood
    if print_likelihoods:
        pprint(likelihoods)
    best_choice = max(likelihoods, key=lambda x: likelihoods[x])
    return best_choice, likelihoods[best_choice]

### Algoritmo Principal

In [322]:
def naive_bayes(df, target, print_rules=False, laplace_fix=True):
    bayes_table = {}
    for col in df.columns:
        if col == target:
            bayes_table[target] = {}
            for tgt_value in df[target].unique():
                bayes_table[target][tgt_value] = len(df[df[target] == tgt_value]) / len(df)
        elif df[col].dtype in (pd.np.int64, pd.np.float64):
            bayes_table[col] = numeric_stats(df, col, target)
        else:
            bayes_table[col] = freq(df, col, target, laplace_fix)
    if print_rules:
        pprint(bayes_table)
    return bayes_table

In [313]:
table = naive_bayes(weather, 'Play', True)

{'Humidity': {'no': (86.2, 9.731392500562292),
              'yes': (79.11111111111111, 10.215728613814635)},
 'Outlook': {'overcast': {'no': 0.125, 'yes': 0.4166666666666667},
             'rainy': {'no': 0.375, 'yes': 0.3333333333333333},
             'sunny': {'no': 0.5, 'yes': 0.25}},
 'Play': {'no': 0.35714285714285715, 'yes': 0.6428571428571429},
 'Temperature': {'no': (74.6, 7.893034904268446),
                 'yes': (73.0, 6.164414002968976)},
 'Windy': {False: {'no': 0.42857142857142855, 'yes': 0.6363636363636364},
           True: {'no': 0.5714285714285714, 'yes': 0.36363636363636365}}}


#### Si quitamos las últimas 4 instancias de la tabla de la página 9, cuál sería la nueva tabla de probabilidades del Naive Bayes?

In [314]:
table = naive_bayes(weather[:-4], 'Play', True)

{'Humidity': {'no': (85.0, 10.801234497346433),
              'yes': (79.5, 11.095043938624128)},
 'Outlook': {'overcast': {'no': 0.14285714285714285, 'yes': 0.3333333333333333},
             'rainy': {'no': 0.2857142857142857, 'yes': 0.4444444444444444},
             'sunny': {'no': 0.5714285714285714, 'yes': 0.2222222222222222}},
 'Play': {'no': 0.4, 'yes': 0.6},
 'Temperature': {'no': (75.5, 8.812869377601524),
                 'yes': (71.5, 6.6558245169174945)},
 'Windy': {False: {'no': 0.5, 'yes': 0.75}, True: {'no': 0.5, 'yes': 0.25}}}


#### Con esta nueva tabla, cuál sería la respuesta del Naive Bayes al mismo query del primer punto?

In [315]:
query = {'Outlook': 'rainy', 'Temperature': 76.3, 'Humidity': 81.6, 'Windy': False}
prediction = predict_bayes(table, query,'Play', True)
print("Bayes Result: {} with a likelihood of {}".format(*prediction))

{'no': 9.054944602792232e-05, 'yes': 0.00032644037417296816}
Bayes Result: yes with a likelihood of 0.00032644037417296816


#### Revisando ejemplos de las diapositivas

In [324]:
query = {'Outlook': 'sunny', 'Temperature': 66, 'Humidity': 90, 'Windy': True}
table = naive_bayes(weather[:-4], 'Play', True,laplace_fix=False)
prediction = predict_bayes(table, query,'Play', True,)
print("Bayes Result: {} with a likelihood of {}".format(*prediction))

{'Humidity': {'no': (85.0, 10.801234497346433),
              'yes': (79.5, 11.095043938624128)},
 'Outlook': {'overcast': {'no': 0.0, 'yes': 0.3333333333333333},
             'rainy': {'no': 0.25, 'yes': 0.5},
             'sunny': {'no': 0.75, 'yes': 0.16666666666666666}},
 'Play': {'no': 0.4, 'yes': 0.6},
 'Temperature': {'no': (75.5, 8.812869377601524),
                 'yes': (71.5, 6.6558245169174945)},
 'Windy': {False: {'no': 0.5, 'yes': 0.8333333333333334},
           True: {'no': 0.5, 'yes': 0.16666666666666666}}}
0.75
0.025320012213401355
0.03318219562524907
0.5
0.4
0.16666666666666666
0.04260204621576801
0.022977410196778368
0.16666666666666666
0.6
{'no': 0.00012602603977481693, 'yes': 1.6314744852030186e-05}
Bayes Result: no with a likelihood of 0.00012602603977481693
