In [324]:
import csv

In [325]:
def open_file(filename_in):
    '''Read a CSV file and split it into data rows and a header row.

    Parameters:
        filename_in: path of the CSV file to read.

    Returns:
        A tuple (data, headers) where headers is the first row of the file
        (a list of strings) and data is a list of the remaining rows, each a
        list of strings. Blank lines are dropped. An empty file yields
        ([], []).
    '''
    # The with-block guarantees the handle is closed even if parsing fails;
    # newline='' is what the csv module requires so that it can do its own
    # newline handling (including newlines inside quoted fields).
    with open(filename_in, 'r', newline='') as f:
        datafile = csv.reader(f)
        # Take the first row as headers; fall back to [] when the file is empty.
        headers = next(datafile, [])
        # csv.reader yields [] for blank lines; skip them so that code indexing
        # line[-1] later on does not fail on a trailing empty line.
        data = [line for line in datafile if line]
    return (data, headers)

In [326]:
def check_outcomes(data_in):
    '''Count the outcomes (last column) of the data and derive their probabilities.

    Parameters:
        data_in: iterable of rows; the last element of each row is the outcome.

    Returns:
        A tuple (summarised_outcomes, outcome_probabilities):
        summarised_outcomes maps each outcome to the number of rows with that
        outcome, and outcome_probabilities maps each outcome to that count
        divided by the total number of rows. Keys appear in the order the
        outcomes are first seen in the data. Empty input yields ({}, {}).
    '''
    summarised_outcomes = {}  # results dict for count of each outcome

    # Single pass over the rows: tally each outcome as it is encountered.
    # Counting this way (rather than via a set) keeps the key order stable
    # from run to run.
    for line in data_in:
        outcome = line[-1]
        summarised_outcomes[outcome] = summarised_outcomes.get(outcome, 0) + 1

    # Total number of rows seen, derived from the tallies.
    total = sum(summarised_outcomes.values())

    # These are the probabilities of the final column - used to normalise.
    # With no rows there are no keys, so no division by zero can occur.
    outcome_probabilities = {
        outcome: count / total for outcome, count in summarised_outcomes.items()
    }

    return summarised_outcomes, outcome_probabilities


In [327]:
def parse_data(data_in, headers_in):
    '''Summarise the data into per-column conditional frequencies.

    Parameters:
        data_in: list of rows; the last element of each row is the outcome.
        headers_in: list of column names; the last name belongs to the
            outcome column and is not used as a key in the results.

    Returns:
        A tuple (results, outcome_probabilities). results maps each
        non-outcome column name to a dictionary keyed by (outcome, value)
        tuples, whose value is the number of rows with that outcome and that
        column value divided by the number of rows with that outcome.
        outcome_probabilities is the second value returned by check_outcomes.
    '''
    summarised_outcomes, outcome_probabilities = check_outcomes(data_in)

    results = {}  # summarised probability data for all columns
    for i in range(len(headers_in) - 1):  # every column except the outcome column
        # Tally each (outcome, value) pair in one pass over the rows.
        pair_counts = {}
        for line in data_in:
            pair = (line[-1], line[i])
            pair_counts[pair] = pair_counts.get(pair, 0) + 1

        # Divide each pair count by the count of its outcome (pair[0]).
        results[headers_in[i]] = {
            pair: count / summarised_outcomes[pair[0]]
            for pair, count in pair_counts.items()
        }

    return results, outcome_probabilities

In [328]:
def check_probability(instance_in, weights_in, outcomes_in):
    '''Compute the un-normalised probability of each outcome for one instance.

    Parameters:
        instance_in: dictionary mapping a column name to the value observed
            for that column.
        weights_in: dictionary mapping a column name to a dictionary keyed by
            (outcome, value) tuples holding the weight for that pair.
        outcomes_in: dictionary mapping each outcome to its probability.

    Returns:
        A dictionary mapping each outcome to its probability multiplied by
        the weight of every (outcome, value) pair of the instance. A pair
        that is missing from the weights contributes a factor of 0.0.

    Raises:
        KeyError: if a column name in instance_in is not present in weights_in.
    '''
    results = {}
    for outcome_key, outcome_probability in outcomes_in.items():
        probability = outcome_probability
        for instance_key, instance_value in instance_in.items():
            # An unknown column name is left to raise KeyError (likely a typo),
            # but a pair with no recorded weight is treated as weight 0.0
            # instead of raising.
            column_weights = weights_in[instance_key]
            probability *= column_weights.get((outcome_key, instance_value), 0.0)
        results[outcome_key] = probability
    return results

In [329]:
def normalise_results(probability_dict_in):
    '''Scale a dictionary of un-normalised probabilities by their total.

    Parameters:
        probability_dict_in: dictionary mapping each key to an un-normalised
            probability.

    Returns:
        A new dictionary with the same keys where each value has been divided
        by the sum of all input values. If that sum is 0, every key maps to
        0.0 instead of raising ZeroDivisionError. An empty input yields {}.
    '''
    # Compute the total once instead of once per key.
    total = sum(probability_dict_in.values())
    if total == 0:
        # Nothing to scale by: report 0.0 for every key rather than dividing by zero.
        return {key: 0.0 for key in probability_dict_in}
    return {key: value / total for key, value in probability_dict_in.items()}
        

In [330]:
#build_filename = 'play_data_set.csv'

In [331]:
#data , headers = open_file(build_filename) 

In [332]:
#results, outcomes = parse_data(data,headers)

In [333]:
#results

In [334]:
#outcomes

In [335]:
#test_scenario = check_probability({"humidity": "dry", "temperature": "hot", "weather": "rainy"}, results, outcomes)

In [336]:
#normalise_results(test_scenario)

In [337]:
def run_model(filename_in, scenario_in):
    '''Build the model from a CSV file and print the normalised probability
    of each outcome for the given scenario.

    filename_in is passed to open_file; scenario_in is passed to
    check_probability as the instance to evaluate. Nothing is returned.'''
    rows, column_names = open_file(filename_in)
    weights, priors = parse_data(rows, column_names)
    raw_scores = check_probability(scenario_in, weights, priors)
    # One line of output per outcome.
    for outcome, probability in normalise_results(raw_scores).items():
        print('Normalised probability of {}: {}'.format(outcome, probability))

In [338]:
# Example run: build the model from 'play_data_set.csv' and print the normalised
# probability of each outcome for dry humidity, hot temperature and rainy weather.
run_model('play_data_set.csv', {"humidity": "dry", "temperature": "hot", "weather": "rainy"})

Normalised probability of no: 0.3333333333333333
Normalised probability of yes: 0.6666666666666666
