In [129]:
import csv

In [130]:
#CSV file the rule is built from; passed to build_rule() in the last cell of the notebook
build_filename = 'play_data_set.csv'

In [131]:
def open_file(filename_in):
    '''Read a CSV file and split it into data rows and a header row.

    Parameters:
        filename_in: path of the CSV file to read.

    Returns:
        (data, headers) where headers is the first row of the file as a list
        of strings and data is a list of the remaining rows, each a list of
        strings.

    Raises:
        IndexError: if the file contains no rows at all (so there is no header).
        OSError: if the file cannot be opened.
    '''
    #the context manager closes the file even if parsing fails part-way through;
    #newline='' lets the csv module handle line endings inside quoted fields itself
    with open(filename_in, 'r', newline='') as f:
        data = list(csv.reader(f)) #this gives you a list of lists
    headers = data.pop(0) #take the first line as headers and remove it from data in one step
    return (data, headers)

In [132]:
#incorporated into function
#data , headers = open_file(test_filename) #load in file and unpack to data and headers

In [133]:
#incorporated into function
#data #check data looks ok

In [134]:
#load the headers here so this cell also works after Restart & Run All
#(the cell that used to define `headers` is now commented out)
headers = open_file(build_filename)[1]
headers #check the headers are ok

['weather', 'temperature', 'humidity', 'play']

In [135]:
def parse_data(data_in, headers_in):
    '''Count outcomes per value for every feature column.

    The last field of each row is treated as the outcome; every other column
    named in headers_in (all but the last header) is treated as a feature.

    Parameters:
        data_in: list of rows, each row a list of field values.
        headers_in: list of column names, in the same order as the row fields.

    Returns:
        dict of the form {column name: {value: {outcome: count}}}. Every inner
        outcome dict contains all outcomes seen in the data, with 0 for
        outcomes that never occur with that value.
    '''
    #collect the distinct outcomes and build a zeroed tally to copy for each new value
    outcome_labels = set([row[-1] for row in data_in])
    zero_counts = dict.fromkeys(outcome_labels, 0)

    summary = {}
    for col_index, col_name in enumerate(headers_in[:-1]): #skip the outcome column itself
        tallies = {}
        for row in data_in:
            value = row[col_index]
            if value not in tallies:
                tallies[value] = zero_counts.copy() #each value needs its own independent counter
            tallies[value][row[-1]] += 1
        summary[col_name] = tallies
    return summary

In [136]:
#incorporated into function
#results = parse_data(data,headers)

In [137]:
def check_columns(results_dict_in):
    '''Evaluate every column summary and return the results as a list.

    Parameters:
        results_dict_in: dict mapping column name to that column's summary
            dict (characteristic -> outcome counts).

    Returns:
        list of [column name, evaluate_column(summary)] pairs, one per column,
        in the iteration order of results_dict_in.
    '''
    #the key is the column name and the value is the dictionary summarising its characteristics and results
    return [[name, evaluate_column(summary)] for name, summary in results_dict_in.items()]

In [138]:
def evaluate_column(column_data):
    '''Work out the best rule for one column and how accurate it is.

    Parameters:
        column_data: dict mapping each characteristic (column value) to a dict
            of outcome -> count.

    Returns:
        (column_accuracy, rules) where rules maps each characteristic to the
        outcome chosen by find_rule, and column_accuracy is the fraction of
        records that the rules classify correctly. When there are no records
        to evaluate the accuracy is reported as 0.0.
    '''
    cum_max = 0
    total = 0
    rules = {}
    for characteristic, outcome in column_data.items(): #cycles through the characteristics and outcomes
        counts = outcome.values()
        cum_max += max(counts) #records hit by always predicting the most frequent outcome
        total += sum(counts) #all records with this characteristic
        rules[characteristic] = find_rule(outcome) #the outcome to predict for this characteristic
    #guard against an empty column summary (e.g. a file with headers but no rows),
    #which would otherwise divide by zero
    column_accuracy = cum_max / total if total else 0.0
    return (column_accuracy, rules)

In [139]:
def find_rule(outcome_dict_in):
    '''Return the most frequent outcome from a dict of outcome -> count.

    When several outcomes share the highest count, the one whose label sorts
    last wins, because the (count, label) pairs are ordered in descending order.
    '''
    #pair each count with its label so that sorting orders by count first, then label
    ranked = []
    for label, count in outcome_dict_in.items():
        ranked.append((count, label))
    ranked.sort(reverse=True) #highest count comes first
    top_count, top_label = ranked[0]
    return top_label

In [140]:
#incorporated into function
#column_results = check_columns(results)

In [141]:
#incorporated into function
#column_results

In [142]:
def best_column(column_results_in):
    '''Pick the column with the highest accuracy.

    Parameters:
        column_results_in: list of [column name, (accuracy, rules)] entries,
            as returned by check_columns.

    Returns:
        (column name, (accuracy, rules)) for the most accurate column. When
        several columns share the highest accuracy the earliest one is returned.

    Raises:
        IndexError: if column_results_in is empty.
    '''
    #start from the first entry (not the second) so a single-column result works
    best = column_results_in[0]
    for column in column_results_in[1:]:
        if column[1][0] > best[1][0]: #strictly greater, so earlier columns win ties
            best = column
    return tuple(best)

In [143]:
#incorporated into function
#best_column(column_results)

In [144]:
#Now implement a function to test another CSV file

In [145]:
def build_rule(filename_in):
    '''Build a single-column rule from a CSV file, print a summary and return it.

    Runs the whole pipeline: open_file -> parse_data -> check_columns ->
    best_column.

    Parameters:
        filename_in: path of the CSV file to build the rule from.

    Returns:
        (column name, rule) where rule maps each value of that column to the
        outcome to predict.
    '''
    rows, column_names = open_file(filename_in)
    summary = parse_data(rows, column_names)
    scored_columns = check_columns(summary)
    best_name, best_stats = best_column(scored_columns)
    best_accuracy, best_rules = best_stats
    print('Best Column: ', best_name)
    print('Accuracy: ', best_accuracy)
    print('Rule: ',best_rules)
    return (best_name, best_rules)

In [146]:
#now test out the rule
def test_rule(filename_in, rule_in):
    '''Score a rule against the records in a CSV file.

    Parameters:
        filename_in: path of the CSV file to test against. The last field of
            each row is treated as the true outcome.
        rule_in: tuple in the format (column name, rule), where rule is a dict
            mapping values of that column to the predicted outcome.

    Returns:
        Fraction of rows whose outcome matches the rule's prediction. Rows
        whose value does not appear in the rule count as misses. Returns 0.0
        when the file has no data rows.

    Raises:
        ValueError: if the rule's column name is not in the file's headers.
    '''
    data , headers = open_file(filename_in)
    column, rule = rule_in
    check_col_index = headers.index(column)
    if not data: #nothing to score; avoid dividing by zero
        return 0.0
    score = 0
    for item in data:
        value = item[check_col_index]
        #a value the rule was never built for cannot be predicted, so it counts as wrong
        if value in rule and rule[value] == item[-1]:
            score += 1
    return( score / len(data))

In [147]:
#NOTE: test_file is the same CSV as build_filename, so the score below is measured on the data the rule was built from
test_file = 'play_data_set.csv' 
test_rule(test_file, build_rule(build_filename)) #builds the rule from build_filename, then scores the returned rule against test_file

Best Column:  temperature
Accuracy:  0.8333333333333334
Rule:  {'hot': 'yes', 'mild': 'no', 'cold': 'yes'}


0.8333333333333334