In [1]:
import pandas as pd
import numpy as np
import fnmatch

def clean_data(lines):
    """Strip line endings and leading tabs, then discard the first three lines.

    Args:
        lines: Raw text lines, e.g. the result of ``file.readlines()``.

    Returns:
        A list holding every line from the fourth onward, with trailing
        newline characters and leading tab characters removed.
    """
    stripped = [raw.rstrip('\n').lstrip('\t') for raw in lines]
    # Only the lines after the first three are handed back to the caller.
    return stripped[3:]


def parse_cases_ans(clean_lines):
    """Split the cleaned lines into one tuple per example case and answer choice.

    The lines 'B', 'C', '1', '2', '3', '4', '5' and '6' act as section markers.
    Each section is the run of lines between the first occurrence of the
    previous marker and the marker itself; the section for 'A' starts at
    index 1, and the section for '6' runs to the end of the list.

    Args:
        clean_lines: List of cleaned lines (see ``clean_data``).

    Returns:
        A list of nine tuples, in the order A, B, C, 1, 2, 3, 4, 5, 6.

    Raises:
        IndexError: If a section was never collected because its closing
            marker is absent.
        ValueError: If a marker is present but the marker before it is not.
    """
    markers = ('B', 'C', '1', '2', '3', '4', '5', '6')
    last_slot = len(markers) - 1
    # One bucket per section: A, B, C, 1..5 are closed by the eight markers,
    # and the final bucket holds the open-ended section after '6'.
    buckets = [[] for _ in range(len(markers) + 1)]

    for position, text in enumerate(clean_lines):
        if text not in markers:
            continue
        slot = markers.index(text)
        if slot == 0:
            # The first section starts right after the opening line at index 0.
            begin = 1
        else:
            begin = clean_lines.index(markers[slot - 1]) + 1
        buckets[slot].append(clean_lines[begin:position])
        if slot == last_slot:
            buckets[slot + 1].append(clean_lines[clean_lines.index(text) + 1:])

    # Only the first slice collected for each section is used.
    return [tuple(bucket[0]) for bucket in buckets]

#generate a dictionary with keys equal to case labels (A, B, and C), and values equal to associated data as a tuple
def gen_case_dict(parsed_cases):
    """Map the case labels 'A', 'B' and 'C' onto the parsed case data.

    Args:
        parsed_cases: Iterable of per-case data, in the order A, B, C.

    Returns:
        A dict pairing each label with the corresponding item of
        ``parsed_cases``. Pairing stops at the shorter of the two sequences.
    """
    # Use the argument itself rather than a module-level variable, so the
    # function works no matter what the caller named its data.
    return dict(zip(['A', 'B', 'C'], parsed_cases))
    
    
#generate a dictionary with keys equal to answer labels (1,2,3,4,5,and 6), and values equal to associated data as a tuple
def gen_ans_dict(parsed_ans):
    """Map the answer labels '1' through '6' onto the parsed answer data.

    Args:
        parsed_ans: Iterable of per-answer data, in the order 1 to 6.

    Returns:
        A dict pairing each label with the corresponding item of
        ``parsed_ans``. Pairing stops at the shorter of the two sequences.
    """
    # Use the argument itself rather than a module-level variable, so the
    # function works no matter what the caller named its data.
    return dict(zip(['1', '2', '3', '4', '5', '6'], parsed_ans))

def parse_shape_data(all_values):
    """Split every figure's flat attribute sequence into its Z, Y and X shapes.

    Each item is a sequence whose first element is skipped (presumably the
    'Z' label -- confirm against the input files) and in which the elements
    'Y' and 'X' mark where the next shape's attributes begin.

    Args:
        all_values: Iterable of sequences (tuples or lists), one per figure.

    Returns:
        A list with one ``(z_data, y_data, x_data)`` tuple per figure. Each
        element is the slice of attributes belonging to that shape, or the
        placeholder ``'-'`` when the figure has no such shape. An empty
        figure yields ``('0', '-', '-')``.
    """
    all_shapes = []
    for item in all_values:
        y_pos = item.index('Y') if 'Y' in item else None
        x_pos = item.index('X') if 'X' in item else None

        # Z runs from just after the first element up to the next label, or
        # to the end when Z is the only shape (a ``None`` bound means "end").
        z_end = y_pos if y_pos is not None else x_pos
        z_data = item[1:z_end] if item else '0'
        y_data = '-' if y_pos is None else item[y_pos + 1:x_pos]
        x_data = '-' if x_pos is None else item[x_pos + 1:]

        # Exactly one entry per shape slot per figure keeps the three slots
        # aligned across all figures.
        all_shapes.append((z_data, y_data, x_data))
    return all_shapes


def gen_sparse_shape_labels(parsed_cases):
    """Build a flat list of shape labels, one per shape slot of every case.

    Args:
        parsed_cases: Iterable of cases; each case is a sequence of shape
            entries where ``'-'`` marks a missing shape.

    Returns:
        A list with 'Z', 'Y' or 'X' for a present shape in position 0, 1
        or 2 respectively, and '-' for a missing shape or any later position.
    """
    slot_labels = {0: 'Z', 1: 'Y', 2: 'X'}
    labels = []
    for case in parsed_cases:
        for position, shape in enumerate(case):
            present = position in slot_labels and shape != '-'
            labels.append(slot_labels[position] if present else '-')
    return labels


def parse_shape_attribute(parsed_cases):
    """Collect the SHAPE attribute values found in every case.

    Args:
        parsed_cases: Iterable of cases; each case is an iterable of shapes,
            and each shape is an iterable of attribute strings (a ``'-'``
            placeholder shape yields the single entry ``'-'``).

    Returns:
        A list containing, for every entry that includes the text 'shape',
        that entry with its first six characters removed, and '-' for every
        entry equal to '-'. All other entries are skipped.
    """
    found = []
    for case in parsed_cases:
        for shape in case:
            for entry in shape:
                if 'shape' in entry:
                    # Drop the leading six characters (the 'shape:' prefix).
                    found.append(entry[6:])
                    continue
                if entry == '-':
                    found.append('-')
    return found


#this function parses the associated metadata for each attribute/feature (e.g. 'shape','size','fill')
#into its own sparse vector, to be added to the sparse matrix of all features
def parse_attribute_metadata(parsed_cases, attribute):
    """Extract one attribute's value for every shape slot of every case.

    Args:
        parsed_cases: Iterable of cases; each case is an iterable of shapes,
            and each shape is an iterable of ``'name:value'`` strings (or
            the ``'-'`` placeholder).
        attribute: Attribute name to look for, one of 'size', 'fill',
            'inside', 'above', 'overlaps', 'angle', 'left-of',
            'vertical-flip'.

    Returns:
        A flat list with one entry per shape: the text after the first ':'
        of the first entry matching ``attribute + '*'``, or '-' when the
        shape has no matching entry.
    """
    pattern = attribute + '*'
    values = []
    for case in parsed_cases:
        for shape in case:
            hits = fnmatch.filter(shape, pattern)
            values.append(hits[0].split(':')[1] if hits else '-')
    return values

def parse_attribute(clean_lines, parsed_cases, attribute):
    """Return the feature vector for ``attribute``, or placeholders if it is unused.

    Args:
        clean_lines: Cleaned lines of the problem file.
        parsed_cases: Parsed shape data passed through to
            ``parse_attribute_metadata``.
        attribute: Attribute name to search for.

    Returns:
        The result of ``parse_attribute_metadata(parsed_cases, attribute)``
        when some line longer than one character has ``attribute`` as the
        text before its first ':'; otherwise a list of 27 '-' placeholders.
    """
    for line in clean_lines:
        # Single-character lines are skipped; they cannot name an attribute.
        if len(line) != 1 and line.split(':')[0] == attribute:
            return parse_attribute_metadata(parsed_cases, attribute)
    return ['-'] * 27


def populate_dataframe(parsed_cases, features, delim, frame=None): #features = list of column titles, delim = list of attribute strings
    """Fill one column of the feature matrix per (feature, attribute) pair.

    Args:
        parsed_cases: Parsed shape data handed to ``parse_attribute_metadata``.
        features: Iterable of column titles to assign.
        delim: Iterable of attribute strings, paired positionally with
            ``features``.
        frame: Optional mapping-like target (e.g. a pandas DataFrame) that
            receives the columns. When omitted, the module-level
            ``sparse_mat`` is used, which is what existing three-argument
            callers rely on.

    Returns:
        The object the columns were written to.
    """
    # Passing ``frame`` explicitly avoids the dependency on a global created
    # elsewhere; the fallback keeps older call sites working.
    target = sparse_mat if frame is None else frame
    for feature, attribute in zip(features, delim):
        target[feature] = parse_attribute_metadata(parsed_cases, attribute)
    return target


#this function generates the relationships between rows (shapes) in the inputted cases
def get_relationships(dataframe, pair):  #Valid pair arguments: 'AB','C1','C2','C3','C4','C5','C6'
    """Describe, column by column, how each shape row changes between two cases.

    The frame is read positionally: rows are taken in groups of three, in
    the case order A, B, C, 1, 2, 3, 4, 5, 6. The first column is compared
    like any other; in the feature matrix built by this file it holds the
    case label, so it always differs between two cases.

    Args:
        dataframe: pandas DataFrame holding three rows per case.
        pair: Two-character string naming the cases to compare; one of
            'AB', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6'.

    Returns:
        A list with one entry per compared row pair. An entry is the string
        'ROW UNCHANGED' when only the first column differs; otherwise it is
        a list with one of 'unchanged', 'removed', 'added' or 'changed' per
        column.

    Raises:
        ValueError: If ``pair`` is not one of the supported values.
    """
    case_order = ('A', 'B', 'C', '1', '2', '3', '4', '5', '6')
    valid_pairs = ('AB', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6')
    rows_per_case = 3

    if pair not in valid_pairs:
        raise ValueError(
            f'pair must be one of {", ".join(valid_pairs)}; got {pair!r}'
        )

    # One tuple per frame row, e.g.
    # ('A', 'Z', 'circle', '-', 'no', '-', '-', '-', '-', '-', '-').
    # The row width comes from the frame itself, not a fixed column count.
    rows = [tuple(row[col] for col in dataframe) for _, row in dataframe.iterrows()]

    def rows_for(label):
        # Rows belonging to one case, located by the case's position.
        start = case_order.index(label) * rows_per_case
        return rows[start:start + rows_per_case]

    def describe(before, after):
        # '-' is the placeholder for "attribute absent".
        if before == after:
            return 'unchanged'
        if after == '-':
            return 'removed'
        if before == '-':
            return 'added'
        return 'changed'

    first_label, second_label = pair
    val_relationships = []
    for before_row, after_row in zip(rows_for(first_label), rows_for(second_label)):
        changes = [describe(before, after) for before, after in zip(before_row, after_row)]
        # A row where only the first column differs counts as unchanged.
        if changes == ['changed'] + ['unchanged'] * (len(changes) - 1):
            val_relationships.append('ROW UNCHANGED')
        else:
            val_relationships.append(changes)

    return val_relationships

        

In [2]:
from IPython.display import display, HTML

if __name__ == "__main__":
    # Driver: for every problem file, build the feature matrix, compare the
    # A->B transformation with each C->answer transformation, and report the
    # first answer whose transformation matches.
    # NOTE(review): the accuracy figure in the line below is a fixed string;
    # it is not computed from the predictions made in this run.
    print('The algorithm correctly selected the next image in the sequence 10/20 times')
    print('\n')
    predictions = []

    files = ['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18','19','20']
    test_file = ['04'] #change to test an individual file
    selected_files = files #change 'files' to 'test_file' to test an individual file

    divider = '--------------------------------------------------------------------------------------------------'

    # Layout of the feature matrix; identical for every problem file.
    examples = ['A']*3 + ['B']*3 + ['C']*3 #3 rows for each example case A, B and C
    options = ['1']*3 + ['2']*3 + ['3']*3 + ['4']*3 + ['5']*3 + ['6']*3 #3 rows for each answer option
    case_labels = examples + options #27 row labels in total
    all_features = ['CASE','SHAPE LABEL','SHAPE','SIZE','FILL','INSIDE','ABOVE','OVERLAPS','ANGLE','LEFT-OF','VERTICAL-FLIP']
    attribute_features = all_features[2:] #every column except 'CASE' and 'SHAPE LABEL', which are filled directly
    attribute_names = [name.lower() for name in attribute_features]

    for file in selected_files:

        #------------------------------------------READ AND CLEAN DATA---------------------------------------------------
        with open(r'C:\dev\python\BIA 662\HW1\2x1 Basic Problems [TXT]/2x1BasicProblem'+file+'.txt','r') as f:
            lines = f.readlines()
        answer = lines[2].rstrip('\n') #the third line of the file holds the expected answer
        print(f'PROBLEM #{file}')
        print('ANSWER: ' + answer)
        clean_lines = clean_data(lines)
        #----------------------------------------------------------------------------------------------------------------

        #----------------------------------------------PARSE DATA--------------------------------------------------------
        # NOTE: `cases`, `answers` and `sparse_mat` are deliberately kept under
        # these module-level names; helpers in this file may look them up as globals.
        parsed_sections = parse_cases_ans(clean_lines) #parse once, then split into examples and answers
        cases = parsed_sections[0:3] #example cases A, B, and C
        case_dict = gen_case_dict(cases)
        case_data = list(case_dict.values())

        answers = parsed_sections[3:] #answer options 1,2,3,4,5, and 6
        ans_dict = gen_ans_dict(answers)
        ans_data = list(ans_dict.values())

        all_values = case_data + ans_data

        parsed_cases = parse_shape_data(all_values)
        #----------------------------------------------------------------------------------------------------------------

        #--------------------------------------------POPULATE DATAFRAME--------------------------------------------------
        sparse_mat = pd.DataFrame(index=range(27), columns=all_features) #matrix will always be 27 rows

        #populate dataframe with all case labels (A,B,C,1,2,3,4,5,6)
        sparse_mat['CASE'] = case_labels

        #populate dataframe with all possible shape labels, and nulls (-) where no shape exists
        sparse_mat['SHAPE LABEL'] = gen_sparse_shape_labels(parsed_cases)

        dataframe = populate_dataframe(parsed_cases, attribute_features, attribute_names)
        #display(dataframe) #ENTIRE SPARSE MATRIX
        #----------------------------------------------------------------------------------------------------------------

        #-------------------------------------COMPARE VALUES & PREDICT ANSWERS-------------------------------------------
        #every row is compared to its counterpart by shape label (Az vs Bz, Ay vs By, Cz vs 1z, etc)
        #each result is a list of 3 entries - 'ROW UNCHANGED' when the two rows match,
        #otherwise a list of per-column changes
        #e.g. ['changed','unchanged','added','changed','removed','unchanged',etc....]
        AB = get_relationships(dataframe,'AB')
        #relationships between case C and each answer 1 through 6
        C_ans_relationships = [get_relationships(dataframe, 'C' + option) for option in '123456']

        #an answer is predicted when its C->answer changes equal the A->B changes
        mappings = [AB == rel for rel in C_ans_relationships]
        if True in mappings:
            prediction = mappings.index(True)+1 #first matching answer, 1-based
            print('PREDICTED ANSWER: ' + str(prediction))
        else:
            prediction = 'no answer found'
            print('No answer found')
        predictions.append(prediction)
        print(divider)
        print('\n')

    #label each prediction with the number of the file it was made for
    prob_nums = [f'PROBLEM #{int(file)}' for file in selected_files]
    predict_dict = dict(zip(prob_nums, predictions))
    print(predict_dict)
    print('\n')
    
    

The algorithm correctly selected the next image in the sequence 10/20 times


PROBLEM #01
ANSWER: 5
PREDICTED ANSWER: 5
--------------------------------------------------------------------------------------------------


PROBLEM #02
ANSWER: 6
PREDICTED ANSWER: 6
--------------------------------------------------------------------------------------------------


PROBLEM #03
ANSWER: 4
PREDICTED ANSWER: 4
--------------------------------------------------------------------------------------------------


PROBLEM #04
ANSWER: 3
PREDICTED ANSWER: 3
--------------------------------------------------------------------------------------------------


PROBLEM #05
ANSWER: 2
PREDICTED ANSWER: 2
--------------------------------------------------------------------------------------------------


PROBLEM #06
ANSWER: 5
PREDICTED ANSWER: 5
--------------------------------------------------------------------------------------------------


PROBLEM #07
ANSWER: 2
PREDICTED ANSWER: 2
----------------------