## Execution Instructions

The user should be able to run the cells in chronological order.

Specify the input file name in the cell below.

Other than that, as long as the grader has all of our libraries installed in their environment, then everything is all set to go

In [185]:
######################################################
#################SPECIFY FILENAME HERE################
fileName = 'trainDataSet.csv'
######################################################
######################################################

In [186]:
import csv
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn import tree
from scipy import stats

In [187]:
# converts the raw csv data to board matrix and label pairs
def convertRawData(fileName):
    raw = [] 
    with open(fileName) as csvFile:
        rows = csv.reader(csvFile)
        for row in rows:
            raw.append(row)
    brds = []
    lbls = []
    for row in raw[1:]:
        brd = np.flip(np.rot90(np.array(row[:-1], dtype = int).reshape(7, 6)), 0)
        brds.append(brd)
        lbls.append((row[-1:])[0])
    lbls = np.asarray(lbls).astype(int)
    return brds, lbls

# prints board the way it's played
def printBoard(brd):
    print(np.flip(brd, 0))

# creates feature to determine which player has the bottom left corner
def bottomLeft(brds):
    bl = []
    for brd in brds:
        bl.append(brd[0][0])
    return np.asarray(bl).astype(int)

# creates feature to determine which player has the bottom right corner
def bottomRight(brds):
    br = []
    for brd in brds:
        br.append(brd[0][6])
    return np.asarray(br).astype(int)

# creates feature to determine which player has control of the center columns
def centerColumnsMajority(brds):
    ccm = []
    for brd in brds:
        p1count = 0
        p2count = 0
        for row in brd:
            for i in row[2:-2]:
                if i == 1:
                    p1count += 1
                elif i == 2:
                    p2count += 1
        if p1count > p2count:
            ccm.append(1)
        elif p2count > p1count:
            ccm.append(1)
        else:
            ccm.append(0)
    return np.asarray(ccm).astype(int)

# creates feature with a continuous function to evaluate board based on possible connect fours by board position:
# positive for player 1 and negative for player 2
def positionalPossibility(brds):
    scoring = [[3, 4,  5,  7,  5, 4, 3], 
               [4, 6,  8, 10,  8, 6, 4],
               [5, 8, 11, 13, 11, 8, 5], 
               [5, 8, 11, 13, 11, 8, 5],
               [4, 6,  8, 10,  8, 6, 4],
               [3, 4,  5,  7,  5, 4, 3]]
    scores = []
    for brd in brds:
        p1 = 0
        p2 = 0
        for row in range(len(brd)):
            for col in range(len(brd[0])):
                if brd[row][col] == 1:
                    p1 += scoring[row][col]
                elif brd[row][col] == 2:
                    p2 += scoring[row][col]
        if p1 > p2:
            scores.append(1)
        elif p2 > p1:
            scores.append(2)
        else:
            scores.append(0)
    return np.asarray(scores).astype(int)

# creates a feature that determines which player has pieces spread across more columns
def colSpread(brds):
    spread = []
    for brd in brds:
        p1 = 0
        p2 = 0
        for col in range(len(brd[0])):
            p1HasCol = False
            p2HasCol = False
            for row in range(len(brd)):
                if brd[row][col] == 1 and not p1HasCol:
                    p1 += 1
                    p1HasCol = True
                elif brd[row][col] == 2 and not p2HasCol:
                    p2 += 1
                    p2HasCol = True
        if p1 > p2:
            spread.append(1)
        elif p2 > p1:
            spread.append(2)
        else:
            spread.append(0)
    return np.asarray(spread).astype(int)

# creates a feature that determines which player has pieces spread across more rows
def rowSpread(brds):
    spread = []
    for brd in brds:
        p1 = 0
        p2 = 0
        for row in range(len(brd)):
            p1HasRow = False
            p2HasRow = False
            for col in range(len(brd[0])):
                if brd[row][col] == 1 and not p1HasRow:
                    p1 += 1
                    p1HasRow = True
                elif brd[row][col] == 2 and not p2HasRow:
                    p2 += 1
                    p2HasRow = True
        if p1 > p2:
            spread.append(1)
        elif p2 > p1:
            spread.append(2)
        else:
            spread.append(0)
    return np.asarray(spread).astype(int)

# trains deicion tree 500 times and returns average accuracy utilizing bootsrap method of cross validation
def decisionTreeAccuracy(feats, labels):
    feats = pd.DataFrame(data = feats).T
    epochs = 500
    accuracy = 0
    for i in range(epochs):
        featTrain, featTest, labelsTrain, labelsTest = train_test_split(feats, labels, test_size = 0.3)
        clf = DecisionTreeClassifier()
        clf.fit(featTrain, labelsTrain)
        labelsPred = clf.predict(featTest)
        accuracy += 1 / epochs * (accuracy_score(labelsTest, labelsPred) * 100)
    print(accuracy)
    return accuracy

# calculates the importance of each feature given by a random forest feature importance metric
def randomForestImportance(feats, labels):
    feats = pd.DataFrame(data = feats).T
    featTrain, featTest, labelsTrain, labelsTest = train_test_split(feats, labels, test_size = 0.3)
    clf = RandomForestClassifier()
    clf.fit(featTrain, labelsTrain)
    labelsPred = clf.predict(featTest)
    accuracy = accuracy_score(labelsTest, labelsPred) * 100
    return clf.feature_importances_

# creates matrix of ttest results for every feature prediction against one another
def featureTTests(feats, labels):
    predictions = []
    ttestResults = [[0] * len(feats)] * len(feats)
    feats = pd.DataFrame(data = feats).T
    featTrain, featTest, labelsTrain, labelsTest = train_test_split(feats, labels, test_size = 0.3)
    for feat in feats:
        clf = RandomForestClassifier()
        clf.fit(featTrain, labelsTrain)
        labelsPred = clf.predict(featTest)
        predictions.append(labelsPred)
    for i in range(len(predictions)):
        for j in range(len(predictions)):
            t, p = stats.ttest_ind(predictions[i], predictions[j])
            ttestResults[i][j] = ("%.5f" % p)
    return ttestResults

In [188]:
boards, labels = convertRawData(fileName)
bottomLeftFeat = bottomLeft(boards)
bottomRightFeat = bottomRight(boards)
centerColumnsMajorityFeat = centerColumnsMajority(boards)
positionalPossibilityFeat = positionalPossibility(boards)
rowSpreadFeat = rowSpread(boards)
colSpreadFeat = colSpread(boards)

In [189]:
featureTTests([bottomLeftFeat, bottomRightFeat, centerColumnsMajorityFeat, positionalPossibilityFeat, rowSpreadFeat, colSpreadFeat], labels)

[['0.24267', '0.37723', '0.32779', '0.48991', '0.43130', '1.00000'],
 ['0.24267', '0.37723', '0.32779', '0.48991', '0.43130', '1.00000'],
 ['0.24267', '0.37723', '0.32779', '0.48991', '0.43130', '1.00000'],
 ['0.24267', '0.37723', '0.32779', '0.48991', '0.43130', '1.00000'],
 ['0.24267', '0.37723', '0.32779', '0.48991', '0.43130', '1.00000'],
 ['0.24267', '0.37723', '0.32779', '0.48991', '0.43130', '1.00000']]

In [191]:
print("accuracy with all the features:")
decisionTreeAccuracy([bottomLeftFeat, bottomRightFeat, centerColumnsMajorityFeat, positionalPossibilityFeat, rowSpreadFeat, colSpreadFeat], labels)
print("accuracy with all the features (excpect center column majority):")
decisionTreeAccuracy([bottomLeftFeat, bottomRightFeat, positionalPossibilityFeat, rowSpreadFeat, colSpreadFeat], labels)
print("accuracy of feature bottom left:")
decisionTreeAccuracy([bottomLeftFeat], labels)
print("accuracy of feature bottom right:")
decisionTreeAccuracy([bottomRightFeat], labels)
print("accuracy of feature center columns majority:")
decisionTreeAccuracy([centerColumnsMajorityFeat], labels)
print("accuracy of feature positional possibility:")
decisionTreeAccuracy([positionalPossibilityFeat], labels)
print("accuracy of feature row spread:")
decisionTreeAccuracy([rowSpreadFeat], labels)
print("accuracy of feature col spread:")
decisionTreeAccuracy([colSpreadFeat], labels)

accuracy with all the features:
76.72999999999998
accuracy with all the features (excpect center column majority):
76.72666666666672
accuracy of feature bottom left:
73.81199999999997
accuracy of feature bottom right:
73.63133333333334
accuracy of feature center columns majority:
73.65866666666663
accuracy of feature positional possibility:
75.26933333333331
accuracy of feature row spread:
73.63466666666656
accuracy of feature col spread:
73.77999999999999


73.77999999999999

In [193]:
randomForestImportance([bottomLeftFeat, bottomRightFeat, centerColumnsMajorityFeat, positionalPossibilityFeat, rowSpreadFeat, colSpreadFeat], labels)

array([0.15452018, 0.12480869, 0.07119044, 0.36759828, 0.13398161,
       0.14790081])

In [None]:
# we were trying another feature but it didn't end up working out...
def ridgePositionalPossibility(brds):
    scores = []
    for brd in brds:
        score = 0
        for row in range(len(brd)):
            for col in range(len(row[0])):
                if anyAdjOpen(brd, row, col) and brd[row][col] != 0:
                    score += evalPosition(brd, row, col)
        scores += score
    return scores

def anyAdjOpen(brd, row, col):
    bl = False
    b = False
    br = False
    r = False
    tr = False
    t = False
    tl = False
    l = False;
    if row == 0: 
        br = True
        b = True
        bl = True
    if row == len(brd) - 1:
        t = True
        tr = True
        tl = True
    if col == 0:
        l = True
        bl = True
        tl = True
    if col == len(brd[0]) - 1:
        r = True
        tr = True
        br = True
    if not bl:
        if brd[row-1][col-1] == 0: return True
    if not b:
        if brd[row-1][col] == 0: return True
    if not br:
        if brd[row-1][col+1] == 0: return True
    if not r:
        if brd[row][col+1] == 0: return True
    if not tr:
        if brd[row+1][col+1] == 0: return True
    if not t:
        if brd[row+1][col] == 0: return True
    if not tl:
        if brd[row+1][col-1] == 0: return True
    if not l:
        if brd[row][col-1] == 0: return True
    return False

def evalPosition(brd, row, col):
    score = 0
    score += evalPartition(brd, brd[row][col], getVert(brd, brd[row][col], row, col))
    score += evalPartition(brd, brd[row][col], getHori(brd, brd[row][col], row, col))
    score += evalPartition(brd, brd[row][col], getDiagUp(brd, brd[row][col], row, col))
    score += evalPartition(brd, brd[row][col], getDiagDown(brd, brd[row][col], row, col))
    return score

def evalPartition(brd, turn, partition):
    score = 0
    if len(partition) < 4: return score
    for i in range(len(partition) - 4):
        count = 0
        for j in range(4):
            count += partition[i+j]
        if turn == 2: count /= 2
        else: score += math.pow(count, 2)
    if turn == 2: return -score
    else: return score
    
def getVert(brd, turn, row, col):
    count = 0
    top = [None] * 3
    bot = [None] * 3
    i = row - 1
    while i >= 0 and row - i + 1 < 4:
        if brd[i][col] == turn: 
            bot[row-i-1] = brd[i][col]
            count += 1
            i -= 1
        else: break
    i = row + 1
    while i < len(brd) and i - row - 1 < 4:
        if brd[i][col] == 0 or brd[i][col] == turn:
            top[i-row-1] = brd[i][col]
            count += 1
            i += 1
        else: break
    vert = [0] * (count + 1)
    count = 0
    for i in reversed(0, len(bot)):
        if bot[i] == None: continue
        else:
            vert[count] = bot[i]
            count += 1
    vert[count] = turn
    count += 1
    for i in range(len(top)):
        if top != None:
            vert[count] = top[i]
            count += 1
        else: break
    return vert