In [22]:
import xml.etree.ElementTree as et
from pathlib import Path as pt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import normalize
from sklearn.neighbors import KNeighborsClassifier
import sys, os
import numpy as np
from statistics import mean

In [2]:
textXmlCollection = './textXml'
nontextXmlCollection = './nontextXml'

In [3]:
def height(wf):
    a = wf[0].attrib
    return int(a['Bottom']) - int(a['Top'])

def width(wf):
    a = wf[0].attrib
    return int(a['Right']) - int(a['Left'])

def area(wf):
    return height(wf) * width(wf)

def bl(wf):
    a = wf.attrib
    return int(a['BlackCount'])

def wtHl(wf):
    a = wf.attrib
    return int(a['WhiteHolesCount'])

def horStr(wf):
    a = wf.attrib
    return int(a['HorzStrokesCount'])

def vertStr(wf):
    a = wf.attrib
    return int(a['VertStrokesCount'])

def maxHorStr(wf):
    a = wf.attrib
    return int(a['MaxHorzStrokeLength'])

In [4]:
def feature1(wf):
    return (area(wf) - bl(wf)) / ((horStr(wf) + height(wf)) * height(wf))

def feature2(wf):
    return (horStr(wf) + vertStr(wf)) / max(width(wf), height(wf))

def feature3(wf):
    return bl(wf) / (horStr(wf) + vertStr(wf))

def feature4(wf):
    return bl(wf) / area(wf)

def feature5(wf):
    return wtHl(wf) / area(wf)

def feature6(wf):
    return horStr(wf) / area(wf)

def feature7(wf):
    return vertStr(wf) / area(wf)

def feature8(wf):
    return maxHorStr(wf) / width(wf)

In [5]:
def features(wf):
    return [feature1(wf),
            feature2(wf),
            feature3(wf),
            feature4(wf),
            feature5(wf),
            feature6(wf),
            feature7(wf),
            feature8(wf)]

In [6]:
def answersAndFeatures(xml, isText):
    tree = et.parse(xml)
    
    x = []
    y = []
    for wf in tree.iter("WordFragment"):
        x.append(features(wf))
        y.append(int(isText))
        
    return x, y

In [7]:
def answersAndFeaturesFromCollection(collection, isText):
    X = []
    Y = []
    for xml in pt(collection).iterdir():
        
        if not str(xml).endswith('.xml'):
            continue
        
        x, y = answersAndFeatures(xml, isText)
        X += x
        Y += y
    
    return X, Y

In [8]:
textX, textY = answersAndFeaturesFromCollection(textXmlCollection, True)
nontextX, nontextY = answersAndFeaturesFromCollection(nontextXmlCollection, False)

X = textX + nontextX
Y = textY + nontextY

In [9]:
def cvs(model):
    print(mean(cross_val_score(model, X, Y, cv=10, n_jobs=-1, scoring='accuracy')))

In [10]:
rf = RandomForestClassifier()
cvs(rf)

0.879993194473


In [23]:
gb = GradientBoostingClassifier()
cvs(gb)

0.860857540886


In [11]:
lsvc = LinearSVC()
cvs(lsvc)

0.795717600952


In [12]:
knc = KNeighborsClassifier()
cvs(knc)

0.848279010833


In [13]:
svc = SVC()
cvs(svc)

0.844629437213


In [26]:
def tuneRf():
    r = np.random
    paramGrid = [{'n_estimators': [15, 20, 25, 30, 35, 40, 45],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'criterion': ['gini', 'entropy'],
                  'random_state': [None, 42, r]}]
    
    clf = GridSearchCV(RandomForestClassifier(n_jobs=-1), paramGrid, cv=10, scoring='accuracy')
    clf.fit(X, Y)
    print(clf.best_params_)
    print(max(clf.cv_results_['mean_test_score']))

In [27]:
tuneRf()

{'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 45, 'random_state': None}
0.888209444535
