#### Features:
1. Fraction of clauses that are unit clauses. <!-- exactly one literal -->
2. Fraction of clauses that are Horn clauses. <!-- at most one non-negated literal -->
3. Fraction of clauses that are ground Clauses. <!-- ? -->
4. Fraction of clauses that are demodulators. <!-- equality used as rule to rewrite newly inferred clause -->
5. Fraction of clauses that are rewrite rules (oriented demodulators). <!-- ? -->
6. Fraction of clauses that are purely positive.
7. Fraction of clauses that are purely negative.
8. Fraction of clauses that are mixed positive and negative.
9. Maximum clause length. <!-- number of literals -->
10. Average clause length.
11. Maximum clause depth. <!-- see below -->
12. Average clause depth.
13. Maximum clause weight. <!-- defined by prover; probably its symbol count, excluding commas, parentheses, negation symbols, and disjunction symbols -->
14. Average clause weight.

<!-- 
Depth of Term, Atom, Literal, Clause
* depth of variable, constant, or propositional atom: 0;
* depth of term or atom with arguments: one more than the maximum argument depth;
* depth of literal: depth of its atom (negation signs don't count);
* depth of clause: maximum of depths of literals;
* For example, p(x) | -p(f(x)) has depth 2.
-->

In [3]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv("data/all-data-raw.csv", header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.73684,0.00188,0.73872,0.073308,0.18797,-100.0,-100.0,-100.0,-100.0,-100.0
1,0.83307,0.99682,0.83307,0.76948,0,0.77107,0.068363,0.16057,6,1.2734,...,0.74248,0.00188,0.74436,0.067669,0.18797,0.08,0.08,0.2,0.08,0.08
2,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.7406,0.00188,0.74248,0.069549,0.18797,-100.0,-100.0,-100.0,-100.0,-100.0
3,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.72932,0.00188,0.7312,0.080827,0.18797,-100.0,-100.0,-100.0,-100.0,-100.0
4,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.7312,0.00188,0.73308,0.078947,0.18797,-100.0,-100.0,-100.0,-100.0,-100.0


In [4]:
def best_heuristic(row, time_cols):
    n_heuristics = 5
    h_times = row[time_cols].reset_index(drop=True)
    h_times.replace({-100.0 : np.nan}, inplace=True)
    idx, min_time = h_times.idxmin(), h_times.min()
    if np.isnan(min_time):
       return 0
    else:
       return idx+1

time_cols = list(range(53, 58))
df['heuristic'] = df.apply(lambda r : best_heuristic(r, time_cols), axis=1)
df.drop(time_cols, axis=1, inplace=True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,heuristic
0,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.73684,0.00188,0.73872,0.073308,0.18797,0
1,0.83307,0.99682,0.83307,0.76948,0,0.77107,0.068363,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.74248,0.00188,0.74436,0.067669,0.18797,1
2,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.7406,0.00188,0.74248,0.069549,0.18797,0
3,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.72932,0.00188,0.7312,0.080827,0.18797,0
4,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.7312,0.00188,0.73308,0.078947,0.18797,0


In [5]:
df['heuristic'].value_counts().sort_index()
print(df.shape)

(6118, 54)


In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline

X, y = df.drop(['heuristic'], axis=1).astype('float64'), df['heuristic']

random_seed = 42
X, X_val, y, y_val = train_test_split(
    X, y, stratify=y, test_size=0.1, random_state=random_seed)

def base_classifiers(X, y, X_val, y_val):
    classifier_params = {
        KNeighborsClassifier() : {
            'n_neighbors': list(range(1,16)),
            'weights' : ['distance', 'uniform']
        },
        DecisionTreeClassifier() : {
            'criterion': ['gini', 'entropy']
        }
    }

    cv_grid_results = {}
    for classifier, params in classifier_params.items():

        kfold = StratifiedKFold(10, shuffle=True, random_state=42)
        cv_grid = GridSearchCV(classifier, params, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1)
        cv_grid.fit(X, y)

        class_ = type(classifier).__name__
        print(class_, cv_grid.best_params_)

        val_col_space = 20
        print("10-fold CV mean score: {:.4f}".format(cv_grid.best_score_))
        print()

        cv_grid_results[class_] = pd.DataFrame(cv_grid.cv_results_)
        best = cv_grid.best_estimator_
        y_pred = best.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred)
        print("final validation score:", accuracy)
base_classifiers(X,y, X_val, y_val)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   22.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KNeighborsClassifier {'n_neighbors': 14, 'weights': 'distance'}
10-fold CV mean score: 0.5792

final validation score: 0.5947712418300654
Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.7s finished


DecisionTreeClassifier {'criterion': 'entropy'}
10-fold CV mean score: 0.5536

final validation score: 0.5800653594771242


In [22]:
for classifier, result in cv_grid_results.items():
    print(classifier)
    brief_results = pd.DataFrame(list(result['params']))
    brief_results.rename(lambda c : c.lstrip('model__'), axis=1, inplace=True)
    brief_results['mean test score'] = result['mean_test_score']
    print(brief_results)
    print()

KNeighborsClassifier
    n_neighbors   weights  mean test score
0             1  distance         0.564890
1             1   uniform         0.564890
2             2  distance         0.566362
3             2   uniform         0.555247
4             3  distance         0.577476
5             3   uniform         0.556718
6             4  distance         0.581072
7             4   uniform         0.542988
8             5  distance         0.584832
9             5   uniform         0.548055
10            6  distance         0.585976
11            6   uniform         0.543478
12            7  distance         0.585485
13            7   uniform         0.541026
14            8  distance         0.585649
15            8   uniform         0.533508
16            9  distance         0.586139
17            9   uniform         0.537104
18           10  distance         0.583524
19           10   uniform         0.540046
20           11  distance         0.583851
21           11   uniform        

In [24]:
from IPython.utils.text import columnize

abs_correlations = abs(X.corrwith(y))
ord_corr = abs_correlations.sort_values(ascending=False)
corr_pretty_list = list(map(lambda x : "{:2} {:.4f}".format(x[0], x[1]), zip(ord_corr.index, ord_corr)))
print(columnize(corr_pretty_list))

10 0.2007  39 0.0852  48 0.0639  13 0.0494  43 0.0376  41 0.0217  15 0.0056
18 0.1952  30 0.0835  40 0.0584  21 0.0485  37 0.0371   5 0.0204  31 0.0043
11 0.1904  29 0.0825  28 0.0584  51 0.0472   9 0.0335   7 0.0170   6 0.0029
19 0.1801  20 0.0739  45 0.0577  17 0.0461  24 0.0332  47 0.0163   4 nan   
12 0.1507   1 0.0689   3 0.0569  50 0.0447  44 0.0326  46 0.0118  34 nan   
27 0.1052  36 0.0659  22 0.0549   8 0.0442  52 0.0301  23 0.0105
25 0.0899  38 0.0641  49 0.0539  14 0.0388  33 0.0300  16 0.0104
26 0.0889  42 0.0639   0 0.0530  32 0.0388  35 0.0259   2 0.0101



Genetic

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pyeasyga import pyeasyga
from itertools import compress
import random


def fitness_corr(individual, idx_corr):
    fitness = 0
    n = individual.count(1)
    if n > 0:
        fitness = sum(corr for idx, corr in compress(idx_corr, individual))
        fitness /= n
    return fitness

def fitness_knn(individual, X_y):
    X, y = X_y
    fitness = 0
    selected_attrs = list(compress(X.columns, individual))
    if selected_attrs:
        X = X[selected_attrs]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, test_size=0.3, random_state=random_seed)

        knn = KNeighborsClassifier()
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        fitness = accuracy_score(y_test, y_pred)
    return fitness

scaler = StandardScaler()
X.update(scaler.fit_transform(X))
X_val.update(scaler.transform(X_val))

ga = pyeasyga.GeneticAlgorithm((X, y), generations=50)

ga.fitness_function = fitness_knn
ga.run()

score, individual = ga.best_individual()
selected_attrs = list(compress(X.columns, individual))
print(score, len(selected_attrs), selected_attrs)
print()

X = X[selected_attrs]
X_val = X_val[selected_attrs]

base_classifiers(X, y,X_val, y_val)

0.3710653753026634 2 [0, 1]

Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    1.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KNeighborsClassifier {'n_neighbors': 15, 'weights': 'uniform'}
10-fold CV mean score: 0.4177

final validation score: 0.3137254901960784
Fitting 10 folds for each of 2 candidates, totalling 20 fits
DecisionTreeClassifier {'criterion': 'gini'}
10-fold CV mean score: 0.3663

final validation score: 0.24509803921568626


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.3s finished


Function definition based on reference: https://pyswarms.readthedocs.io/en/latest/examples/feature_subset_selection.html#using-binary-pso

In [None]:
def f_per_particle(m, alpha):
    if np.count_nonzero(m) == 0:
        X_subset = X
    else:
        X_subset = X.loc[:, m==1]
    P = abs(X_subset.corrwith(y)).sum()
    return (alpha * (1.0 - P))

In [None]:
def f(x, alpha = 0.8):
    n_particles = x.shape[0]
    j = [f_per_particle(x[i], alpha) for i in range(n_particles)]
    return np.array(j)

In [None]:
import pyswarms as ps
from pyswarms.discrete import BinaryPSO

In [None]:
options = {'c1': 0.5, 'c2': 0.5, 'w':0.9, 'k': 30, 'p':2}
dimensions = 52 # dimensions should be the number of features

In [None]:
optimizer = ps.discrete.BinaryPSO(n_particles=30, dimensions=dimensions, options=options)

In [None]:
cost, pos = optimizer.optimize(f, print_step=100, iters=1000, verbose=2)

In [None]:
print(pos)

In [None]:
pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('classifier', DecisionTreeClassifier())
])

params = {
    'classifier__criterion': ['gini', 'entropy']
}

kfold = StratifiedKFold(10, shuffle=True, random_state=42)
dt_grid = GridSearchCV(pipe, params, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1)
dt_grid.fit(X[pos], y)

print(dt_grid.best_params_)

val_col_space = 20
print("{:{}}: {:.4f}".format("10-fold CV", val_col_space, dt_grid.best_score_))

In [10]:
pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])

params = {
    'classifier__n_neighbors': range(1,15),
    'classifier__weights': ['uniform', 'distance']
}

kfold = StratifiedKFold(10, shuffle=True, random_state=42)
knn_grid = GridSearchCV(pipe, params, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1)
knn_grid.fit(X[pos], y)

print(knn_grid.best_params_)

val_col_space = 20
print("{:{}}: {:.4f}".format("10-fold CV", val_col_space, knn_grid.best_score_))

NameError: name 'pos' is not defined

In [None]:
from sklearn.decomposition import PCA

In [None]:
pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('pca', PCA(0.96)),
    ('classifier', KNeighborsClassifier())
])

params = {
    'classifier__n_neighbors': range(1,15),
    'classifier__weights': ['uniform', 'distance']
}

kfold = StratifiedKFold(10, shuffle=True, random_state=42)
knn_grid = GridSearchCV(pipe, params, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1)
knn_grid.fit(X[pos], y)

print(knn_grid.best_params_)

val_col_space = 20
print("{:{}}: {:.4f}".format("10-fold CV", val_col_space, knn_grid.best_score_))

In [None]:
pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('pca', PCA(0.96)),
    ('classifier', DecisionTreeClassifier())
])

params = {
    'classifier__criterion': ['gini', 'entropy']
}

kfold = StratifiedKFold(10, shuffle=True, random_state=42)
dt_grid = GridSearchCV(pipe, params, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1)
dt_grid.fit(X[pos], y)

print(dt_grid.best_params_)

val_col_space = 20
print("{:{}}: {:.4f}".format("10-fold CV", val_col_space, dt_grid.best_score_))

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('lda', LinearDiscriminantAnalysis()),
    ('classifier', KNeighborsClassifier())
])

params = {
    'classifier__n_neighbors': range(1,15),
    'classifier__weights': ['uniform', 'distance']
}

kfold = StratifiedKFold(10, shuffle=True, random_state=42)
knn_grid = GridSearchCV(pipe, params, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1)
knn_grid.fit(X[pos], y)

print(knn_grid.best_params_)

val_col_space = 20
print("{:{}}: {:.4f}".format("10-fold CV", val_col_space, knn_grid.best_score_))

In [None]:
pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('lda', LinearDiscriminantAnalysis()),
    ('classifier', DecisionTreeClassifier())
])

params = {
    'classifier__criterion': ['gini', 'entropy']
}

kfold = StratifiedKFold(10, shuffle=True, random_state=42)
dt_grid = GridSearchCV(pipe, params, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1)
dt_grid.fit(X[pos], y)

print(dt_grid.best_params_)

val_col_space = 20
print("{:{}}: {:.4f}".format("10-fold CV", val_col_space, dt_grid.best_score_))