#### Features:
1. Fraction of clauses that are unit clauses. <!-- exactly one literal -->
2. Fraction of clauses that are Horn clauses. <!-- at most one non-negated literal -->
3. Fraction of clauses that are ground clauses. <!-- contain no variables -->
4. Fraction of clauses that are demodulators. <!-- equality used as rule to rewrite newly inferred clause -->
5. Fraction of clauses that are rewrite rules (oriented demodulators). <!-- ? -->
6. Fraction of clauses that are purely positive.
7. Fraction of clauses that are purely negative.
8. Fraction of clauses that are mixed positive and negative.
9. Maximum clause length. <!-- number of literals -->
10. Average clause length.
11. Maximum clause depth. <!-- see below -->
12. Average clause depth.
13. Maximum clause weight. <!-- defined by prover; probably its symbol count, excluding commas, parentheses, negation symbols, and disjunction symbols -->
14. Average clause weight.

<!-- 
Depth of Term, Atom, Literal, Clause
* depth of variable, constant, or propositional atom: 0;
* depth of term or atom with arguments: one more than the maximum argument depth;
* depth of literal: depth of its atom (negation signs don't count);
* depth of clause: maximum of depths of literals;
* For example, p(x) | -p(f(x)) has depth 2.
-->

In [2]:
import pandas as pd
import numpy as np

# The file is read with header=None, so columns are labelled by position (0, 1, 2, ...).
df = pd.read_csv("data/all-data-raw.csv", header=None)
# Preview the first rows.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.73684,0.00188,0.73872,0.073308,0.18797,-100.0,-100.0,-100.0,-100.0,-100.0
1,0.83307,0.99682,0.83307,0.76948,0,0.77107,0.068363,0.16057,6,1.2734,...,0.74248,0.00188,0.74436,0.067669,0.18797,0.08,0.08,0.2,0.08,0.08
2,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.7406,0.00188,0.74248,0.069549,0.18797,-100.0,-100.0,-100.0,-100.0,-100.0
3,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.72932,0.00188,0.7312,0.080827,0.18797,-100.0,-100.0,-100.0,-100.0,-100.0
4,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.7312,0.00188,0.73308,0.078947,0.18797,-100.0,-100.0,-100.0,-100.0,-100.0


In [3]:
def best_heuristic(row, time_cols):
    """Return the 1-based position of the smallest timing in one row.

    Parameters
    ----------
    row : pandas.Series
        One row of the data frame.
    time_cols : list
        Labels of the entries of ``row`` to compare. The sentinel value
        ``-100.0`` in these entries is treated as a missing timing.

    Returns
    -------
    int
        ``0`` when every timing is missing; otherwise ``k + 1`` where ``k`` is
        the position within ``time_cols`` of the smallest remaining timing
        (the first one wins on ties).
    """
    # Positional index 0..n-1 so the winner maps directly onto a label 1..n.
    h_times = row[time_cols].reset_index(drop=True).replace({-100.0: np.nan})
    # Check for "nothing to compare" before calling idxmin(): idxmin() on an
    # all-NaN Series is deprecated in pandas and is slated to raise.
    if h_times.isna().all():
        return 0
    return int(h_times.idxmin()) + 1

# Column labels 53..57 are the entries handed to best_heuristic as timings.
time_cols = [col for col in range(53, 58)]
# Label every row, then remove the timing columns so that only the
# remaining columns and the new 'heuristic' label are left.
df['heuristic'] = df.apply(best_heuristic, axis=1, args=(time_cols,))
df.drop(columns=time_cols, inplace=True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,heuristic
0,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.73684,0.00188,0.73872,0.073308,0.18797,0
1,0.83307,0.99682,0.83307,0.76948,0,0.77107,0.068363,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.74248,0.00188,0.74436,0.067669,0.18797,1
2,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.7406,0.00188,0.74248,0.069549,0.18797,0
3,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.72932,0.00188,0.7312,0.080827,0.18797,0
4,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.7312,0.00188,0.73308,0.078947,0.18797,0


In [4]:
# Rows per label, in label order. Per best_heuristic, 0 means every timing
# was the -100.0 sentinel and 1..5 is the position of the smallest timing.
df['heuristic'].value_counts().sort_index()

0    2554
1    1089
2     486
3     748
4     617
5     624
Name: heuristic, dtype: int64

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline

# Feature matrix (every column except the label, cast to float) and target labels.
X, y = df.drop(['heuristic'], axis=1).astype('float64'), df['heuristic']

# Estimator -> hyper-parameter grid. Grid keys carry the 'model__' prefix
# because each estimator is installed as the pipeline step named 'model'.
classifier_params = {
    KNeighborsClassifier() : {
        'model__n_neighbors': [1,3,12,16],
        'model__weights' : ['distance']
    },
    # Seeded: tree fitting permutes features at random when breaking ties,
    # so an unseeded tree gives slightly different scores on every run.
    DecisionTreeClassifier(random_state=42) : {
        'model__criterion': ['gini', 'entropy']
    }
}

# The same seeded folds are used for every estimator so scores are comparable.
kfold = StratifiedKFold(10, shuffle=True, random_state=42)

# Per-candidate CV tables, keyed by estimator class name.
cv_grid_results = {}
for classifier, params in classifier_params.items():
    # Scaling sits inside the pipeline so it is re-fitted on each training fold.
    pipe = Pipeline([
        ('scaling', StandardScaler()),
        ('model', classifier)
    ])

    cv_grid = GridSearchCV(pipe, params, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1)
    cv_grid.fit(X, y)

    class_ = type(classifier).__name__
    print(class_, cv_grid.best_params_)
    print("10-fold CV mean score: {:.4f}".format(cv_grid.best_score_))
    print()

    cv_grid_results[class_] = pd.DataFrame(cv_grid.cv_results_)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   23.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KNeighborsClassifier {'model__n_neighbors': 12, 'model__weights': 'distance'}
10-fold CV mean score: 0.6036

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    3.4s finished


DecisionTreeClassifier {'model__criterion': 'gini'}
10-fold CV mean score: 0.5629





In [8]:
# Pipeline step prefix carried by every grid parameter name.
prefix = 'model__'

# Print, per classifier, each tried parameter combination with its mean CV score.
for classifier, result in cv_grid_results.items():
    print(classifier)
    brief_results = pd.DataFrame(list(result['params']))
    # Remove the prefix by slicing. str.lstrip('model__') strips any leading
    # run of the characters m, o, d, e, l, _ and would turn a name such as
    # 'model__max_depth' into 'ax_depth'.
    brief_results.rename(
        lambda c : c[len(prefix):] if c.startswith(prefix) else c,
        axis=1,
        inplace=True,
    )
    brief_results['mean test score'] = result['mean_test_score']
    print(brief_results)
    print()

KNeighborsClassifier
   n_neighbors   weights  mean test score
0            1  distance         0.588918
1            3  distance         0.599869
2           12  distance         0.603629
3           16  distance         0.600196

DecisionTreeClassifier
  criterion  mean test score
0      gini         0.562929
1   entropy         0.552795



In [19]:
from IPython.utils.text import columnize

# Absolute correlation of every feature column with the label.
abs_correlations = X.corrwith(y).abs()
# Strongest first; columns whose correlation is NaN sort to the end.
ord_corr = abs_correlations.sort_values(ascending=False)
# One "<column> <value>" entry per feature, laid out in columns for compact display.
corr_pretty_list = ["{:2} {:.4f}".format(col, corr) for col, corr in ord_corr.items()]
print(columnize(corr_pretty_list))

10 0.2026  39 0.0860  42 0.0625   0 0.0513  32 0.0407   5 0.0200   6 0.0064
18 0.1917  30 0.0846  45 0.0587  51 0.0490  43 0.0393  41 0.0198  15 0.0028
11 0.1884  29 0.0828  28 0.0578  21 0.0478  44 0.0337  47 0.0186  31 0.0025
19 0.1771  36 0.0701  40 0.0573  50 0.0467  24 0.0323  46 0.0175   4 nan   
12 0.1553  20 0.0693   3 0.0568  17 0.0455  52 0.0315   7 0.0153  34 nan   
27 0.1043   1 0.0673  49 0.0554   8 0.0421   9 0.0305  16 0.0129
25 0.0915  48 0.0649  22 0.0537  37 0.0410  33 0.0303   2 0.0123
26 0.0872  38 0.0627  13 0.0528  14 0.0407  35 0.0289  23 0.0106



Function definition based on reference: https://pyswarms.readthedocs.io/en/latest/examples/feature_subset_selection.html#using-binary-pso

In [None]:
def f_per_particle(m, alpha):
    """Cost of one particle, i.e. one 0/1 feature mask.

    Parameters
    ----------
    m : array-like of 0/1
        Mask over the columns of the module-level ``X``; entries equal to 1
        mark the selected columns.
    alpha : float
        Weight of the correlation term; ``1 - alpha`` weights the subset-size
        term.

    Returns
    -------
    float
        ``alpha * (1 - P) + (1 - alpha) * (1 - n_selected / n_total)`` where
        ``P`` is the sum of absolute correlations between the selected columns
        and the module-level ``y``.

    NOTE(review): ``P`` is a sum of non-negative values, so neither term can
    increase when more columns are selected; the cost is lowest with every
    feature selected (or with the all-zero mask, which is mapped to all
    features). Confirm that this is the intended objective.
    """
    # An all-zero mask falls back to the full feature set.
    X_subset = X.loc[:, m==1] if np.count_nonzero(m) else X
    P = X_subset.corrwith(y).abs().sum()
    correlation_term = alpha * (1.0 - P)
    size_term = (1.0 - alpha) * (1 - (X_subset.shape[1] / len(X.columns)))
    return correlation_term + size_term

In [None]:
def f(x, alpha = 0.8):
    """Evaluate the per-particle cost for a whole swarm.

    Parameters
    ----------
    x : numpy.ndarray
        Swarm positions, one particle (feature mask) per row.
    alpha : float, default 0.8
        Forwarded unchanged to ``f_per_particle``.

    Returns
    -------
    numpy.ndarray
        One cost per particle, in row order.
    """
    costs = [f_per_particle(particle, alpha) for particle in x]
    return np.array(costs)

In [None]:
import pyswarms as ps
from pyswarms.discrete import BinaryPSO

In [None]:
# PSO hyper-parameters handed to BinaryPSO: c1/c2 are the cognitive/social
# coefficients, w the inertia weight, k the number of neighbours considered
# and p the Minkowski norm (2 = Euclidean) used to find them.
options = {'c1': 0.5, 'c2': 0.5, 'w':0.9, 'k': 30, 'p':2}
# One dimension per feature column. Taken from X rather than hard-coded:
# X has 53 columns (labels 0..52), and a mask of any other length cannot be
# used as a boolean column selector in f_per_particle.
dimensions = X.shape[1]

In [None]:
# Seed NumPy's global RNG so the swarm initialisation (and the search that
# follows) is repeatable across runs.
# NOTE(review): assumes pyswarms draws from numpy.random's global state;
# confirm for the installed version.
np.random.seed(42)
optimizer = ps.discrete.BinaryPSO(n_particles=30, dimensions=dimensions, options=options)

In [None]:
# Run the swarm on objective f for 1000 iterations; the call returns the best
# cost found and the position (feature mask) that achieved it.
# NOTE(review): `print_step` and a numeric `verbose` look like the older
# pyswarms API; newer releases appear to have dropped them. Confirm against
# the installed version.
cost, pos = optimizer.optimize(f, print_step=100, iters=1000, verbose=2)

In [None]:
# Show the best position returned by the optimizer.
print(pos)

In [None]:
# Decision tree on the PSO-selected features.
# `pos` is a 0/1 mask, so it has to become a boolean column selector: X[pos]
# would look up the column *labels* 0 and 1 over and over instead.
selected = np.asarray(pos) == 1
# An all-zero mask means "all features", mirroring f_per_particle.
X_selected = X.loc[:, selected] if selected.any() else X

pipe = Pipeline([
    ('scaling', StandardScaler()),
    # Seeded so repeated runs give the same tree and the same score.
    ('classifier', DecisionTreeClassifier(random_state=42))
])

params = {
    'classifier__criterion': ['gini', 'entropy']
}

kfold = StratifiedKFold(10, shuffle=True, random_state=42)
dt_grid = GridSearchCV(pipe, params, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1)
dt_grid.fit(X_selected, y)

print(dt_grid.best_params_)

# Width of the left-aligned label column in the score line.
val_col_space = 20
print("{:{}}: {:.4f}".format("10-fold CV", val_col_space, dt_grid.best_score_))

In [None]:
# k-nearest-neighbours on the PSO-selected features.
# `pos` is a 0/1 mask, so it has to become a boolean column selector: X[pos]
# would look up the column *labels* 0 and 1 over and over instead.
selected = np.asarray(pos) == 1
# An all-zero mask means "all features", mirroring f_per_particle.
X_selected = X.loc[:, selected] if selected.any() else X

pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])

params = {
    'classifier__n_neighbors': range(1,15),
    'classifier__weights': ['uniform', 'distance']
}

kfold = StratifiedKFold(10, shuffle=True, random_state=42)
knn_grid = GridSearchCV(pipe, params, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1)
knn_grid.fit(X_selected, y)

print(knn_grid.best_params_)

# Width of the left-aligned label column in the score line.
val_col_space = 20
print("{:{}}: {:.4f}".format("10-fold CV", val_col_space, knn_grid.best_score_))

In [None]:
from sklearn.decomposition import PCA

In [None]:
# k-nearest-neighbours on PCA components of the PSO-selected features.
# `pos` is a 0/1 mask, so it has to become a boolean column selector: X[pos]
# would look up the column *labels* 0 and 1 over and over instead.
selected = np.asarray(pos) == 1
# An all-zero mask means "all features", mirroring f_per_particle.
X_selected = X.loc[:, selected] if selected.any() else X

pipe = Pipeline([
    ('scaling', StandardScaler()),
    # A float in (0, 1) keeps as many components as needed to explain that
    # fraction of the variance.
    ('pca', PCA(0.96)),
    ('classifier', KNeighborsClassifier())
])

params = {
    'classifier__n_neighbors': range(1,15),
    'classifier__weights': ['uniform', 'distance']
}

kfold = StratifiedKFold(10, shuffle=True, random_state=42)
knn_grid = GridSearchCV(pipe, params, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1)
knn_grid.fit(X_selected, y)

print(knn_grid.best_params_)

# Width of the left-aligned label column in the score line.
val_col_space = 20
print("{:{}}: {:.4f}".format("10-fold CV", val_col_space, knn_grid.best_score_))

In [None]:
# Decision tree on PCA components of the PSO-selected features.
# `pos` is a 0/1 mask, so it has to become a boolean column selector: X[pos]
# would look up the column *labels* 0 and 1 over and over instead.
selected = np.asarray(pos) == 1
# An all-zero mask means "all features", mirroring f_per_particle.
X_selected = X.loc[:, selected] if selected.any() else X

pipe = Pipeline([
    ('scaling', StandardScaler()),
    # A float in (0, 1) keeps as many components as needed to explain that
    # fraction of the variance.
    ('pca', PCA(0.96)),
    # Seeded so repeated runs give the same tree and the same score.
    ('classifier', DecisionTreeClassifier(random_state=42))
])

params = {
    'classifier__criterion': ['gini', 'entropy']
}

kfold = StratifiedKFold(10, shuffle=True, random_state=42)
dt_grid = GridSearchCV(pipe, params, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1)
dt_grid.fit(X_selected, y)

print(dt_grid.best_params_)

# Width of the left-aligned label column in the score line.
val_col_space = 20
print("{:{}}: {:.4f}".format("10-fold CV", val_col_space, dt_grid.best_score_))

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# k-nearest-neighbours on the LDA projection of the PSO-selected features.
# `pos` is a 0/1 mask, so it has to become a boolean column selector: X[pos]
# would look up the column *labels* 0 and 1 over and over instead.
selected = np.asarray(pos) == 1
# An all-zero mask means "all features", mirroring f_per_particle.
X_selected = X.loc[:, selected] if selected.any() else X

pipe = Pipeline([
    ('scaling', StandardScaler()),
    # Used here as a supervised transformer placed before the classifier.
    ('lda', LinearDiscriminantAnalysis()),
    ('classifier', KNeighborsClassifier())
])

params = {
    'classifier__n_neighbors': range(1,15),
    'classifier__weights': ['uniform', 'distance']
}

kfold = StratifiedKFold(10, shuffle=True, random_state=42)
knn_grid = GridSearchCV(pipe, params, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1)
knn_grid.fit(X_selected, y)

print(knn_grid.best_params_)

# Width of the left-aligned label column in the score line.
val_col_space = 20
print("{:{}}: {:.4f}".format("10-fold CV", val_col_space, knn_grid.best_score_))

In [None]:
# Decision tree on the LDA projection of the PSO-selected features.
# `pos` is a 0/1 mask, so it has to become a boolean column selector: X[pos]
# would look up the column *labels* 0 and 1 over and over instead.
selected = np.asarray(pos) == 1
# An all-zero mask means "all features", mirroring f_per_particle.
X_selected = X.loc[:, selected] if selected.any() else X

pipe = Pipeline([
    ('scaling', StandardScaler()),
    # Used here as a supervised transformer placed before the classifier.
    ('lda', LinearDiscriminantAnalysis()),
    # Seeded so repeated runs give the same tree and the same score.
    ('classifier', DecisionTreeClassifier(random_state=42))
])

params = {
    'classifier__criterion': ['gini', 'entropy']
}

kfold = StratifiedKFold(10, shuffle=True, random_state=42)
dt_grid = GridSearchCV(pipe, params, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1)
dt_grid.fit(X_selected, y)

print(dt_grid.best_params_)

# Width of the left-aligned label column in the score line.
val_col_space = 20
print("{:{}}: {:.4f}".format("10-fold CV", val_col_space, dt_grid.best_score_))