#### Features:
1. Fraction of clauses that are unit clauses. <!-- exactly one literal -->
2. Fraction of clauses that are Horn clauses. <!-- at most one non-negated literal -->
4. Fraction of clauses that are ground clauses. <!-- contain no variables -->
4. Fraction of clauses that are demodulators. <!-- equality used as rule to rewrite newly inferred clause -->
5. Fraction of clauses that are rewrite rules (oriented demodulators). <!-- ? -->
6. Fraction of clauses that are purely positive.
7. Fraction of clauses that are purely negative.
8. Fraction of clauses that are mixed positive and negative.
9. Maximum clause length. <!-- number of literals -->
10. Average clause length.
11. Maximum clause depth. <!-- see below -->
12. Average clause depth.
13. Maximum clause weight. <!-- defined by prover; probably its symbol count, excluding commas, parentheses, negation symbols, and disjunction symbols -->
14. Average clause weight.

<!-- 
Depth of Term, Atom, Literal, Clause
* depth of variable, constant, or propositional atom: 0;
* depth of term or atom with arguments: one more than the maximum argument depth;
* depth of literal: depth of its atom (negation signs don't count);
* depth of clause: maximum of depths of literals;
* For example, p(x) | -p(f(x)) has depth 2.
-->

In [79]:
import pandas as pd

# Load the raw table. header=None tells pandas the file has no header row,
# so the columns get integer labels (0-57 in the preview below).
df = pd.read_csv("data/all-data-raw.csv", header=None)
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.73684,0.00188,0.73872,0.073308,0.18797,-100.0,-100.0,-100.0,-100.0,-100.0
1,0.83307,0.99682,0.83307,0.76948,0,0.77107,0.068363,0.16057,6,1.2734,...,0.74248,0.00188,0.74436,0.067669,0.18797,0.08,0.08,0.2,0.08,0.08
2,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.7406,0.00188,0.74248,0.069549,0.18797,-100.0,-100.0,-100.0,-100.0,-100.0
3,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.72932,0.00188,0.7312,0.080827,0.18797,-100.0,-100.0,-100.0,-100.0,-100.0
4,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.7312,0.00188,0.73308,0.078947,0.18797,-100.0,-100.0,-100.0,-100.0,-100.0


In [80]:
# Peek at the five trailing columns (labelled 53-57 in the printed output).
# best_heuristic() reads these same trailing entries of each row as the
# per-heuristic timings.
h_times = df.iloc[:, -5:]
print(h_times.iloc[:5])

import numpy as np
def best_heuristic(row, n_heuristics=5, sentinel=-100.0):
    """Return the 1-based position of the fastest heuristic for one row.

    The last ``n_heuristics`` entries of ``row`` are read as per-heuristic
    timings. Entries equal to ``sentinel`` are treated as missing
    (presumably "this heuristic did not finish" -- TODO confirm against the
    data-set description).

    Parameters
    ----------
    row : pandas.Series
        One row of the table; the timings must be its trailing entries.
    n_heuristics : int, default 5
        How many trailing entries hold timings.
    sentinel : float, default -100.0
        Value that marks a timing as missing.

    Returns
    -------
    int
        0 when every timing is missing, otherwise a value in
        ``1..n_heuristics``. On ties the first (lowest-numbered) heuristic
        wins.
    """
    # .iloc makes the slice explicitly positional, whatever the index labels are.
    times = row.iloc[-n_heuristics:].reset_index(drop=True)
    # Mask the sentinel so it can never win the minimum.
    times = times.where(times != sentinel)

    # Handle the "nothing usable" case first: calling idxmin() on an all-NaN
    # Series is deprecated in recent pandas and is slated to raise ValueError.
    if times.isna().all():
        return 0
    return int(times.idxmin()) + 1

# Derive the target label from the five timing columns, then remove those
# columns so that only features and the label remain in df.
#
# The guard keeps this cell safe to re-run. Without it, a second run would
# relabel every row from the wrong trailing columns (the timings are already
# gone) and then fail in drop() with a KeyError.
if 'heuristic' not in df.columns:
    timing_cols = [53, 54, 55, 56, 57]
    # Pass only the timing columns, so the label does not depend on where
    # those columns happen to sit in the frame.
    df['heuristic'] = df[timing_cols].apply(best_heuristic, axis=1)
    df.drop(timing_cols, axis=1, inplace=True)
df.head()

       53      54     55      56      57
0 -100.00 -100.00 -100.0 -100.00 -100.00
1    0.08    0.08    0.2    0.08    0.08
2 -100.00 -100.00 -100.0 -100.00 -100.00
3 -100.00 -100.00 -100.0 -100.00 -100.00
4 -100.00 -100.00 -100.0 -100.00 -100.00


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,heuristic
0,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.73684,0.00188,0.73872,0.073308,0.18797,0
1,0.83307,0.99682,0.83307,0.76948,0,0.77107,0.068363,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.74248,0.00188,0.74436,0.067669,0.18797,1
2,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.7406,0.00188,0.74248,0.069549,0.18797,0
3,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.72932,0.00188,0.7312,0.080827,0.18797,0
4,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.7312,0.00188,0.73308,0.078947,0.18797,0


In [81]:
# Class balance of the target. Label 0 means best_heuristic() found no usable
# timing for the row; labels 1-5 give the position of the fastest heuristic.
df['heuristic'].value_counts()

0    2554
1    1089
3     748
5     624
4     617
2     486
Name: heuristic, dtype: int64

In [98]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Target is the 'heuristic' label; features are every other column, cast to float.
y = df['heuristic']
X = df.drop(columns=['heuristic']).astype('float64')
# NOTE(review): a stratified 70/30 train/test split was sketched here but is
# disabled, so the search below runs on the full data set. The score it
# reports is the model-selection CV score, not a held-out test estimate.

# Standardise the features, then classify with k-nearest neighbours.
pipe = Pipeline(steps=[
    ('scaling', StandardScaler()),
    ('classifier', KNeighborsClassifier()),
])

# Grid: k = 1..14, crossed with uniform vs. distance-weighted voting.
params = dict(
    classifier__n_neighbors=range(1, 15),
    classifier__weights=['uniform', 'distance'],
)

# Seeded, shuffled 10-fold stratified CV.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
knn_grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='accuracy',
    cv=kfold,
    verbose=1,
    n_jobs=-1,
)
knn_grid.fit(X, y)

print(knn_grid.best_params_)

# Width of the left-aligned label column in the printed score line.
val_col_space = 20
print("{:{}}: {:.4f}".format("10-fold CV", val_col_space, knn_grid.best_score_))

Fitting 10 folds for each of 28 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed:  3.1min finished


NameError: name 'classifier' is not defined

In [102]:
# Report the winning hyper-parameters and their mean cross-validated accuracy.
# The label is left-aligned in a column of val_col_space characters.
val_col_space = 20
print(knn_grid.best_params_)
print("{:{}}: {:.4f}".format("10-fold CV", val_col_space, knn_grid.best_score_))

{'classifier__n_neighbors': 12, 'classifier__weights': 'distance'}
10-fold CV          : 0.6036


In [103]:
# Raw cross-validation record of the grid search, displayed as a dict of
# arrays (the output below starts with timing entries such as 'mean_fit_time').
knn_grid.cv_results_



{'mean_fit_time': array([0.1021359 , 0.05731089, 0.07179599, 0.06776688, 0.066731  ,
        0.0665082 , 0.06477678, 0.06517074, 0.06712677, 0.06862223,
        0.06526237, 0.06904685, 0.06701689, 0.06774733, 0.06505749,
        0.06624551, 0.06955681, 0.06706414, 0.06563666, 0.06473808,
        0.06790586, 0.06899359, 0.06533141, 0.0688319 , 0.06593912,
        0.06330769, 0.06838744, 0.07005229]),
 'std_fit_time': array([0.04052453, 0.01193452, 0.0051924 , 0.00260733, 0.00475699,
        0.00366516, 0.00665532, 0.00647001, 0.01259429, 0.00478825,
        0.00891787, 0.00445975, 0.00541554, 0.00596335, 0.00550141,
        0.00508964, 0.00953098, 0.00315979, 0.00560787, 0.00888291,
        0.00508041, 0.0036896 , 0.00710937, 0.00616511, 0.00947686,
        0.00923715, 0.00281709, 0.00325507]),
 'mean_score_time': array([0.1313055 , 0.11531773, 0.18309433, 0.181218  , 0.20303285,
        0.19797404, 0.20172992, 0.217187  , 0.22107725, 0.22898333,
        0.28980911, 0.24118516, 0.251607