#### Features:
1. Fraction of clauses that are unit clauses. <!-- exactly one literal -->
2. Fraction of clauses that are Horn clauses. <!-- at most one non-negated literal -->
3. Fraction of clauses that are ground clauses. <!-- contain no variables -->
4. Fraction of clauses that are demodulators. <!-- an equality used as a rule to rewrite newly inferred clauses -->
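5. Fraction of clauses that are rewrite rules (oriented demodulators). <!-- presumably demodulators oriented by the prover's term ordering, so they rewrite in one direction only -->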
6. Fraction of clauses that are purely positive.
7. Fraction of clauses that are purely negative.
8. Fraction of clauses that are mixed positive and negative.
9. Maximum clause length. <!-- number of literals -->
10. Average clause length.
11. Maximum clause depth. <!-- see below -->
12. Average clause depth.
13. Maximum clause weight. <!-- defined by prover; probably its symbol count, excluding commas, parentheses, negation symbols, and disjunction symbols -->
14. Average clause weight.

<!-- 
Depth of Term, Atom, Literal, Clause
* depth of variable, constant, or propositional atom: 0;
* depth of term or atom with arguments: one more than the maximum argument depth;
* depth of literal: depth of its atom (negation signs don't count);
* depth of clause: maximum of depths of literals;
* For example, p(x) | -p(f(x)) has depth 2.
-->

In [1]:
import pandas as pd

# header=None: the first line of the file is treated as data, so pandas labels
# the columns with integers (0, 1, 2, ...).
df = pd.read_csv(filepath_or_buffer="data/all-data-raw.csv", header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.73684,0.00188,0.73872,0.073308,0.18797,-100.0,-100.0,-100.0,-100.0,-100.0
1,0.83307,0.99682,0.83307,0.76948,0,0.77107,0.068363,0.16057,6,1.2734,...,0.74248,0.00188,0.74436,0.067669,0.18797,0.08,0.08,0.2,0.08,0.08
2,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.7406,0.00188,0.74248,0.069549,0.18797,-100.0,-100.0,-100.0,-100.0,-100.0
3,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.72932,0.00188,0.7312,0.080827,0.18797,-100.0,-100.0,-100.0,-100.0,-100.0
4,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.7312,0.00188,0.73308,0.078947,0.18797,-100.0,-100.0,-100.0,-100.0,-100.0


In [2]:
# Preview the five right-most columns of the raw table (the columns that
# best_heuristic reads as per-heuristic times).
h_times = df.iloc[:, slice(-5, None)]
print(h_times.head(n=5))

import numpy as np
def best_heuristic(row, n_heuristics=5, timeout_value=-100.0):
    """Return the 1-based position of the fastest heuristic for one row.

    The last ``n_heuristics`` entries of ``row`` are read as per-heuristic
    times. Entries equal to ``timeout_value`` are treated as missing.

    Parameters
    ----------
    row : pandas.Series
        One row of the raw table; only its trailing ``n_heuristics`` values
        are inspected.
    n_heuristics : int, default 5
        Number of trailing entries that hold heuristic times.
    timeout_value : float, default -100.0
        Sentinel marking a heuristic that produced no usable time.

    Returns
    -------
    int
        ``0`` when every heuristic carries the sentinel, otherwise the
        1-based position (1..n_heuristics) of the smallest time. Ties go to
        the first (left-most) heuristic.

    Raises
    ------
    ValueError
        If ``n_heuristics`` is smaller than 1.
    """
    if n_heuristics < 1:
        # row.iloc[-0:] would silently select the *whole* row.
        raise ValueError("n_heuristics must be at least 1")

    # Positional slice, re-indexed 0..n-1 so idxmin() yields a position.
    h_times = row.iloc[-n_heuristics:].reset_index(drop=True)
    h_times = h_times.replace({timeout_value: np.nan})

    # Test for "all missing" BEFORE calling idxmin(): idxmin() on an all-NaN
    # Series is deprecated (warns on pandas >= 2.1, raises once enforced).
    min_time = h_times.min()
    if np.isnan(min_time):
        return 0
    return h_times.idxmin() + 1

# Label each row with its fastest heuristic, then remove the raw timing
# columns so that only features and the label remain in df.
# Guard: best_heuristic reads the *last* columns of each row, so running this
# cell a second time would compute labels from the wrong columns and then fail
# on the drop. Skipping when the label already exists keeps the cell re-runnable.
if 'heuristic' not in df.columns:
    df['heuristic'] = df.apply(best_heuristic, axis=1)
    df.drop(columns=[53, 54, 55, 56, 57], inplace=True)
df.head()

       53      54     55      56      57
0 -100.00 -100.00 -100.0 -100.00 -100.00
1    0.08    0.08    0.2    0.08    0.08
2 -100.00 -100.00 -100.0 -100.00 -100.00
3 -100.00 -100.00 -100.0 -100.00 -100.00
4 -100.00 -100.00 -100.0 -100.00 -100.00


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,heuristic
0,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.73684,0.00188,0.73872,0.073308,0.18797,0
1,0.83307,0.99682,0.83307,0.76948,0,0.77107,0.068363,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.74248,0.00188,0.74436,0.067669,0.18797,1
2,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.7406,0.00188,0.74248,0.069549,0.18797,0
3,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.72932,0.00188,0.7312,0.080827,0.18797,0
4,0.83307,0.99682,0.83307,0.76789,0,0.76948,0.069952,0.16057,6,1.2734,...,0.020202,0.80639,0.99624,0.80263,0.7312,0.00188,0.73308,0.078947,0.18797,0


In [3]:
# Class distribution of the target. Per best_heuristic: 0 means every heuristic
# carried the -100.0 sentinel, 1-5 is the 1-based position of the fastest one.
df['heuristic'].value_counts()

0    2554
1    1089
3     748
5     624
4     617
2     486
Name: heuristic, dtype: int64

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Features: every column except the label. Target: the 'heuristic' label.
X, y = df.drop(['heuristic'], axis=1).astype('float64'), df['heuristic']
# NOTE: no hold-out test set is split off. best_score_ below is the mean
# cross-validated accuracy of the *selected* candidate, which is an
# optimistic estimate of performance on unseen data.

# The scaler is a pipeline step so it is re-fitted on the training part of
# every CV fold rather than on the full data set.
pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('classifier', KNeighborsClassifier())
])

params = {
    'classifier__n_neighbors': range(1,15),
    'classifier__weights': ['uniform', 'distance']
}

kfold = StratifiedKFold(10, shuffle=True, random_state=42)
# return_train_score=True: the results table inspected further down shows
# split*_train_score / mean_train_score columns, which current scikit-learn
# only records when asked explicitly (the default is False).
knn_grid = GridSearchCV(pipe, params, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1,
                        return_train_score=True)
knn_grid.fit(X, y)

print(knn_grid.best_params_)

val_col_space = 20
print("{:{}}: {:.4f}".format("10-fold CV", val_col_space, knn_grid.best_score_))

Fitting 10 folds for each of 28 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   54.2s


{'classifier__n_neighbors': 12, 'classifier__weights': 'distance'}
10-fold CV          : 0.6036


[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed:  1.4min finished


In [5]:
# Show the winning hyper-parameters and their mean cross-validated accuracy.
print(knn_grid.best_params_)
val_col_space = 20  # width of the left-aligned label column
print(f"{'10-fold CV':{val_col_space}}: {knn_grid.best_score_:.4f}")

{'classifier__n_neighbors': 12, 'classifier__weights': 'distance'}
10-fold CV          : 0.6036


In [6]:
# Raw grid-search results: a dict of per-candidate arrays (fit/score times,
# per-split scores, ...). Verbose; the DataFrame view below is easier to read.
knn_grid.cv_results_



{'mean_fit_time': array([0.11825981, 0.03362541, 0.03699412, 0.03575296, 0.04209585,
        0.04558358, 0.0352242 , 0.03949215, 0.04093268, 0.05376897,
        0.03690968, 0.03709061, 0.03668926, 0.03348157, 0.03599117,
        0.03630178, 0.04776905, 0.04026332, 0.05947323, 0.04398112,
        0.03845041, 0.04905937, 0.0332494 , 0.03382137, 0.03902576,
        0.03489501, 0.03498204, 0.03381867]),
 'std_fit_time': array([0.10088705, 0.00231472, 0.00705138, 0.00958448, 0.01108501,
        0.01312359, 0.00456915, 0.01366133, 0.01052374, 0.01821758,
        0.00455387, 0.00553366, 0.00529744, 0.00259006, 0.00525909,
        0.00792402, 0.01527929, 0.00787149, 0.01822189, 0.01698977,
        0.01046339, 0.01867844, 0.00146975, 0.00196738, 0.00593294,
        0.00289371, 0.00512175, 0.00163039]),
 'mean_score_time': array([0.07264209, 0.06371558, 0.08572564, 0.08180783, 0.12632093,
        0.10404484, 0.10190511, 0.10317721, 0.13589313, 0.15261486,
        0.11658344, 0.12129705, 0.125598

In [18]:
# One row per hyper-parameter candidate, one column per cv_results_ key.
results_knn = pd.DataFrame(data=knn_grid.cv_results_)



In [19]:
# Display the results without the 'params' column (the individual param_*
# columns already carry the same information). drop() returns a new frame,
# so results_knn itself is left unchanged.
results_knn.drop(columns=['params'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,param_classifier__weights,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,0.11826,0.100887,0.072642,0.013302,1,uniform,0.587948,0.592834,0.587948,0.591205,...,0.978016,0.976381,0.973665,0.9753,0.977302,0.974582,0.975676,0.971869,0.97541,0.001669
1,0.033625,0.002315,0.063716,0.005857,1,distance,0.587948,0.592834,0.587948,0.591205,...,0.978016,0.976381,0.973665,0.9753,0.977302,0.974582,0.975676,0.971869,0.97541,0.001669
2,0.036994,0.007051,0.085726,0.010903,2,uniform,0.584691,0.565147,0.576547,0.592834,...,0.790153,0.791061,0.791137,0.788231,0.789541,0.789942,0.792521,0.790563,0.790436,0.001153
3,0.035753,0.009584,0.081808,0.011301,2,distance,0.589577,0.59772,0.587948,0.584691,...,0.977108,0.977108,0.974392,0.9753,0.976938,0.974946,0.974769,0.975862,0.975791,0.000936
4,0.042096,0.011085,0.126321,0.033232,3,uniform,0.568404,0.552117,0.578176,0.561889,...,0.75436,0.756177,0.758264,0.751181,0.75177,0.753631,0.755491,0.750998,0.753969,0.002185
5,0.045584,0.013124,0.104045,0.018449,3,distance,0.596091,0.59772,0.594463,0.605863,...,0.978743,0.978379,0.976934,0.977661,0.978028,0.976398,0.976765,0.975681,0.977244,0.000899
6,0.035224,0.004569,0.101905,0.012146,4,uniform,0.558632,0.568404,0.560261,0.565147,...,0.713118,0.71548,0.721032,0.713767,0.714545,0.715142,0.718279,0.711615,0.715666,0.002834
7,0.039492,0.013661,0.103177,0.021462,4,distance,0.59772,0.600977,0.586319,0.596091,...,0.978924,0.978379,0.977116,0.977842,0.978573,0.976761,0.976584,0.976225,0.977498,0.000859
8,0.040933,0.010524,0.135893,0.032437,5,uniform,0.561889,0.561889,0.547231,0.547231,...,0.690225,0.68968,0.697058,0.690883,0.693118,0.689179,0.693229,0.69147,0.69182,0.002299
9,0.053769,0.018218,0.152615,0.048457,5,distance,0.59772,0.600977,0.591205,0.591205,...,0.979106,0.978198,0.976934,0.977842,0.978573,0.976761,0.976584,0.976951,0.977589,0.000796


In [17]:
from sklearn.tree import DecisionTreeClassifier

# NOTE: pipe, params and kfold are rebound here; after this cell they refer
# to the decision-tree search, not the KNN one.
# random_state is fixed because the tree permutes features at every split, so
# an unseeded tree gives different scores from run to run.
pipe = Pipeline([
    ('scaling', StandardScaler()),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

params = {
    'classifier__criterion': ['gini', 'entropy']
}

kfold = StratifiedKFold(10, shuffle=True, random_state=42)
# return_train_score=True: the results table inspected below shows train-score
# columns, which current scikit-learn only records when asked explicitly.
dt_grid = GridSearchCV(pipe, params, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1,
                       return_train_score=True)
dt_grid.fit(X, y)

print(dt_grid.best_params_)

val_col_space = 20
print("{:{}}: {:.4f}".format("10-fold CV", val_col_space, dt_grid.best_score_))

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


{'classifier__criterion': 'gini'}
10-fold CV          : 0.5590


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    2.2s finished


In [21]:
# Tabulate the decision-tree grid search: one row per candidate.
results_dt = pd.DataFrame(dt_grid.cv_results_)
# Display without the 'params' column (param_* columns carry the same
# information); results_dt itself keeps every column.
results_dt.drop(columns=['params'])



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__criterion,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,0.190236,0.018137,0.001315,0.000286,gini,0.592834,0.54886,0.576547,0.555375,0.535948,...,0.979106,0.978743,0.977116,0.977842,0.978754,0.976943,0.976765,0.977132,0.977734,0.000804
1,0.389734,0.011747,0.001036,5.1e-05,entropy,0.566775,0.550489,0.537459,0.568404,0.521242,...,0.979106,0.978743,0.977116,0.977842,0.978754,0.976943,0.976765,0.977132,0.977734,0.000804
