# Lazy FCA
##### Shenker Anastasia

At first I used one train and test sets without cross-validation for making an algorithms. In the second part of the work I used 'tic-tac-toe' dataset with KFold cross-validation for checking the metrics of these algorithms.

## Dataset Splitting

In [1]:
import pandas as pd
import numpy as np

The function below splits the table into 2 numpy arrays: one for target feature (y), the other for the massive of all other features (x).

In [2]:
def x_y_split(data):
    
    data = data.replace(to_replace='positive', value=1)
    data = data.replace(to_replace='negative', value=0)
    y = np.array(data['V10'])
    del data['V10']
    x = np.array(data)
    
    return x, y

The function below splits x and y into train and test sets and also splits train test into positive and negative contexts.

In [3]:
def test_train_split(df_train, df_test):
    
    x_train, y_train = x_y_split(df_train)
    x_test, y_test = x_y_split(df_test)
    
    x_train_pos = x_train[y_train == 1]
    x_train_neg = x_train[y_train == 0]
    
    return x_train, y_train, x_test, y_test, x_train_pos, x_train_neg

In [4]:
df_train = pd.read_csv('train1.csv', sep=',')
df_test = pd.read_csv('test1.csv', sep=',')

In [5]:
df_train

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,o,x,positive
2,x,x,x,x,o,o,o,b,b,positive
3,x,x,x,x,o,o,b,o,b,positive
4,x,x,x,x,o,o,b,b,o,positive
...,...,...,...,...,...,...,...,...,...,...
860,o,x,x,x,o,o,o,x,x,negative
861,o,x,o,x,x,o,x,o,x,negative
862,o,x,o,x,o,x,x,o,x,negative
863,o,x,o,o,x,x,x,o,x,negative


For some functions the features the quantitative form of nominal features may be needed.

In [49]:
df_train_dummy = pd.get_dummies(df_train)
del df_train_dummy['V10_negative']
df_train_dummy.rename(columns={'V10_positive': 'V10'}, inplace=True)

df_test_dummy = pd.get_dummies(df_test)
del df_test_dummy['V10_negative']
df_test_dummy.rename(columns={'V10_positive': 'V10'}, inplace=True)

In [51]:
df_train_dummy

Unnamed: 0,V1_b,V1_o,V1_x,V2_b,V2_o,V2_x,V3_b,V3_o,V3_x,V4_b,...,V7_b,V7_o,V7_x,V8_b,V8_o,V8_x,V9_b,V9_o,V9_x,V10
0,0,0,1,0,0,1,0,0,1,0,...,0,0,1,0,1,0,0,1,0,1
1,0,0,1,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,0,1,1
2,0,0,1,0,0,1,0,0,1,0,...,0,1,0,1,0,0,1,0,0,1
3,0,0,1,0,0,1,0,0,1,0,...,1,0,0,0,1,0,1,0,0,1
4,0,0,1,0,0,1,0,0,1,0,...,1,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
860,0,1,0,0,0,1,0,0,1,0,...,0,1,0,0,0,1,0,0,1,0
861,0,1,0,0,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,1,0
862,0,1,0,0,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,1,0
863,0,1,0,0,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,1,0


In [6]:
x_train, y_train, x_test, y_test, x_train_pos, x_train_neg = test_train_split(df_train, df_test)

In [52]:
x_train_dummy, y_train_dummy, x_test_dummy, y_test_dummy, x_train_pos_dummy, x_train_neg_dummy = test_train_split(df_train_dummy, df_test_dummy)

In [7]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def metrics_test(y_test, y_pred):
    
    y_test = np.array(y_test)
    y_pred = np.array(y_pred)
    
    TP = np.sum(y_test * y_pred)
    TN = np.sum(y_test + y_pred == 0)
    FP = np.sum((y_test  == 0) * (y_pred == 1))
    FN = np.sum((y_test  == 1) * (y_pred == 0))
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

#     print('True Positive: {:.4f}'.format(TP))
#     print('True Negative: {:.4f}'.format(TN))
#     print('False Positive: {:.4f}'.format(FP))
#     print('False Negativee: {:.4f}'.format(FN))
#     print('Accuracy score: {:.4f}'.format(acc))
#     print('Roc AUC Score: {:.4f}'.format(roc_auc))
#     print('Precision Score: {:.4f}'.format(prc))
#     print('Recall Score: {:.4f}'.format(rcl))
    
    return TP, TN, FP, FN, acc, prec, rec, roc_auc

In [216]:
def print_metrics(metrics):

    print('True Positive: {:.4f}'.format(metrics[0]))
    print('True Negative: {:.4f}'.format(metrics[1]))
    print('False Positive: {:.4f}'.format(metrics[2]))
    print('False Negative: {:.4f}'.format(metrics[3]))
    print('Accuracy score: {:.4f}'.format(metrics[4]))
    print('Precision Score: {:.4f}'.format(metrics[5]))
    print('Recall Score: {:.4f}'.format(metrics[6]))
    print('Roc AUC Score: {:.4f}'.format(metrics[7]))
    print('Time of algorithm work: {:.4f}'.format(metrics[8]))

In [8]:
def make_intent(test_string):
    return set([i+':'+str(k) for i, k in zip(list(df_train),test_string)])

The following function is a function for getting metrics after algorithm launching.

In [99]:
def alg_launch(alg_n, x_train, y_train, x_test, y_test, x_train_pos, x_train_neg):
    
    # time on
    import timeit
    start = timeit.default_timer()
    
    if alg_n == 1:
        y_pred = alg_1(x_test, x_train_pos, x_train_neg)
        
    if alg_n == 2:
        y_pred = alg_2(x_test, x_train_pos, x_train_neg)
        
    if alg_n == 3:
        y_pred = alg_3(x_test_dummy, x_train_pos_dummy, x_train_neg_dummy)
    
    metrics = []
    
    TP, TN, FP, FN, acc, prec, rec, roc_auc = metrics_test(y_test=y_test, y_pred=y_pred)
    for metric in [TP, TN, FP, FN, acc, prec, rec, roc_auc]:
        metrics.append(metric)
    
    # time off    
    stop = timeit.default_timer()
    time = stop - start
    metrics.append(time)
    
    return metrics

## Algorithm

#### Algorithm 1

An example is classified positively if each of its intersections with objects from the plus-context is nested in no more than 2 descriptions from the minus-context (and vice versa).

In [120]:
import random

def alg_1(x_test, x_train_pos, x_train_neg):
    
    y_pred = []
    clas = 0
    threshold = 3
    
    for el in x_test:
        labels = {'positive':1, 'negative':1} #ones
        test_string = make_intent(el)
        for el_pos in x_train_pos:
            pos_string = make_intent(el_pos)
            intersection = pos_string & test_string
            neg_intersect = [make_intent(i) for i in x_train_neg if make_intent(i).issuperset(intersection)]
            if len(neg_intersect) > threshold:
                labels["positive"] = 0
                break
        for el_neg in x_train_neg:
            neg_string = make_intent(el_neg)
            intersection = neg_string & test_string
            pos_intersect = [make_intent(i) for i in x_train_pos if make_intent(i).issuperset(intersection)]
            if len(pos_intersect) > threshold:
                labels["negative"] = 0
                break
    
        if labels["positive"] == labels["negative"]:
            clas = random.choice([0,1])
        if labels["positive"] > labels["negative"]:
            clas = 1
        if labels["positive"] < labels["negative"]:
            clas = 0
        y_pred.append(clas)
        
    return y_pred

Metrics:

In [219]:
metrics = alg_launch(1, x_train, y_train, x_test, y_test, x_train_pos, x_train_neg)
print_metrics(metrics)

True Positive: 23.0000
True Negative: 20.0000
False Positive: 18.0000
False Negative: 34.0000
Accuracy score: 0.4526
Precision Score: 0.5610
Recall Score: 0.4035
Roc AUC Score: 0.4649
Time of algorithm work: 2.7878


#### Algorithm 2

Each plus-context object 'votes' for a positive classification if its intersection with the example does not fit into the minus-context descriptions (and vice versa). An example is classified positively if the number of 'votes' for the positive classification prevails (and vice versa).

In [10]:
def alg_2(x_test, x_train_pos, x_train_neg):
    
    y_pred = []
    clas = 0
    
    for el in x_test:
        labels = {'positive':0, 'negative':0} 
        test_string = make_intent(el)
        for el_pos in x_train_pos:
            pos_string = make_intent(el_pos)
            intersection = pos_string & test_string
            neg_intersect = [make_intent(i) for i in x_train_neg if make_intent(i).issuperset(intersection)]
            if not neg_intersect:
                labels["positive"] += 1
        for el_neg in x_train_neg:
            neg_string = make_intent(el_neg)
            intersection = neg_string & test_string
            pos_intersect = [make_intent(i) for i in x_train_pos if make_intent(i).issuperset(intersection)]
            if not pos_intersect:
                labels["negative"] += 1
            
        labels["positive"] = float(labels["positive"]) / len(x_train_pos)
        labels["negative"] = float(labels["negative"]) / len(x_train_neg)
        
        if labels["positive"] == labels["negative"]:
            clas = random.choice([0,1])
        if labels["positive"] > labels["negative"]:
            clas = 1
        if labels["positive"] < labels["negative"]:
            clas = 0
        y_pred.append(clas)
        
    return y_pred

Metrics:

In [101]:
metrics = alg_launch(2, x_train, y_train, x_test, y_test, x_train_pos, x_train_neg)
print_metrics(metrics)

True Positive: 61.0000
True Negative: 32.0000
False Positive: 0.0000
False Negative: 0.0000
Accuracy score: 1.0000
Precision Score: 1.0000
Recall Score: 1.0000
Roc AUC Score: 1.0000
Time of algorithm work: 277.7055


#### Algorithm 3

An example is classified positively if its intersection with plus-context is greater than with minus-context and vice versa.

In [108]:
def alg_3(x_test, x_train_pos, x_train_neg):

    def Intersect(string, context):
        intersections = 0
        for i in context:
            intersections += ((string * i).sum())
        return intersections
        
    y_pred = []
        
    for string in x_test:
        plus = Intersect(string, x_train_pos)
        minus = Intersect(string, x_train_neg)
        if plus > minus:
            y = 1
        elif plus == minus:
            y = random.choice([0,1])
        else:
            y = 0
        y_pred.append(y)
       
    return y_pred 

Metrics:

In [109]:
metrics = alg_launch(3, x_train, y_train, x_test, y_test, x_train_pos, x_train_neg)
print_metrics(metrics)

True Positive: 61.0000
True Negative: 0.0000
False Positive: 32.0000
False Negative: 0.0000
Accuracy score: 0.6559
Precision Score: 0.6559
Recall Score: 1.0000
Roc AUC Score: 0.5000
Time of algorithm work: 0.3512


## Tic-tac-toe dataset

This part contains of the implementation of the best of above algorithms for 'tic-tac-toe' dataset with KFold cross-validation with 10 splits.
According to accuracy, the best algorithm was the 3rd one.

In [125]:
data = pd.read_table('tic-tac-toe.data', sep = ',', names = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10'], index_col = False)

In [127]:
data_dummy = pd.get_dummies(data)
del data_dummy['V10_negative']
data_dummy.rename(columns={'V10_positive': 'V10'}, inplace=True)

In [129]:
data_dummy

Unnamed: 0,V1_b,V1_o,V1_x,V2_b,V2_o,V2_x,V3_b,V3_o,V3_x,V4_b,...,V7_b,V7_o,V7_x,V8_b,V8_o,V8_x,V9_b,V9_o,V9_x,V10
0,0,0,1,0,0,1,0,0,1,0,...,0,0,1,0,1,0,0,1,0,1
1,0,0,1,0,0,1,0,0,1,0,...,0,1,0,0,0,1,0,1,0,1
2,0,0,1,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,0,1,1
3,0,0,1,0,0,1,0,0,1,0,...,0,1,0,1,0,0,1,0,0,1
4,0,0,1,0,0,1,0,0,1,0,...,1,0,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
953,0,1,0,0,0,1,0,0,1,0,...,0,1,0,0,0,1,0,0,1,0
954,0,1,0,0,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,1,0
955,0,1,0,0,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,1,0
956,0,1,0,0,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,1,0


In [131]:
x, y = x_y_split(data_dummy)

Implementation of the KFold cross-validation with 10 splits.

In [206]:
from sklearn.model_selection import KFold

x_trains = []
x_tests = []
y_trains = []
y_tests = []

fold = KFold(10, shuffle=True)
for train_index, test_index in fold.split(x):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    x_trains.append(x_train)
    x_tests.append(x_test)
    y_trains.append(y_train)
    y_tests.append(y_test)
    
exec('{} = x_trains'.format(','.join([f'x_train{i}' for i in range(len(x_trains))])))
exec('{} = x_tests'.format(','.join([f'x_test{i}' for i in range(len(x_tests))])))
exec('{} = y_trains'.format(','.join([f'y_train{i}' for i in range(len(y_trains))])))
exec('{} = y_tests'.format(','.join([f'y_test{i}' for i in range(len(y_tests))])))

Deviding 10 train sets into positive and negative contexts.

In [210]:
def contexts(train_set):
    x_train_pos = x_train[y_train == 1]
    x_train_neg = x_train[y_train == 0]
    
    return x_train_pos, x_train_neg

x_train_poses = []
x_train_negs = []

for i in x_trains:
    x_train_pos, x_train_neg = contexts(i)
    
    x_train_poses.append(x_train_pos)
    x_train_negs.append(x_train_neg)

exec('{} = x_train_poses'.format(','.join([f'x_train_pos{i}' for i in range(len(x_train_poses))])))
exec('{} = x_train_negs'.format(','.join([f'x_train_neg{i}' for i in range(len(x_train_negs))])))

The following function is the implementation of launchiing algorithm function for cross-validation case. It returns the average metrics of cross-validated algorithm.

In [258]:
def alg_launch_cv(alg_n, x_trains, y_trains, x_tests, y_tests, x_train_poses, x_train_negs):
    
    # time on
    import timeit
    start = timeit.default_timer()
    
    y_preds = []
    for i in range(10):
        
        if alg_n == 1:
            y_pred = alg_1(x_tests[i], x_train_pos[i], x_train_neg[i])
            y_preds.append(y_pred)
        
        if alg_n == 2:
            y_pred = alg_2(x_tests[i], x_train_pos[i], x_train_neg[i])
            y_preds.append(y_pred)
        
        if alg_n == 3:
            y_pred = alg_3(x_tests[i], x_train_pos[i], x_train_neg[i])
            y_preds.append(y_pred)
    
    metrics_10 = []
    for i in range(10):
        metrics = metrics_test(y_test=y_tests[i], y_pred=y_preds[i])
        metrics_10.append(metrics)
        avg_metrics = np.mean(np.array(metrics_10), axis=0)
    
    # time off    
    stop = timeit.default_timer()
    time = stop - start
    
    return avg_metrics, time

In [256]:
metrics, time = alg_launch_cv(3, x_trains, y_trains, x_tests, y_tests, x_train_poses, x_train_negs)
metrics = metrics.tolist()
metrics.append(time)
print_metrics(metrics)

True Positive: 32.4000
True Negative: 15.5000
False Positive: 17.7000
False Negative: 30.2000
Accuracy score: 0.5001
Precision Score: 0.6495
Recall Score: 0.5183
Roc AUC Score: 0.4950
Time of algorithm work: 0.2804
