In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import metrics
from datetime import datetime
import sklearn.tree as tree
from sklearn.linear_model import LogisticRegression
import copy
from sklearn.base import BaseEstimator, ClassifierMixin
import random

# Lazy FCA

Оформим алгоритм `Lazy FCA` как класс, реализующий интерфейс пакета `sklearn` для ML моделей.

In [2]:
class LazyFCA(BaseEstimator, ClassifierMixin):
    def __init__(
        self, threshold=0.5, 
        random=False, sample_share=0.5, 
        bias='random', random_seed=None):
        
        self.threshold = threshold
        self.random = random
        self.sample_share = sample_share
        self.bias = bias
        self.random_seed = random_seed
        self.binary_mapping = dict()
    
    def fit(self, X, y):
        pd.options.mode.chained_assignment = None
        X = self.scaled_X(X)
        y = self.scaled_y(y)
        self.positive_sample = X[y == 1]
        self.negative_sample = X[y == 0]

        if self.random:
            sample_size = int(self.sample_share * self.positive_sample.shape[0])
            self.positive_sample = self.positive_sample.sample(
                    n=sample_size, random_state=self.random_seed)
            self.negative_sample = self.negative_sample.sample(
                    n=sample_size, random_state=self.random_seed)
        
        self.positive_obj = {}
        self.negative_obj = {}
        pos = self.positive_sample
        neg = self.negative_sample
        for i_col in X.columns:
            self.positive_obj[i_col] = pos[i_col][pos[i_col] == 1].index
            self.negative_obj[i_col] = neg[i_col][neg[i_col] == 1].index
        
    def predict(self, X):
        pd.options.mode.chained_assignment = None
        random.seed(self.random_seed)
        X = self.scaled_X(X)
        predictions = []
        for i_obj in range(X.shape[0]):
            i_extent = self.extent(X.iloc[i_obj])
            support_pos  = self.calculate_support(i_extent, 'positive')
            support_neg  = self.calculate_support(i_extent, 'negative')
            
            if support_neg == support_pos:
                if self.bias == 'random':
                    prediction = random.choice([True, False])
                elif self.bias == 'positive':
                    prediction = True
                else:
                    prediction = False
            else: 
                prediction = support_pos > support_neg
            predictions.append(self.binary_mapping[prediction])
        return predictions
    
    def scaled_X(self, X_dataset):
        intervals = 5
        for i_col in X_dataset.columns:
            values = list(X_dataset[i_col].unique())

            if len(values) == 2 and 0 in values and 1 in values:
                continue
            elif len(values) == 1 and (0 in values or 1 in values):
                continue
            
            elif len(values) <= 2 or X_dataset[i_col].dtypes == np.dtype('O'):
                values = sorted(list(X_dataset[i_col].unique()))
                for i_val in values:
                    X_dataset['{}_{}'.format(i_col, i_val)]\
                        = (X_dataset[i_col] == i_val).astype(int)
            
            elif X_dataset[i_col].dtype == np.dtype('int64'):
                min_val = X_dataset[i_col].min()
                max_val = X_dataset[i_col].max()
                gap = max_val - min_val
                start = min_val + gap / intervals
                finish = max_val - gap / intervals
                k = 0
                for i in np.linspace(start, finish, intervals):
                    X_dataset['{}_{}'.format(i_col, k)]\
                        = (X_dataset[i_col] >= i).astype(int)
                    k += 1

            X_dataset.drop([i_col], axis=1, inplace = True)
        return X_dataset
    
    def scaled_y(self, y_series):
        values = sorted(y_series.unique())
        if len(values) != 2:
            raise Exception('Only a binary target feature is possible')
        self.binary_mapping[False] = values[0]
        self.binary_mapping[True] = values[1]
        return (y_series == values[1]).astype(int)

    def calculate_support(self, obj_ext, base):

        base_sample = (self.positive_sample if base == 'positive' 
                else self.negative_sample)
        review_sample = (self.negative_sample if base == 'positive' 
                else self.positive_sample)
        review_obj = (self.negative_obj if base == 'positive' 
                else self.positive_obj)

        res = 0
        for _, i_obj in base_sample.iterrows():
            i_inters = self.intersection(
                obj_ext, self.extent(i_obj))
            support_card = 0
            if i_inters: 
                support = review_obj[i_inters[0]]
                for i_col in i_inters:
                    support = self.intersection(support, review_obj[i_col])
                    if not support: break
                support_card = len(support) / review_sample.shape[0]
                if support_card < self.threshold:
                    res += len(i_inters) / len(obj_ext)
        
        res = res / base_sample.shape[0]
        return res

    def extent(self, series):
        return series[series == 1].index.tolist()

    def intersection(self, L, R):
        return [val for val in L if val in R]

    def belongs(self, sub, base):
        return len(self.intersection(sub, base)) == len(sub)

# Tic-Tac-Toe Dataset

Функция шкалирования для датасета по крестикам-ноликам

In [3]:
def scale(dataset):
    for i in range(9):
        str_i = str(i + 1)
        dataset['v' + str_i] = (dataset['V' + str_i] == 'x').astype(int)
    dataset['v10'] = (dataset['V10'] == 'positive').astype(int)
    dataset.drop(['V' + str(i+1) for i in range(10)], axis=1, inplace = True)
    return dataset

Функция тренерующая переданную модель `model` на датасете по крестикам-ноликам и вычисляющая точность предсказаний полученной модели.

In [4]:
def tic_tac_toe(model, progress_bar=False):
    results = {'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'seconds': []}

    for i in range(10):
        if progress_bar:
            print(f'Progress: {i + 1} / 10')

        train_data = scale(pd.read_csv(f'tic-tac-toe/train{i + 1}.csv'))
        X_train = train_data.iloc[:, :-1]
        y_train = train_data.iloc[:, -1]

        model.fit(X_train, y_train)

        test_data = scale(pd.read_csv(f'tic-tac-toe/test{i + 1}.csv'))
        X_test = test_data.iloc[:, :-1]
        y_test = test_data.iloc[:, -1]

        s = datetime.now()
        y_pred = model.predict(X_test)
        f = datetime.now()

        results['accuracy'].append(metrics.accuracy_score(y_test, y_pred))
        results['precision'].append(metrics.precision_score(y_test, y_pred))
        results['recall'].append(metrics.recall_score(y_test, y_pred))
        results['f1'].append(metrics.f1_score(y_test, y_pred))

        results['seconds'].append((f - s).seconds)

    return pd.DataFrame(results)

## Lazy FCA

Начнем с `Lazy FCA` натренерованной на $\frac{1}{5}$ всего датасета.

In [5]:
model = LazyFCA(
    threshold=0.000001, bias='negative', 
    random=True, sample_share=0.2, random_seed=1)
tic_tac_toe(model)

Unnamed: 0,accuracy,precision,recall,f1,seconds
0,1.0,1.0,1.0,1.0,7
1,0.988506,0.980769,1.0,0.990291,7
2,0.99,0.984848,1.0,0.992366,8
3,0.966292,0.951613,1.0,0.975207,8
4,0.988764,0.984127,1.0,0.992,7
5,0.988235,0.982456,1.0,0.99115,6
6,0.973684,0.958904,1.0,0.979021,10
7,1.0,1.0,1.0,1.0,9
8,1.0,1.0,1.0,1.0,9
9,0.989011,0.983333,1.0,0.991597,6


Теперь посмотрим на `Lazy FCA` на полном датасете.

In [6]:
model = LazyFCA(threshold=0.000001, bias='negative')
tic_tac_toe(model)

Unnamed: 0,accuracy,precision,recall,f1,seconds
0,1.0,1.0,1.0,1.0,35
1,1.0,1.0,1.0,1.0,37
2,1.0,1.0,1.0,1.0,40
3,1.0,1.0,1.0,1.0,34
4,1.0,1.0,1.0,1.0,47
5,1.0,1.0,1.0,1.0,33
6,1.0,1.0,1.0,1.0,40
7,1.0,1.0,1.0,1.0,43
8,1.0,1.0,1.0,1.0,36
9,1.0,1.0,1.0,1.0,34


### Decision Tree

Сравним результаты `Lazy FCA` с классической моделью `Decision Tree`

In [7]:
model = tree.DecisionTreeClassifier(criterion='entropy')

tic_tac_toe(model)

Unnamed: 0,accuracy,precision,recall,f1,seconds
0,0.989247,1.0,0.983607,0.991736,0
1,0.954023,0.979592,0.941176,0.96,0
2,0.99,0.984848,1.0,0.992366,0
3,0.988764,1.0,0.983051,0.991453,0
4,0.988764,1.0,0.983871,0.99187,0
5,0.988235,0.982456,1.0,0.99115,0
6,1.0,1.0,1.0,1.0,0
7,0.971963,0.972973,0.986301,0.979592,0
8,0.990291,1.0,0.985714,0.992806,0
9,0.989011,1.0,0.983051,0.991453,0


Даже модель, натренированная на 20% от всех данных оказалась лучше дерева решений, а полная модель достигла абсолютной точности.

# Titanic Dataset

Рассмотрим теперь работу `Lazy FCA` на знаменитом датасете - данных о смертности пассажиров Титаника, и сравним полученную точность с точностью логистической регрессии.

In [8]:
titanic_data = pd.read_csv('titanic/train.csv')\
    .drop(columns=['Name', 'Ticket', 'PassengerId', 'Cabin'])\
    .dropna()\
    .rename(columns={"Survived": "target"})
titanic_data

Unnamed: 0,target,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
889,1,1,male,26.0,0,0,30.0000,C


Шкалирование численных и категориальных данных. Численные разбиваются на `intervals` равных интервалов и для каждого интервала создается своя фича. В категориальных данных для каждой категории создается своя фича.

In [9]:
def scaling(data, numeric, categorical, intervals=5):
    for attr in numeric:
        min_val = data[attr].min()
        max_val = data[attr].max()
        gap = max_val - min_val
        k = 0
        for i in np.linspace(min_val + gap / intervals, max_val - gap / intervals, intervals):
            data[attr + '_' + str(k)] = (data[attr] >= i).astype(int)
            k += 1
        data = data.drop(attr, axis=1)
        
    for attr in categorical:
        for i in data[attr].unique():
            data[attr + '_' + str(i)] = (data[attr] == i).astype(int)
        data = data.drop(attr, axis=1)
    return data

In [10]:
titanic_data = scaling(titanic_data, numeric=['Age', 'Fare'], categorical=['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked'])
titanic_data

Unnamed: 0,target,Age_0,Age_1,Age_2,Age_3,Age_4,Fare_0,Fare_1,Fare_2,Fare_3,...,Parch_0,Parch_1,Parch_2,Parch_5,Parch_3,Parch_4,Parch_6,Embarked_S,Embarked_C,Embarked_Q
0,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
1,1,1,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,1,1,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,0,1,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
886,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
887,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
889,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


Функция тренерующая переданную модель `model` на датасете по пассажирам титаника и вычисляющая точность предсказаний полученной модели.

In [11]:
def titanc(model, progress_bar=False):
    columns = list(titanic_data.columns)
    columns.remove('target')

    X = titanic_data.loc[:, columns]
    y = titanic_data.target

    results = {'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'seconds': []}
    for k in range(10):
        if progress_bar:
            print(f'Progress: {k + 1} / 10')
        X_train, X_test, y_train, y_test = model_selection\
            .train_test_split(X, y, test_size=0.33, random_state=k)

        model.fit(X_train, y_train)

        s = datetime.now()
        y_pred = model.predict(X_test)
        f = datetime.now()

        results['accuracy'].append(metrics.accuracy_score(y_test, y_pred))
        results['precision'].append(metrics.precision_score(y_test, y_pred))
        results['recall'].append(metrics.recall_score(y_test, y_pred))
        results['f1'].append(metrics.f1_score(y_test, y_pred))
        
        results['seconds'].append((f - s).seconds)

    return pd.DataFrame(results)

### Lazy classification

Найдем сначала перебором лучшие параметры для нашей модели, используя 10% от всего датасета при тренеровке моделей.

In [12]:
for i in ['random', 'positive', 'negative']:
    for j in np.linspace(0.1, 0.9, 5):
        model = LazyFCA(threshold=j, bias=i, random=True, sample_share=0.1)
        res = titanc(model)
        print()
        print('Parameters:', model)
        print()

        print(res)
        print()
        print('F1:', res['f1'].mean())


Parameters: LazyFCA(random=True, sample_share=0.1, threshold=0.1)

   accuracy  precision    recall        f1  seconds
0  0.702128   0.609524  0.688172  0.646465        3
1  0.527660   0.458101  0.854167  0.596364        2
2  0.761702   0.701149  0.670330  0.685393        2
3  0.791489   0.772152  0.663043  0.713450        3
4  0.791489   0.773333  0.644444  0.703030        3
5  0.574468   0.459459  0.772727  0.576271        3
6  0.748936   0.672897  0.750000  0.709360        2
7  0.634043   0.537879  0.739583  0.622807        2
8  0.748936   0.679612  0.729167  0.703518        2
9  0.591489   0.465753  0.790698  0.586207        3

F1: 0.6542864431070503

Parameters: LazyFCA(random=True, sample_share=0.1, threshold=0.30000000000000004)

   accuracy  precision    recall        f1  seconds
0  0.740426   0.710526  0.580645  0.639053        3
1  0.719149   0.750000  0.468750  0.576923        3
2  0.714894   0.671429  0.516484  0.583851        2
3  0.710638   0.785714  0.358696  0.492537  

Лучшими по $F_1$ метрике оказались параметры `bias` $=$ `random` и `threshold` $=0.9$. Запустим нашу модель на всех данных, используя эти параметры.

In [15]:
model = LazyFCA(threshold=0.9, bias='random')
res = titanc(model)

print(res)
print()
print('F1:', res['f1'].mean())

   accuracy  precision    recall        f1  seconds
0  0.765957   0.796875  0.548387  0.649682       52
1  0.774468   0.811594  0.583333  0.678788       51
2  0.761702   0.753623  0.571429  0.650000       61
3  0.761702   0.750000  0.586957  0.658537       58
4  0.787234   0.794118  0.600000  0.683544       70
5  0.787234   0.779412  0.602273  0.679487       58
6  0.748936   0.793651  0.520833  0.628931       56
7  0.795745   0.852941  0.604167  0.707317       52
8  0.774468   0.786667  0.614583  0.690058       62
9  0.812766   0.850000  0.593023  0.698630       57

F1: 0.672497398340006


### Logistic regression

Сравним теперь результаты `Lazy FCA` с классической моделью логистической регрессии.

In [16]:
model = LogisticRegression(solver='lbfgs', random_state=0)
res = titanc(model)

print(res)
print()
print('F1:', res['f1'].mean())

   accuracy  precision    recall        f1  seconds
0  0.753191   0.684211  0.698925  0.691489        0
1  0.770213   0.710000  0.739583  0.724490        0
2  0.765957   0.691489  0.714286  0.702703        0
3  0.778723   0.738095  0.673913  0.704545        0
4  0.791489   0.735632  0.711111  0.723164        0
5  0.787234   0.711111  0.727273  0.719101        0
6  0.748936   0.703297  0.666667  0.684492        0
7  0.808511   0.800000  0.708333  0.751381        0
8  0.787234   0.734694  0.750000  0.742268        0
9  0.795745   0.720930  0.720930  0.720930        0

F1: 0.716456374814656


К сожалению, наш алгоритм, даже использующий весь датасет, не смог обойти по $F_1$ метрике логистическую регрессию.