In [99]:
import pandas as pd
import numpy as np
import faiss
from catboost import CatBoostClassifier, Pool, metrics, cv

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_curve
from sklearn.dummy import DummyClassifier

import lightgbm as lgb


In [3]:
base = pd.read_csv('data/base.csv', index_col=0)
train = pd.read_csv('data/train.csv', index_col=0)

In [4]:
scaler = StandardScaler()
scaler.fit(base)

base_scaled = scaler.transform(base)

In [58]:
base_scaled_df = pd.DataFrame(base_scaled, index=base.index)

In [5]:
train_target = train['Target']
train_features = train.drop(['Target'], axis=1)

train_features_scaled = scaler.transform(train_features)

## 3. FAISS Similarity Search

In [6]:
# download resulted Faiss flat index on 100 neighbors
I = np.load('data/i_ndarray.npy')

In [100]:
I.shape

(10000, 100)

In [8]:
base_index = {k: v for k, v in enumerate(base.index.to_list())}

acc = 0
for target_base_name, k_closest_vectors in zip(train_target[:1000].values.tolist(), I.tolist()):
    acc += int(target_base_name in [base_index[v] for v in k_closest_vectors])

print(100 * acc / len(I))

0.786


In [9]:
# decrease query indicies as there is not enough RAM on my laptop
I = I[:10000]

## 4. LightGBM Classfication

### 4.1 Preparing Features for a Classifier

In [113]:
def build_hstaked_matrix(matrix1, matrix2, index, base_df, y=None, k_ann=100, dims=72, qn=10000):
    '''
    Takes two scaled 3D (qn x k_ann x dims) matricies 
    and transforms them into horizontally stacked 2D matrix;
    along with y vector if it's present

    matrix1 is train or validation matrix;
    matrix2 is a base index-filtered matrix
    '''
    # prepare matrix1 - tile rows
    matrix1_tiled = np.tile(matrix1[:qn], k_ann).reshape(qn, k_ann, dims)
    
    # prepare matrix2
    index_flat = index.ravel()  # flatten index
    # take only those from base that are in flatten
    index_base = matrix2[index_flat].reshape(qn, k_ann, dims)  # create new 3d array
    
    if y is not None:
        last_dim = 1 # it needs for reshape in final result
        if np.all(y == 1):
            last_axis = y
        else:
            y_tiled = np.tile(y[:qn], k_ann)
            I_base_names = base_df.index[index_flat]
            last_axis = np.equal(y_tiled, I_base_names).astype('int').reshape(qn, k_ann, 1)

        # Concatenate the two arrays on axis=2 along with the check_equal array 
        result = np.concatenate([matrix1_tiled, index_base, last_axis], axis=2)
    else:
        last_dim = 0
        result = np.concatenate([matrix1_tiled, index_base], axis=2)

    result = result.reshape(qn*k_ann, (dims*2)+last_dim)

    return result

In [101]:
train_final = build_hstaked_matrix(train_features_scaled, base_scaled, I, base, train_target)

In [102]:
np.average(train_final[..., -1])*100

0.00019999999999999998

#### To balance target classes let's augment our train matrix

In [103]:
train_tail_90000 = train_features_scaled[10_000:, :]
tail_base_90000 = base_scaled_df.loc[train.values[10000:, -1]]

train_final_tail_90000 = np.concatenate([train_tail_90000, tail_base_90000.values, np.ones((90000, 1))], axis=1)

train_final_tail_90000.shape

(90000, 145)

In [104]:
train_final = np.concatenate([train_final, train_final_tail_90000], axis=0)

In [105]:
X = train_final[:, :-1]
y = train_final[:, -1]

In [106]:
np.average(y)

0.08257064220183487

In [107]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=0.8, 
                                                    random_state=42,
                                                    stratify=y)


### 4.3 Model Training

In [108]:
lgbm_clf = lgb.LGBMClassifier(n_jobs=-1, random_state=42)

lgbm_clf.fit(X_train, y_train)

In [109]:
print('LightGBM Classifier accuracy score:', lgbm_clf.score(X_test, y_test))

LightGBM Classifier accuracy score: 0.9445412844036697


## Validation set

In [110]:
validation = pd.read_csv('data/validation.csv', index_col=0)
validation_answer = pd.read_csv('data/validation_answer.csv', index_col=0)

In [114]:
validation_scaled = scaler.transform(validation)

In [115]:
# by default it takes only first 10_000 rows from validation set
validation_final = build_hstaked_matrix(validation_scaled, base_scaled, I, base)

In [116]:
validation_final.shape

(1000000, 144)

In [134]:
qn = 10_000
k_ann = 100
y_tiled = np.tile(validation_answer.iloc[:qn].values, k_ann).reshape(qn*k_ann,)
I_base_names = base.index[I.ravel()]
last_axis = np.equal(y_tiled, I_base_names).astype('int')

In [130]:
validation_answer.iloc[:10000].values.shape

(10000, 1)

In [140]:
np.average(last_axis)*100

0.00019999999999999998

In [132]:
I_base_names.shape

(1000000,)

In [118]:
print('LightGBM Classifier accuracy score on validation set:', lgbm_clf.score(validation_final, ))

TypeError: score() missing 1 required positional argument: 'y'