In [27]:
import pandas as pd
import numpy as np
import faiss
from catboost import CatBoostClassifier, Pool, metrics, cv

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_curve
from sklearn.dummy import DummyClassifier

import lightgbm as lgb


In [3]:
base = pd.read_csv('data/base.csv', index_col=0)
train = pd.read_csv('data/train.csv', index_col=0)

In [4]:
scaler = StandardScaler()
scaler.fit(base)

base_scaled = scaler.transform(base)

In [58]:
base_scaled_df = pd.DataFrame(base_scaled, index=base.index)

In [5]:
train_target = train['Target']
train_features = train.drop(['Target'], axis=1)

train_features_scaled = scaler.transform(train_features)

## 3. FAISS Similarity Search

In [6]:
# download resulted Faiss search on 100 neighbors
I = np.load('data/i_ndarray.npy')

In [7]:
I.shape

(100000, 100)

In [8]:
base_index = {k: v for k, v in enumerate(base.index.to_list())}

acc = 0
for target_base_name, k_closest_vectors in zip(train_target[:1000].values.tolist(), I.tolist()):
    acc += int(target_base_name in [base_index[v] for v in k_closest_vectors])

print(100 * acc / len(I))

0.786


In [9]:
# decrease query indicies as there is not enough RAM on my laptop
I = I[:10000]

## 4. LightGBM Classfication

### 4.1 Preparing Features for a Classifier

In [None]:
def build_hstaked_matrix(train_features_scaled, base, k_ann, dims):
    # tiled (repeated) rows in train
    train_tiled = np.tile(train_features_scaled[:10000], k_ann).reshape(10000, k_ann, dims)

    I_flat = I.ravel()  # flatten I
    result_values = base_scaled[I_flat]  # take only those from base that are in flatten
    I_base = result_values.reshape(10000, 100, 72)  # create new 3d array



In [14]:
train_tiled = np.tile(train_features_scaled[:10000], 100).reshape(10000, 100, 72)  # tiled (repeated) rows in train
I_flat = I.ravel()  # flatten I
result_values = base_scaled[I_flat]  # take only those from base that are in flatten
I_base = result_values.reshape(10000, 100, 72)  # create new 3d array

In [11]:
train_target_tiled = np.tile(train_target[:10000], 100)
I_base_names = base.index[I_flat]

In [12]:
check_equal = np.equal(train_target_tiled, I_base_names).astype('int').reshape(10000, 100, 1)

In [15]:
# Concatenate the two arrays on axis=2 along with the check_equal array 
train_final = np.concatenate([train_tiled[..., :], I_base[..., :], check_equal], axis=2)

In [37]:
np.average(train_final[..., -1])*100

0.00019999999999999998

In [19]:
train_final = train_final.reshape(1_000_000, 145)

#### To balance target classes let's augment our train matrix

In [76]:
train_tail_90000 = train_features_scaled[10_000:, :]

In [77]:
tail_base_90000 = base_scaled_df.loc[train.values[10000:, -1]]

In [79]:
train_final_tail_90000 = np.concatenate([train_tail_90000, tail_base_90000.values, np.ones((90000, 1))], axis=1)

In [80]:
train_final_tail_90000.shape

(90000, 145)

In [81]:
train_final = np.concatenate([train_final, train_final_tail_90000], axis=0)

In [82]:
X = train_final[:, :-1]
y = train_final[:, -1]

In [83]:
np.average(y)

0.0909109090909091

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=0.8, 
                                                    random_state=42,
                                                    stratify=y)


### 4.3 Model Training

In [85]:
lgbm_clf = lgb.LGBMClassifier(n_jobs=-1, random_state=42)

lgbm_clf.fit(X_train, y_train)

In [86]:
print('LightGBM Classifier accuracy score:', lgbm_clf.score(X_test, y_test))

LightGBM Classifier accuracy score: 0.9392227272727273


## Validation set

In [87]:
validation = pd.read_csv('data/validation.csv', index_col=0)
validation_answer = pd.read_csv('data/validation_answer.csv', index_col=0)

In [None]:
X_scaled = scaler.transform(X)

    